In [42]:
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [43]:
# Read the semicolon-separated CSV into the working DataFrame.
data = pd.read_csv(
    filepath_or_buffer='personality_analysis.csv',
    sep=';',
)

## Display NaN values

In [44]:
# Per-column count of missing values (isnull is the pandas alias of isna).
data.isnull().sum()

ID                        0
Year_Birth                0
Education                 0
Marital_Status            0
Income                   24
Kidhome                   0
Teenhome                  0
Dt_Customer               0
Recency                   0
MntWines                  0
MntFruits                 0
MntMeatProducts           0
MntFishProducts           0
MntSweetProducts          0
MntGoldProds              0
NumDealsPurchases         0
NumWebPurchases           0
NumCatalogPurchases       0
NumStorePurchases         0
NumWebVisitsMonth         0
AcceptedCmp3              0
AcceptedCmp4              0
AcceptedCmp5              0
AcceptedCmp1              0
AcceptedCmp2              0
Complain                  0
Response                  0
Unnamed: 27            2240
Unnamed: 28            2240
Unnamed: 29            2240
Unnamed: 30            2240
dtype: int64

## Remove the empty columns of the dataset

In [45]:
# Columns reported above as NaN in every one of the 2240 rows.
empty_columns = [
    'Unnamed: 27',
    'Unnamed: 28',
    'Unnamed: 29',
    'Unnamed: 30',
]
# Drop them by column label and preview the remaining frame.
data = data.drop(columns=empty_columns)
data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,5524,1957,Graduation,Single,58138.0,0,0,4/9/19,58,635,...,10,4,7,0,0,0,0,0,0,1
1,2174,1954,Graduation,Single,46344.0,1,1,8/3/21,38,11,...,1,2,5,0,0,0,0,0,0,0
2,4141,1965,Graduation,Together,71613.0,0,0,21/8/20,26,426,...,2,10,4,0,0,0,0,0,0,0
3,6182,1984,Graduation,Together,26646.0,1,0,10/2/21,26,11,...,0,4,6,0,0,0,0,0,0,0
4,5324,1981,PhD,Married,58293.0,1,0,19/1/21,94,173,...,3,6,5,0,0,0,0,0,0,0


## Removing the rows where income has 'NaN' values.
#### There were 24 out of 2240 rows (about 1% of the total rows), so it is better to drop them than to apply methods to fill them

In [47]:
# Drop only the rows whose Income is missing. A bare dropna() removes rows with
# a NaN in *any* column, which silently depends on the all-NaN "Unnamed"
# columns having been removed first (otherwise every row would be dropped).
data = data.dropna(subset=['Income'])
# Confirm that no missing values remain in any column.
data.isna().sum()

ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Response               0
dtype: int64

### Cast to datetime in order to be compatible with the date format that Pig uses

In [48]:
# NOTE(review): this cast is currently disabled, so in the cells shown here
# Dt_Customer keeps the string values read from the CSV. The sample rows above
# look day-first (e.g. "21/8/20"), so re-enabling it presumably needs
# dayfirst=True or an explicit format — confirm before uncommenting.
# data['Dt_Customer'] = pd.to_datetime(data['Dt_Customer'])
# data['Dt_Customer']

## OUTLIER DETECTION
$IQR = Q3 - Q1$

Data points that lie more than $1.5 \cdot IQR$ below $Q1$ or more than $1.5 \cdot IQR$ above $Q3$ are considered outliers.

#### Income - Boxplot

In [49]:
import plotly.express as px

# Box plot of the Income column of the cleaned dataset. points="all" draws
# every observation next to the box so individual outliers remain visible.
# (The previous version plotted plotly's bundled "tips" demo data instead of
# this notebook's data.)
fig = px.box(data, y="Income", points="all", title="Income - Boxplot")
fig.show()

#### Age - Boxplot

In [50]:
# NOTE(review): this cell is disabled. The column listing above shows no "Age"
# column, so it would have to be derived first (presumably from Year_Birth —
# confirm) before this plot can be re-enabled.
# import plotly.express as px
# fig = px.box(data, y="Age", points='all')
# fig.show()

In [51]:
def outlier_detector(dataframe: pd.DataFrame, feature: str, iqr_multiplier: float = 1.5) -> tuple:
    """Count the values of one column that lie outside its IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR`` where ``IQR = Q3 - Q1``. Values exactly on a
    fence are not counted as outliers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column to inspect.
    feature : str
        Label of the column to inspect.
    iqr_multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value must lie to be counted.

    Returns
    -------
    tuple of (int, int)
        ``(lower_outliers_count, upper_outliers_count)``: the number of values
        below the lower fence and above the upper fence, respectively.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``dataframe``.
    """
    values = dataframe[feature]

    # One quantile call yields both quartiles; describe() would also compute
    # count/mean/std/min/max, and the original did that twice.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1

    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr

    # Summing the boolean masks counts matches without building filtered frames.
    # Comparisons against NaN are False, so missing values are never counted.
    lower_outliers_count = int((values < lower_bound).sum())
    upper_outliers_count = int((values > upper_bound).sum())

    return lower_outliers_count, upper_outliers_count

In [52]:
# Write the cleaned frame to a semicolon-separated CSV without the index column.
# NOTE(review): 'utf-8-sig' prepends a byte-order mark to the file — confirm the
# downstream consumer accepts it.
data.to_csv(
    path_or_buf="personality_analysis_final.csv",
    sep=";",
    index=False,
    encoding='utf-8-sig',
)