In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read the raw records from the CSV that sits next to this notebook and
# preview the first five rows.
dataset = pd.read_csv('./Dataset.csv')
dataset.head(n=5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,458996,Female,Yes,69,No,,0.0,Low,1.0,Cat_6
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6


In [3]:
# Report the (rows, columns) shape first, then the per-column dtype and
# non-null counts, which show which columns contain missing values.
print('The shape of the dataset is:', dataset.shape, '\n')
print('The information about the dataset is:\n')
dataset.info()

The shape of the dataset is: (2627, 10) 

The information about the dataset is:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               2627 non-null   int64  
 1   Gender           2627 non-null   object 
 2   Ever_Married     2577 non-null   object 
 3   Age              2627 non-null   int64  
 4   Graduated        2603 non-null   object 
 5   Profession       2589 non-null   object 
 6   Work_Experience  2358 non-null   float64
 7   Spending_Score   2627 non-null   object 
 8   Family_Size      2514 non-null   float64
 9   Var_1            2595 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 205.4+ KB


In [63]:
# Continue with a new frame that omits the ID column; `dataset` itself is
# left unchanged because drop() returns a new DataFrame.
dataset1 = dataset.drop(columns=['ID'])
dataset1.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,Female,Yes,69,No,,0.0,Low,1.0,Cat_6
3,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,Female,No,19,No,Marketing,,Low,4.0,Cat_6


### Feature Engineering: Handling Missing Values

In [64]:
# Number of missing values in each column before any cleaning.
dataset1.isna().sum()

Gender               0
Ever_Married        50
Age                  0
Graduated           24
Profession          38
Work_Experience    269
Spending_Score       0
Family_Size        113
Var_1               32
dtype: int64

In [65]:
# Inspect Ever_Married next to Family_Size, then compare the average
# family size of each Ever_Married group.
EverMarried_Familysize = dataset1.loc[:, ['Ever_Married', 'Family_Size']]
print(EverMarried_Familysize, '\n')

print(EverMarried_Familysize.groupby('Ever_Married').mean())


     Ever_Married  Family_Size
0             Yes          1.0
1             Yes          4.0
2             Yes          1.0
3             Yes          2.0
4              No          4.0
...           ...          ...
2622           No          4.0
2623           No          1.0
2624           No          2.0
2625          Yes          5.0
2626           No          3.0

[2627 rows x 2 columns] 

              Family_Size
Ever_Married             
No               2.992979
Yes              2.715453


In [66]:
# Conditional imputation: where Ever_Married is 'No' and Family_Size is
# missing, fill Family_Size with 1.0.
# NOTE(review): the group means printed in the previous cell show an average
# Family_Size of about 2.99 for Ever_Married == 'No', so 1.0 may not be a
# representative fill value. Confirm this assumption.
dataset1.loc[
    dataset1['Family_Size'].isna() & dataset1['Ever_Married'].eq('No'),
    'Family_Size',
] = 1.0

# Any row still missing Family_Size or Ever_Married is dropped.
dataset1 = dataset1.dropna(subset=['Family_Size', 'Ever_Married'], how='any')

In [67]:
# Missing values per column after the Family_Size / Ever_Married cleanup.
dataset1.isna().sum()

Gender               0
Ever_Married         0
Age                  0
Graduated           22
Profession          36
Work_Experience    256
Spending_Score       0
Family_Size          0
Var_1               29
dtype: int64

In [68]:
# (rows, columns) remaining after the Family_Size / Ever_Married cleanup.
dataset1.shape

(2526, 9)

In [69]:
# Where Profession is known but Graduated is missing, fill Graduated with
# 'Yes'.
# NOTE(review): this assumes that having a recorded profession implies
# graduation. Confirm that this holds for the data.
dataset1.loc[
    dataset1['Graduated'].isna() & dataset1['Profession'].notna(),
    'Graduated',
] = 'Yes'

# Any row still missing Profession or Graduated is dropped.
dataset1 = dataset1.dropna(subset=['Profession', 'Graduated'], how='any')

In [70]:
# Missing values per column after the Profession / Graduated cleanup.
dataset1.isna().sum()

Gender               0
Ever_Married         0
Age                  0
Graduated            0
Profession           0
Work_Experience    249
Spending_Score       0
Family_Size          0
Var_1               29
dtype: int64

In [71]:
# (rows, columns) remaining after the Profession / Graduated cleanup.
dataset1.shape

(2490, 9)

In [72]:
# No imputation is applied to Work_Experience or Var_1: rows missing either
# value are dropped, then the per-column null counts are shown again.
dataset1 = dataset1.dropna(subset=['Work_Experience', 'Var_1'], how='any')
dataset1.isna().sum()

Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64

In [73]:
# (rows, columns) remaining once every row with a missing value is gone.
dataset1.shape

(2215, 9)

In [None]:
# TODO: shuffle the training data

