In [1]:
import numpy as np 
import pandas as pd 
import seaborn as snb
import matplotlib.pyplot as plt
%matplotlib inline

import os 
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # forces tf to run on cpu 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [2]:
# Load the raw test and train CSVs from the local data directory.
test_df = pd.read_csv('data/test.csv')

# The 'Segmentation' column is removed from the training frame right after loading.
train_df = pd.read_csv('data/train.csv').drop(columns='Segmentation')
train_df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6


In [3]:
# Count of missing values per column, before any imputation.
train_df.isna().sum()

ID                   0
Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Var_1               76
dtype: int64

In [4]:
# Replaces NaN values in numerical columns with the mode (most frequent
# value) of each respective column.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
col_num = train_df.select_dtypes(include=numerics)

for col in col_num.columns:
    col_mode = train_df[col].mode()
    if col_mode.empty:
        # An all-NaN column has no mode; leave it untouched rather than
        # failing with an IndexError.
        continue
    # Assign the result back to the frame. Calling an inplace method on the
    # intermediate `train_df[col]` is chained assignment: pandas warns about
    # it and, with Copy-on-Write enabled, it no longer updates `train_df`.
    train_df[col] = train_df[col].fillna(col_mode.iloc[0])

In [5]:
# Replaces NaN values in categorical (object-dtype) columns with the
# string 'None', so that "missing" becomes its own category.
categorical = ['object']
cat_columns = train_df.select_dtypes(include=categorical)

for col in cat_columns.columns:
    # Assign the result back to the frame. Calling an inplace method on the
    # intermediate `train_df[col]` is chained assignment: pandas warns about
    # it and, with Copy-on-Write enabled, it no longer updates `train_df`.
    train_df[col] = train_df[col].fillna('None')


In [6]:
# Re-check missing values per column after imputation.
train_df.isna().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64

In [7]:
# Distinct labels present in the 'Var_1' column.
label_list = {*train_df['Var_1']}
label_list

{'Cat_1', 'Cat_2', 'Cat_3', 'Cat_4', 'Cat_5', 'Cat_6', 'Cat_7', 'None'}

In [8]:
# Preview the first five rows of the imputed frame.
train_df.head(5)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4
1,462643,Female,Yes,38,Yes,Engineer,1.0,Average,3.0,Cat_4
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6
4,462669,Female,Yes,40,Yes,Entertainment,1.0,High,6.0,Cat_6


In [9]:
def replacing_classes(row: 'object'):
    """Collapse a single label: keep 'Cat_4' and 'Cat_6', map the rest to 'Other'.

    Args:
        row: One label value (despite the name, a scalar, not a whole row).

    Returns:
        `row` unchanged when it equals 'Cat_4' or 'Cat_6', otherwise 'Other'.
    """
    if row == 'Cat_4' or row == 'Cat_6':
        return row
    return 'Other'

# Flags controlling the preprocessing below.
# NOTE: no trailing comma here. `train=True,` would bind the tuple `(True,)`,
# which is always truthy, so `train=False,` would still be treated as "train".
train = True
only_label = False

# Preprocess `train_df`: drop unused columns, impute missing values, collapse
# rare 'Var_1' categories and label-encode the categorical columns.
# `train` is only tested for truthiness. When truthy, 'Var_1' is kept and
# encoded; otherwise 'Var_1' is dropped with the other columns.
try:
    # Dropping specified columns.
    # errors='ignore' keeps the cell re-runnable: a second run finds the
    # columns already gone instead of raising a KeyError.
    if train:
        print('dropping columns...')
        train_df = train_df.drop(columns=['ID', 'Age'], errors='ignore')
    else:
        print('dropping ID column')
        train_df = train_df.drop(columns=['ID', 'Var_1', 'Age'], errors='ignore')

    # Replacing numerical NaNs with mode.
    # Results are assigned back to the frame. An inplace method called on the
    # intermediate `train_df[col]` is chained assignment, which pandas warns
    # about and which stops updating the frame under Copy-on-Write.
    print('replacing numerical NaNs with mode...')
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = train_df.select_dtypes(include=numerics)
    for col in col_num.columns:
        col_mode = train_df[col].mode()
        if col_mode.empty:
            # All-NaN column: nothing to impute with.
            continue
        train_df[col] = train_df[col].fillna(col_mode.iloc[0])

    # Replacing categorical NaNs with 'None' string
    print('replacing categorical NaNs with None string...')
    categorical = ['object']
    cat_columns = train_df.select_dtypes(include=categorical)
    for col in cat_columns.columns:
        train_df[col] = train_df[col].fillna('None')

    # Changing the listed rare categories to 'Other'. 'Cat_4', 'Cat_6' and the
    # 'None' placeholder are left as they are, so 'None' stays its own class.
    if train:
        print('changing anything other than Cat_6 and Cat_4 to Other...')
        train_df['Var_1'] = train_df['Var_1'].replace(
            ['Cat_3', 'Cat_2', 'Cat_7', 'Cat_1', 'Cat_5'], 'Other'
        )
        print(train_df['Var_1'].value_counts())

    # Label encoding categorical data
    print('label encoding categorical data...')
    columns_to_encode = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score']
    if train:
        # 'Var_1' only exists on the train path; the other path dropped it above.
        columns_to_encode.append('Var_1')
    # One fitted encoder per column, so every mapping can be inverted later.
    # `label_encoder` ends up bound to the encoder of the last encoded column,
    # as before.
    label_encoders = {}
    for column in columns_to_encode:
        label_encoder = preprocessing.LabelEncoder()
        train_df[column] = label_encoder.fit_transform(train_df[column])
        label_encoders[column] = label_encoder

    print('\033[1m' + 'SUCCESSFULLY PERFORMED PREPROCESSING' + '\033[0m')
except Exception as e:
    print('error occurred in pre-processing')
    print(e)
    # Re-raise so later cells do not silently run on a half-processed frame.
    raise


dropping columns...
replacing numerical NaNs with mode...
replacing categorical NaNs with None string...
changing anything other than Cat_6 and Cat_4 to Other...
Var_1
Cat_6    5238
Other    1665
Cat_4    1089
None       76
Name: count, dtype: int64
label encoding categorical data...
[1mSUCCESSFULLY PERFORMED PREPROCESSING[0m


In [10]:
# Preview the first five rows of the encoded frame.
train_df.head(5)

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,1,0,0,5,1.0,2,4.0,0
1,0,2,2,2,1.0,0,3.0,0
2,0,2,2,2,1.0,2,1.0,1
3,1,2,2,7,0.0,1,2.0,1
4,0,2,2,3,1.0,1,6.0,1


In [11]:
# Define your features (X) and target (y)
X = train_df.drop(columns=['Var_1'])
y = train_df['Var_1']

In [12]:
# Preview the first five rows of the feature matrix.
X.head(5)

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Work_Experience,Spending_Score,Family_Size
0,1,0,0,5,1.0,2,4.0
1,0,2,2,2,1.0,0,3.0
2,0,2,2,2,1.0,2,1.0
3,1,2,2,7,0.0,1,2.0
4,0,2,2,3,1.0,1,6.0


In [13]:
# Preview the first five entries of the target.
y.head(5)

0    0
1    0
2    1
3    1
4    1
Name: Var_1, dtype: int32

In [14]:
# Shuffled hold-out split with a fixed seed, stratified on the target `y`.
# No test_size is passed, so scikit-learn's default split size applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    stratify=y,
    random_state=1,
)