In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Raise pandas' display limits so long/wide frames are printed with less truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [None]:
# Load the training data from 'Train.csv' in the working directory,
# print its (rows, columns) shape and preview the first rows.
df = pd.read_csv('Train.csv')
print(df.shape)
df.head()

# Exploratory Data Analysis

In [None]:
# Histogram of the Age column
df.Age.hist()

In [None]:
# Scatter plot: Age (x) against Spending_Score (y)
sns.scatterplot(x="Age", y="Spending_Score", data=df)

In [None]:
# Scatter plot: Age (x) against Work_Experience (y)
plt.scatter(x=df.Age, y=df.Work_Experience)

In [None]:
# Class balance of the target: one bar per Segmentation value.
f, ax = plt.subplots(figsize=(12, 6))

# The column is passed as `x=`: seaborn >= 0.12 interprets the first positional
# argument as `data`, so a positional Series is no longer plotted as the x
# variable. `ax=ax` draws explicitly on the axes created above.
sns.countplot(x=df['Segmentation'], palette="bright", ax=ax)
# Observation recorded with this plot: the classes are not imbalanced.

In [None]:
# Integer codes for the target, in the fixed order A=0, B=1, C=2, D=3
# (pd.Categorical gives -1 to any value outside the listed categories).
# `label` is reused as the hue of the following count plots.
label = pd.Categorical(df.Segmentation, categories=['A', 'B', 'C', 'D']).codes

# Gender counts split by segment code. `x=` is passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=df.Gender, hue=label, palette='tab10')

In [None]:
# Ever_Married counts split by segment code (`x=` by keyword: seaborn >= 0.12
# accepts only `data` positionally).
sns.countplot(x=df.Ever_Married, hue=label, palette='hls')

In [None]:
# Graduated counts split by segment code (`x=` by keyword: seaborn >= 0.12
# accepts only `data` positionally).
sns.countplot(x=df.Graduated, hue=label, palette='husl')

In [None]:
# Profession counts split by segment code (`x=` by keyword: seaborn >= 0.12
# accepts only `data` positionally).
sns.countplot(x=df.Profession, hue=label, palette='Paired')

In [None]:
# Spending_Score counts split by segment code (`x=` by keyword: seaborn >= 0.12
# accepts only `data` positionally).
sns.countplot(x=df.Spending_Score, hue=label, palette='hls')

In [None]:
# Work_Experience counts split by segment code (`x=` by keyword: seaborn >= 0.12
# accepts only `data` positionally).
sns.countplot(x=df.Work_Experience, hue=label, palette='hls')

In [None]:
# Family_Size counts split by segment code (`x=` by keyword: seaborn >= 0.12
# accepts only `data` positionally).
sns.countplot(x=df.Family_Size, hue=label, palette='hls')

In [None]:
# 2x2 grid of pie charts showing the share of each value in four categorical columns.
f, ax = plt.subplots(2, 2, figsize=(20, 10))
colors = ["#FF7C00", "#E8000B", "#1AC938"]

# (column, title, explode) for each panel, in row-major panel order.
# The explode lists are sized for two categories, three for Spending_Score.
pie_specs = [
    ('Gender', 'Distribution of Gender', [0, .1]),
    ('Ever_Married', 'Distribution of Ever_Married', [0, .1]),
    ('Graduated', 'Distribution of Graduated', [0, .1]),
    ('Spending_Score', 'Distribution of Spending_Score', [0, .1, .1]),
]
for panel, (column, title, explode) in zip(ax.ravel(), pie_specs):
    df[column].value_counts().plot.pie(
        title=title,
        explode=explode,
        ax=panel,
        autopct="%.2f",
        shadow=True,
        colors=colors,
    )

f.patch.set_facecolor('white')
plt.show()

# Data Preprocessing


In [None]:
# The ID column is irrelevant for the model, so drop it.
df.drop(columns=['ID'], inplace=True)
# Bookkeeping list for helper columns to be removed later.
drop_columns = list()

In [None]:
# Data cleaning before further processing:
# rows with two or more missing values are treated as corrupt and removed.
t = df.isna().sum(axis=1).reset_index()
nulls_per_row = t[0]          # column 0 holds the per-row count of missing values

print(nulls_per_row.value_counts())

corrupt_share = sum(nulls_per_row >= 2) / df.shape[0]
print('corrupt rows removal percentage:', corrupt_share * 100)

# Boolean-mask filtering aligns on the index, so this relies on df still
# having the same 0..n-1 index as `t`.
df = df[nulls_per_row < 2]
print(df.shape)

About 2% of the rows contain two or more null values in their features, so we can drop these rows before further preprocessing.

In [None]:
# Summary of columns, dtypes and non-null counts of the current frame.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Missing value - Graduated column treatment
# A missing `Graduated` value is filled with the majority answer among rows
# that share the same Profession.

# Rows per (Profession, Graduated) pair, one row per profession.
# unstack(fill_value=0) + reindex guarantee that both 'No' and 'Yes' columns
# exist and hold 0 (not NaN) for combinations that never occur, so the
# comparison below is never made against NaN.
tmp = (
    df.groupby(['Profession', 'Graduated']).Gender.count()
      .unstack('Graduated', fill_value=0)
      .reindex(columns=['No', 'Yes'], fill_value=0)
)
# Single vectorised assignment instead of chained `tmp.flag[mask] = ...`,
# which can end up writing to a temporary copy.
tmp['flag'] = np.where(tmp['Yes'] > tmp['No'], 'Yes', 'No')

# Profession -> majority 'Yes'/'No'
mapping = tmp['flag'].to_dict()

# Write through df.loc so the frame itself is updated; fillna(inplace=True)
# on `df.Graduated` may only modify a temporary Series.
missing_graduated = df['Graduated'].isna()
df.loc[missing_graduated, 'Graduated'] = df.loc[missing_graduated, 'Profession'].map(mapping)
# Anything still missing (e.g. Profession missing too) defaults to 'Yes'.
df['Graduated'] = df['Graduated'].fillna('Yes')
tmp.head()

In [None]:
# Missing value - Family_Size column treatment
# A missing Family_Size is filled with the rounded mean family size of rows
# that have the same Age.

# Age -> rounded mean Family_Size (Series indexed by Age).
tmp = df.groupby('Age').Family_Size.mean().round()

# Look the value up with Series.map and assign the result back explicitly;
# no helper column or merge is needed, and the frame is reliably updated
# (fillna(inplace=True) on `df.Family_Size` may only touch a temporary Series).
df['Family_Size'] = df['Family_Size'].fillna(df['Age'].map(tmp))

# An age whose Family_Size values are all missing has no per-age mean;
# fall back to the overall rounded mean so no NaN reaches the models.
df['Family_Size'] = df['Family_Size'].fillna(np.round(df['Family_Size'].mean()))

# Renumber rows 0..n-1 (earlier row filtering leaves gaps in the index).
df = df.reset_index(drop=True)

In [None]:
# Missing values still present per column.
df.isnull().sum()

In [None]:
# Missing value - Ever_Married column treatment
# A missing Ever_Married value is filled with the majority answer among rows
# of the same Age.

# Rows per (Age, Ever_Married) pair, one row per age. groupby skips rows whose
# Ever_Married is missing. fill_value=0 matters: if an age has no 'No' (or no
# 'Yes') rows the count must be 0, because a NaN there makes `Yes > No`
# evaluate to False and the age would wrongly be flagged 'No'.
tmp = (
    df.groupby(['Age', 'Ever_Married']).Gender.count()
      .unstack('Ever_Married', fill_value=0)
      .reindex(columns=['No', 'Yes'], fill_value=0)
)
tmp['flag'] = np.where(tmp['Yes'] > tmp['No'], 'Yes', 'No')

# Assign the filled column back explicitly; fillna(inplace=True) on
# `df.Ever_Married` may only modify a temporary Series.
df['Ever_Married'] = df['Ever_Married'].fillna(df['Age'].map(tmp['flag']))

# Ages with no known Ever_Married value at all get the overall majority answer.
overall_majority = 'Yes' if tmp['Yes'].sum() > tmp['No'].sum() else 'No'
df['Ever_Married'] = df['Ever_Married'].fillna(overall_majority)

# Renumber rows 0..n-1.
df = df.reset_index(drop=True)

In [None]:
# Missing values still present per column.
df.isnull().sum()

In [None]:
# Rows missing Profession or Var_1 are discarded rather than imputed.
# Work_Experience is intentionally not part of the subset; its missing
# values are filled in further down.
required_columns = ['Profession', 'Var_1']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Encode the two-valued columns as 0/1 integers.
df['Gender'] = df['Gender'].map({'Male':1, 'Female':0})

yes_no_codes = {'Yes':1, 'No':0}
for binary_column in ('Ever_Married', 'Graduated'):
    df[binary_column] = df[binary_column].map(yes_no_codes)

In [None]:
# Temporarily encode Profession as integers, numbered in order of first appearance.
# `p` (name -> code) is kept so the encoding can be reversed later.
profession_names = df['Profession'].unique()
p = dict(zip(profession_names, range(len(profession_names))))
df['Profession'] = df['Profession'].map(p)

In [None]:
# Missing value - Work_Experience column treatment
# Rows with a known Work_Experience form the training set of the imputer;
# rows where it is missing are the ones to predict.
work_experience_missing = df['Work_Experience'].isna()
tmp_train = df[~work_experience_missing]
tmp_test = df[work_experience_missing]

print(tmp_train.shape, tmp_test.shape)

In [None]:
# Predictors and target for the Work_Experience imputer.
imputer_features = ['Gender', 'Age', 'Graduated', 'Profession']
tmp_train_x = tmp_train[imputer_features]
tmp_train_y = tmp_train.Work_Experience

In [None]:
# Impute missing Work_Experience with a KNN classifier fitted on the rows
# where it is known.
# NOTE(review): 73 neighbours is described as the "best parameters"; the
# search that produced it is not part of this section.
# n_neighbors is capped at the number of training rows, because asking for
# more neighbours than fitted samples raises at prediction time.
knn_c = KNeighborsClassifier(n_neighbors=min(73, len(tmp_train_x)))
knn_c.fit(tmp_train_x, tmp_train_y)

# predict() rejects an empty input, so only predict when something is missing.
if len(tmp_test):
    predicted_y = knn_c.predict(tmp_test[tmp_train_x.columns])
    # assign() builds a new frame instead of writing into a slice of df.
    tmp_test = tmp_test.assign(Work_Experience=predicted_y)

# Recombine (known rows first, imputed rows after) and restore the original
# Profession names by inverting the name -> code mapping `p`.
df = pd.concat([tmp_train, tmp_test])
df.Profession = df.Profession.map({j:i for i,j in p.items()})

In [None]:
# Preview the frame after imputation (Profession is back to its text labels).
df.head()

In [None]:
# One-hot encode the multi-valued categoricals; Var_1 is left out of the model frame.
# An earlier variant also one-hot encoded Gender, Ever_Married and Graduated.
# Best score is 53% for this code - age shouldn't be categorical feature
one_hot_columns = ['Spending_Score', 'Profession']
model_frame = df.drop(columns='Var_1')
ddf = pd.get_dummies(model_frame, columns=one_hot_columns, drop_first=True)
ddf.head()


In [None]:
# (rows, columns) of the encoded frame.
ddf.shape

In [None]:
# Encode the Segmentation labels as integers, then hold out 30% of the rows
# for evaluation (fixed seed so the split is repeatable).
le = LabelEncoder()
ddf['Segmentation'] = le.fit_transform(ddf['Segmentation'])

feature_frame = ddf.drop(columns='Segmentation')
target = ddf['Segmentation']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target, test_size=.3, random_state=0
)

# Modeling

In [None]:
from sklearn.metrics import classification_report

### Model 1: DecisionTreeClassifier 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters and a fixed random seed.
clf = DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the tree on the (unscaled) training split.
model_decision_tree = clf.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the tree on the hold-out split.
model_decision_tree.score(X=X_test, y=y_test)

In [None]:
# Weighted F1 of the decision tree on the hold-out split.
# f1_score is imported explicitly: this notebook only uses
# `from sklearn... import ...` forms, so the bare name `sklearn` is never bound.
from sklearn.metrics import f1_score
f1_score(y_test, model_decision_tree.predict(X_test), average='weighted')

In [None]:
# Hold-out predictions of the decision tree.
y_pred = model_decision_tree.predict(X=X_test)

In [None]:
# The tree was fitted on the unscaled X_train, so it is evaluated on the
# unscaled frames. The scaled copies are created only in the SVC section and
# do not exist yet when this cell runs top-to-bottom.
print("To evaluate the performace of train data on the model \n",classification_report(y_train,model_decision_tree.predict(X_train)))
print("To evaluate the performace of validatation data on the model \n",classification_report(y_test,model_decision_tree.predict(X_test)))

### Model 2: KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier with default settings; rebinds `clf`.
clf = KNeighborsClassifier()

In [None]:
# Fit KNN on the (unscaled) training split.
model_knn = clf.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of KNN on the hold-out split.
model_knn.score(X=X_test, y=y_test)

In [None]:
# KNN was fitted on the unscaled X_train, so it is evaluated on the unscaled
# frames. The scaled copies are created only in the SVC section and do not
# exist yet when this cell runs top-to-bottom.
print("To evaluate the performace of train data on the model \n",classification_report(y_train,model_knn.predict(X_train)))
print("To evaluate the performace of validatation data on the model \n",classification_report(y_test,model_knn.predict(X_test)))

### Model 3: BaggingClassifier with GridSearchCV on DecisionTreeClassifier (hyperparameter tuning)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

# Bagging ensemble of decision trees. The base learner is passed positionally:
# its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases, while the first positional slot works with both names.
bagging = BaggingClassifier(DecisionTreeClassifier())

# Search space: ensemble size, fraction of rows and of features drawn for
# each tree, and whether rows are drawn with replacement.
parameter_bagging = {"n_estimators":[50,100,125],
                  "max_samples":[0.4,0.6,0.8],
                  "max_features":[0.2, 0.4,0.6],
                  "bootstrap":[True,False]
                }

# Exhaustive grid search with GridSearchCV's default cross-validation and scoring.
bagging_grid = GridSearchCV(bagging, parameter_bagging)

In [None]:
# Run the grid search on the training split (fits every parameter combination).
bagging_grid.fit(X=X_train, y=y_train)

In [None]:
# Cross-validation results as a table, best-ranked parameter sets first.
cv_table = pd.DataFrame(bagging_grid.cv_results_)
bagging_result = cv_table.sort_values(by='rank_test_score')
bagging_result.head()

In [None]:
# accuracy: 51%
# Show the five best-ranked parameter sets, one per line.
top_params = bagging_result['params'].head(5).to_list()
print('\n'.join(str(params) for params in top_params))

### Model 4: XGBClassifier

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees for the multi-class target.
# The objective is spelled "multi:softprob" (per-class probabilities);
# "multi:softproba" is not an objective name XGBoost defines.
model_xgb=XGBClassifier(learning_rate=0.1,n_jobs=-1,random_state=42,max_depth=15,n_estimators=1000,objective="multi:softprob")
model_xgb.fit(X_train,y_train)


In [None]:
# Accuracy of the XGBoost model on the hold-out split.
print(model_xgb.score(X_test,y_test))

In [None]:
# XGBoost was fitted on the unscaled X_train, so it is evaluated on the
# unscaled frames. The scaled copies are created only in the SVC section and
# do not exist yet when this cell runs top-to-bottom.
print("To evaluate the performace of train data on the model \n",classification_report(y_train,model_xgb.predict(X_train)))
print("To evaluate the performace of validatation data on the model \n",classification_report(y_test,model_xgb.predict(X_test)))

### Model 5: Support Vector Classifier

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features for the SVC: the scaler is fitted on the training
# split only and then applied unchanged to the hold-out split.
s = StandardScaler()

# Build new float frames from the scaled arrays (same columns and row index).
# Writing the floats into copies via `.loc[:, :] = ...` would push them into
# the existing integer/boolean dummy columns, which is a dtype-incompatible
# in-place assignment.
X_train_scalled = pd.DataFrame(s.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scalled = pd.DataFrame(s.transform(X_test), columns=X_test.columns, index=X_test.index)


In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Support vector classifier with gamma='auto'; all other settings are defaults.
# NOTE(review): make_pipeline and cross_val_score are imported but not used
# in the visible cells.
model_svc = SVC(gamma='auto')

In [None]:
# Fit the SVC on the standardised training features.
model_svc.fit(X=X_train_scalled, y=y_train)

In [None]:
# Accuracy of the SVC on the standardised hold-out features.
model_svc.score(X=X_test_scalled, y=y_test)

In [None]:
# Per-class precision/recall/F1 of the SVC on both splits (standardised features).
svc_train_report = classification_report(y_train, model_svc.predict(X_train_scalled))
svc_test_report = classification_report(y_test, model_svc.predict(X_test_scalled))

print("To evaluate the performace of train data on the model \n", svc_train_report)
print("To evaluate the performace of validatation data on the model \n", svc_test_report)