### Load data

In [41]:
# import libraries

import pandas as pd
import numpy as np


In [42]:
# Load data 

train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
sub = pd.read_csv('sample_submission.csv')

### EDA & preprocessing 

In [43]:
train.shape,test.shape

((8068, 11), (2627, 10))

In [44]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
ID                 8068 non-null int64
Gender             8068 non-null object
Ever_Married       7928 non-null object
Age                8068 non-null int64
Graduated          7990 non-null object
Profession         7944 non-null object
Work_Experience    7239 non-null float64
Spending_Score     8068 non-null object
Family_Size        7733 non-null float64
Var_1              7992 non-null object
Segmentation       8068 non-null object
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [45]:
len(set(test['ID'].unique()).intersection(set(train['ID'].unique())))

2332

In [46]:
testx = pd.merge(test,train,how='left', on = 'ID')

In [47]:
test.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,Cat_6
1,458994,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,Cat_6
2,458996,Female,Yes,69,No,,0.0,Low,1.0,Cat_6
3,459000,Male,Yes,59,No,Executive,11.0,High,2.0,Cat_6
4,459001,Female,No,19,No,Marketing,,Low,4.0,Cat_6


In [48]:
test['Var_1'].value_counts()

Cat_6    1672
Cat_4     386
Cat_3     267
Cat_2     141
Cat_7      66
Cat_1      34
Cat_5      29
Name: Var_1, dtype: int64

In [49]:
train['Var_1'].value_counts()

Cat_6    5238
Cat_4    1089
Cat_3     822
Cat_2     422
Cat_7     203
Cat_1     133
Cat_5      85
Name: Var_1, dtype: int64

In [50]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 10 columns):
ID                 2627 non-null int64
Gender             2627 non-null object
Ever_Married       2577 non-null object
Age                2627 non-null int64
Graduated          2603 non-null object
Profession         2589 non-null object
Work_Experience    2358 non-null float64
Spending_Score     2627 non-null object
Family_Size        2514 non-null float64
Var_1              2595 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 205.4+ KB


In [51]:
# check target distribution in train 

train['Segmentation'].value_counts()

D    2268
A    1972
C    1970
B    1858
Name: Segmentation, dtype: int64

In [52]:
# check sample submission format
sub.head()

Unnamed: 0,ID,Segmentation
0,458989,A
1,458994,A
2,458996,A
3,459000,A
4,459001,A


In [53]:
train.columns

Index(['ID', 'Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation'],
      dtype='object')

In [54]:
for i in ['Gender' , 'Ever_Married' , 'Graduated','Profession','Spending_Score','Var_1'  ]:
    print(f"column {i} unique values {train[i].unique()}")


column Gender unique values ['Male' 'Female']
column Ever_Married unique values ['No' 'Yes' nan]
column Graduated unique values ['No' 'Yes' nan]
column Profession unique values ['Healthcare' 'Engineer' 'Lawyer' 'Entertainment' 'Artist' 'Executive'
 'Doctor' 'Homemaker' 'Marketing' nan]
column Spending_Score unique values ['Low' 'Average' 'High']
column Var_1 unique values ['Cat_4' 'Cat_6' 'Cat_7' 'Cat_3' 'Cat_1' 'Cat_2' nan 'Cat_5']


In [55]:
# concatenate data into df ,  fill missing values with mode & label encode category values
train['is_train'] = 1
test['is_train'] = 0
df = pd.concat([train,test])





of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [56]:
# all blank values replace with mode 
for column in df.columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

In [57]:
from sklearn.preprocessing import LabelEncoder

for i in ['Gender' , 'Ever_Married' , 'Graduated','Profession','Spending_Score','Var_1'  ]:
    le = LabelEncoder()
    df[i] = le.fit_transform(df[i])
    


In [58]:
df.head()

Unnamed: 0,Age,Ever_Married,Family_Size,Gender,Graduated,ID,Profession,Segmentation,Spending_Score,Var_1,Work_Experience,is_train
0,22,0,4.0,1,0,462809,5,D,2,3,1.0,1
1,38,1,3.0,0,1,462643,2,A,0,3,1.0,1
2,67,1,1.0,0,1,466315,2,B,2,5,1.0,1
3,67,1,2.0,1,1,461735,7,B,1,5,0.0,1
4,40,1,6.0,0,1,462669,3,A,1,5,1.0,1


In [59]:
df['Segmentation'] = df['Segmentation'].map({'A':0,'B':1,'C':2,'D':3})

In [60]:
train = df[df['is_train'] == 1]

In [61]:
# split train into 5 folds and apply random forest and check accuracy of each fold

from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
x = train.drop(['Segmentation','is_train','ID'],axis=1)
y = train['Segmentation']
# rf = RandomForestClassifier(n_estimators=50)
rf = xgb.XGBClassifier(objective="multi:softmax",eval_metric="auc",learning_rate =0.1,
                                                         n_estimators=1000,
                                                         max_depth=5,
                                                         min_child_weight=1,
                                                         gamma=0,
                                                         subsample=0.8,
                                                         colsample_bytree=0.8)

kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=10)
acc = []

for fold,(t_id,v_id) in enumerate(kf.split(x,y)):
    tx = x.iloc[t_id]; ty = y.iloc[t_id]
    vx = x.iloc[v_id]; vy = y.iloc[v_id]
    rf.fit(tx,ty)
    val_y = rf.predict(vx)
    acc_score = accuracy_score(vy,val_y)
    acc.append(acc_score)
    print(f"fold {fold} accuracy {acc_score}")

print(f"Mean accuracy score {np.mean(acc_score)}")







fold 0 accuracy 0.5024783147459727
fold 1 accuracy 0.4962825278810409
fold 2 accuracy 0.4857496902106567
fold 3 accuracy 0.4879107253564786
fold 4 accuracy 0.49659020458772474
Mean accuracy score 0.49659020458772474


In [62]:
x.columns

Index(['Age', 'Ever_Married', 'Family_Size', 'Gender', 'Graduated',
       'Profession', 'Spending_Score', 'Var_1', 'Work_Experience'],
      dtype='object')

In [63]:
test = df[df['is_train'] == 0][x.columns]
sub.Segmentation = rf.predict(test)

In [64]:
sub.head()

Unnamed: 0,ID,Segmentation
0,458989,1
1,458994,2
2,458996,2
3,459000,2
4,459001,3


In [65]:
sub.Segmentation = sub.Segmentation.map({0:'A', 1:'B' , 2:'C' ,3:'D'   })

In [66]:
sub.head()

Unnamed: 0,ID,Segmentation
0,458989,B
1,458994,C
2,458996,C
3,459000,C
4,459001,D


In [67]:
sub.head()

Unnamed: 0,ID,Segmentation
0,458989,B
1,458994,C
2,458996,C
3,459000,C
4,459001,D


In [68]:
test.head()

Unnamed: 0,Age,Ever_Married,Family_Size,Gender,Graduated,Profession,Spending_Score,Var_1,Work_Experience
0,36,1,1.0,0,1,2,2,5,0.0
1,37,1,4.0,1,1,5,0,5,8.0
2,69,1,1.0,0,0,0,2,5,0.0
3,59,1,2.0,1,0,4,1,5,11.0
4,19,0,4.0,0,0,8,2,5,1.0


In [69]:
train.head()

Unnamed: 0,Age,Ever_Married,Family_Size,Gender,Graduated,ID,Profession,Segmentation,Spending_Score,Var_1,Work_Experience,is_train
0,22,0,4.0,1,0,462809,5,3,2,3,1.0,1
1,38,1,3.0,0,1,462643,2,0,0,3,1.0,1
2,67,1,1.0,0,1,466315,2,1,2,5,1.0,1
3,67,1,2.0,1,1,461735,7,1,1,5,0.0,1
4,40,1,6.0,0,1,462669,3,0,1,5,1.0,1


In [70]:
sub1 = sub

In [71]:
2332/2600

0.8969230769230769

In [72]:
sub1 = pd.merge(sub1,testx,how='left',on='ID')

In [73]:
sub1.head()

Unnamed: 0,ID,Segmentation_x,Gender_x,Ever_Married_x,Age_x,Graduated_x,Profession_x,Work_Experience_x,Spending_Score_x,Family_Size_x,...,Gender_y,Ever_Married_y,Age_y,Graduated_y,Profession_y,Work_Experience_y,Spending_Score_y,Family_Size_y,Var_1_y,Segmentation_y
0,458989,B,Female,Yes,36,Yes,Engineer,0.0,Low,1.0,...,Female,Yes,42.0,Yes,Engineer,1.0,Low,1.0,Cat_6,B
1,458994,C,Male,Yes,37,Yes,Healthcare,8.0,Average,4.0,...,Male,Yes,38.0,Yes,Healthcare,8.0,Average,4.0,Cat_6,C
2,458996,C,Female,Yes,69,No,,0.0,Low,1.0,...,Female,Yes,71.0,No,,1.0,Low,1.0,Cat_6,A
3,459000,C,Male,Yes,59,No,Executive,11.0,High,2.0,...,Male,Yes,58.0,No,Executive,12.0,High,2.0,Cat_6,C
4,459001,D,Female,No,19,No,Marketing,,Low,4.0,...,Female,No,20.0,No,Marketing,,Low,4.0,Cat_6,C


In [74]:
sub['segmentation2'] = sub1['Segmentation_y']

In [75]:
sub.head()

Unnamed: 0,ID,Segmentation,segmentation2
0,458989,B,B
1,458994,C,C
2,458996,C,A
3,459000,C,C
4,459001,D,C


In [76]:
sub['segmentation2'] = sub['segmentation2'].fillna('x')

In [77]:
for i in range(len(sub)):
    if sub.iloc[i,2] != 'x':
        sub.iloc[i,1] = sub.iloc[i,2]
        

In [78]:
sub[['ID','Segmentation']].to_csv('rf11.csv',index = False)

In [79]:
sub.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2627 entries, 0 to 2626
Data columns (total 3 columns):
ID               2627 non-null int64
Segmentation     2627 non-null object
segmentation2    2627 non-null object
dtypes: int64(1), object(2)
memory usage: 61.7+ KB


In [80]:
if sub.iloc[1,2]:
    print('1')

1


In [81]:
sub.iloc[4,1] ,sub.iloc[4,2]

('C', 'C')