In [1]:

from sklearn.base import TransformerMixin, BaseEstimator

from collections import Counter , defaultdict

import pandas as pd

pd.set_option('display.max_columns', None)


from pandas import Series as s , DataFrame as df
import numpy as np

import seaborn as sns
# import matplotlib.pyplot as plt

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt, rcParams as rc


%matplotlib inline
rc["figure.figsize"] = 10,6

import datetime
import datetime as dt

from sklearn.model_selection  import StratifiedKFold
from sklearn.decomposition import PCA

from iteration_utilities import duplicates, unique_everseen

import sys
from itertools import groupby
from operator import itemgetter
from timeit import timeit


## RandomOverSampler to handle imbalanced data
from imblearn.over_sampling import RandomOverSampler # over sampling method 2

In [2]:

#Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.naive_bayes import MultinomialNB

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

#preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler, normalize



from sklearn.model_selection import train_test_split

import lightgbm as lgb


#Metrics
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, recall_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_recall_fscore_support

from sklearn.pipeline import Pipeline

# GridSearchCV to find optimal min_samples_leaf
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import fbeta_score, make_scorer

In [44]:
df_original = pd.read_csv("Train.csv")

df_test_original = pd.read_csv("Test.csv")
submission = pd.read_csv("sample_submission.csv")

df10 = df.copy()
df_original.shape, df_test_original.shape, submission.shape

((8068, 11), (2627, 10), (2627, 2))

In [45]:
df_original.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [46]:
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.

        Columns of other types are imputed with mean of column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [47]:
def get_percentage_miss_value(dataset):
    higher_miss_value_column = []
    miss_threshold_value = 50
    
    for i in dataset.columns:
        if dataset[i].isna().sum() > 1: 
            perectange_val = (dataset[i].isna().sum() / len(dataset)) * 100
            print("Column-> " , i, ", total no of missing value : ",dataset[i].isna().sum() , " & :         ", round(perectange_val,2) ," %")
                
            if(perectange_val > miss_threshold_value):
                higher_miss_value_column.append(i)
            
    print("\n\n\n")
    
    if higher_miss_value_column:
        print("Higher Missing values in Columns for Delete : ", higher_miss_value_column)
    else:
        print("There are no Higher Column Missing values in Dataset")

In [48]:
get_percentage_miss_value(df_original)

Column->  Ever_Married , total no of missing value :  140  & :          1.74  %
Column->  Graduated , total no of missing value :  78  & :          0.97  %
Column->  Profession , total no of missing value :  124  & :          1.54  %
Column->  Work_Experience , total no of missing value :  829  & :          10.28  %
Column->  Family_Size , total no of missing value :  335  & :          4.15  %
Column->  Var_1 , total no of missing value :  76  & :          0.94  %




There are no Higher Column Missing values in Dataset


In [50]:
get_percentage_miss_value(df_test_original)

Column->  Ever_Married , total no of missing value :  50  & :          1.9  %
Column->  Graduated , total no of missing value :  24  & :          0.91  %
Column->  Profession , total no of missing value :  38  & :          1.45  %
Column->  Work_Experience , total no of missing value :  269  & :          10.24  %
Column->  Family_Size , total no of missing value :  113  & :          4.3  %
Column->  Var_1 , total no of missing value :  32  & :          1.22  %




There are no Higher Column Missing values in Dataset


In [51]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [52]:
def get_categorical_col_name_df(dataset): #testing has been pending
    return dataset.select_dtypes(include=['object']) #get all categorical columns only

In [53]:
get_categorical_col_name_df(df_original).isna().sum()

Gender              0
Ever_Married      140
Graduated          78
Profession        124
Spending_Score      0
Var_1              76
Segmentation        0
dtype: int64

In [54]:
get_categorical_col_name_df(df_original).columns.to_list()

['Gender',
 'Ever_Married',
 'Graduated',
 'Profession',
 'Spending_Score',
 'Var_1',
 'Segmentation']

# <font color ='green' > EDA

In [94]:
df = df_original.copy()

df_test = df_test_original.copy()

In [95]:
df = DataFrameImputer().fit_transform(df.copy())


df_test = DataFrameImputer().fit_transform(df_test.copy())

In [96]:
# get_percentage_miss_value(df)
# get_percentage_miss_value(df_test)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     8068 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        8068 non-null   object 
 5   Profession       8068 non-null   object 
 6   Work_Experience  8068 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      8068 non-null   float64
 9   Var_1            8068 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [98]:
df.head(2)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,2.641663,Average,3.0,Cat_4,A


In [99]:
print("Gender : ", dict(Counter(df.Gender)))

print("Ever_Married : ", dict(Counter(df.Ever_Married)))

print("Graduated : ", dict(Counter(df.Graduated)))

print("Profession : ", dict(Counter(df.Profession)))

print("Spending_Score : ", dict(Counter(df.Spending_Score)))

print("Var_1 : ", dict(Counter(df.Var_1)))

print("Segmentation : ", dict(Counter(df.Segmentation)))



Gender :  {'Male': 4417, 'Female': 3651}
Ever_Married :  {'No': 3285, 'Yes': 4783}
Graduated :  {'No': 3022, 'Yes': 5046}
Profession :  {'Healthcare': 1332, 'Engineer': 699, 'Lawyer': 623, 'Entertainment': 949, 'Artist': 2640, 'Executive': 599, 'Doctor': 688, 'Homemaker': 246, 'Marketing': 292}
Spending_Score :  {'Low': 4878, 'Average': 1974, 'High': 1216}
Var_1 :  {'Cat_4': 1089, 'Cat_6': 5314, 'Cat_7': 203, 'Cat_3': 822, 'Cat_1': 133, 'Cat_2': 422, 'Cat_5': 85}
Segmentation :  {'D': 2268, 'A': 1972, 'B': 1858, 'C': 1970}


In [100]:
dic_Gender =  {'Male': 1, 'Female': 2}
df["Gender"] = df.Gender.apply(lambda x : dic_Gender.get(x))


dic_Ever_Married =  {'Yes': 1, 'No': 2}
df["Ever_Married"] = df.Ever_Married.apply(lambda x : dic_Ever_Married.get(x))


dic_Graduated =  { 'Yes': 1, 'No': 2}
df["Graduated"] = df.Graduated.apply(lambda x : dic_Graduated.get(x))


dic_Profession =  {'Healthcare': 1, 'Engineer': 2, 'Lawyer': 3, 'Entertainment': 4, 'Artist': 5, 
               'Executive': 6, 'Doctor': 7, 'Homemaker': 8, 'Marketing': 9}
df["Profession"] = df.Profession.apply(lambda x : dic_Profession.get(x))


dic_Spending_Score =  {'Low': 1, 'Average': 2, 'High': 3}
df["Spending_Score"] = df.Spending_Score.apply(lambda x : dic_Spending_Score.get(x))


dic_Var_1 =  {'Cat_1': 1, 'Cat_2': 2, 'Cat_3': 3, 'Cat_4': 4,'Cat_5': 5, 'Cat_6': 6, 'Cat_7': 7}
df["Var_1"] = df.Var_1.apply(lambda x : dic_Var_1.get(x))



dic_segmentation = { 'A': 1, 'B': 2, 'C': 3, 'D': 4}
df["Segmentation"] = df.Segmentation.apply(lambda x : dic_segmentation.get(x))







df_test["Gender"] = df_test.Gender.apply(lambda x : dic_Gender.get(x))

df_test["Ever_Married"] = df_test.Ever_Married.apply(lambda x : dic_Ever_Married.get(x))

df_test["Graduated"] = df_test.Graduated.apply(lambda x : dic_Graduated.get(x))

df_test["Profession"] = df_test.Profession.apply(lambda x : dic_Profession.get(x))

df_test["Spending_Score"] = df_test.Spending_Score.apply(lambda x : dic_Spending_Score.get(x))

df_test["Var_1"] = df_test.Var_1.apply(lambda x : dic_Var_1.get(x))


In [101]:
df.head(3)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,1,2,22,2,1,1.0,1,4.0,4,4
1,462643,2,1,38,1,2,2.641663,2,3.0,4,1
2,466315,2,1,67,1,2,1.0,1,1.0,6,2


In [102]:
df_test.head(3)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,458989,2,1,36,1,2,0.0,1,1.0,6
1,458994,1,1,37,1,1,8.0,2,4.0,6
2,458996,2,1,69,2,5,0.0,1,1.0,6


In [103]:
# X = df4.iloc[:,:-1]
# Y = df4.iloc[:,-1]

feature_variables = ["Gender" , "Ever_Married" , "Graduated", "Spending_Score"]

X = df.loc[:, feature_variables]
Y = df.iloc[:,-1]

df3_test = df_test.loc[:, feature_variables]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .2, random_state = 100)

In [104]:
X.head(2)

Unnamed: 0,Gender,Ever_Married,Graduated,Spending_Score
0,1,2,2,1
1,2,1,1,2


In [105]:
df3_test.head(2)

Unnamed: 0,Gender,Ever_Married,Graduated,Spending_Score
0,2,1,1,1
1,1,1,1,2


In [107]:
model_logistic = LogisticRegression()
model_logistic.fit(X, Y)
y_pred_logistic_final = model_logistic.predict(df3_test)


model_xgb = XGBClassifier(max_depth=5, objective='multi:softmax', num_classes=4)
model_xgb.fit(X, Y)
y_pred_xgb_final = model_xgb.predict(df3_test)


model_lgb= LGBMClassifier(n_estimators= 200, objective='multi:softmax', num_classes=3)
model_lgb.fit(X,Y)
y_pred_lgb_final = model_lgb.predict(df3_test)



model_catg= CatBoostClassifier(depth= 3,l2_leaf_reg= 1e-25,learning_rate= 0.07,loss_function= 'MultiClass',n_estimators= 200)
model_catg.fit(X,Y)
y_pred_catg_final = model_catg.predict(df3_test)


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0:	learn: 1.3606362	total: 97.1ms	remaining: 19.3s
1:	learn: 1.3393698	total: 109ms	remaining: 10.7s
2:	learn: 1.3214942	total: 119ms	remaining: 7.8s
3:	learn: 1.3063142	total: 129ms	remaining: 6.32s
4:	learn: 1.2933232	total: 138ms	remaining: 5.39s
5:	learn: 1.2832756	total: 149ms	remaining: 4.82s
6:	learn: 1.2740080	total: 159ms	remaining: 4.38s
7:	learn: 1.2652914	total: 170ms	remaining: 4.07s
8:	learn: 1.2581989	total: 179ms	remaining: 3.81s
9:	learn: 1.2519890	total: 188ms	remaining: 3.57s
10:	learn: 1.2459655	total: 194ms	remaining: 3.34s
11:	learn: 1.2406957	total: 200ms	remaining: 3.13s
12:	learn: 1.2360786	total: 204ms	remaining: 2.93s
13:	learn: 1.2322524	total: 207ms	remaining: 2.75s
14:	learn

191:	learn: 1.1947441	total: 884ms	remaining: 36.8ms
192:	learn: 1.1947407	total: 891ms	remaining: 32.3ms
193:	learn: 1.1947395	total: 895ms	remaining: 27.7ms
194:	learn: 1.1947385	total: 899ms	remaining: 23ms
195:	learn: 1.1947359	total: 908ms	remaining: 18.5ms
196:	learn: 1.1947350	total: 913ms	remaining: 13.9ms
197:	learn: 1.1947302	total: 917ms	remaining: 9.26ms
198:	learn: 1.1947291	total: 919ms	remaining: 4.62ms
199:	learn: 1.1947281	total: 922ms	remaining: 0us


In [111]:
len(y_pred_logistic_final), len(y_pred_xgb_final), len(y_pred_lgb_final), len(y_pred_catg_final)

(2627, 2627, 2627, 2627)

In [114]:
submission["y_pred_logistic_final"] = y_pred_logistic_final

submission["y_pred_xgb_final"] = y_pred_xgb_final

submission["y_pred_lgb_final"] = y_pred_lgb_final

submission["y_pred_catg_final"] = y_pred_catg_final


submission.tail(10)

Unnamed: 0,ID,Segmentation,y_pred_logistic_final,y_pred_xgb_final,y_pred_lgb_final,y_pred_catg_final
2617,467940,A,4,4,4,4
2618,467946,A,3,2,2,2
2619,467948,A,4,4,4,4
2620,467949,A,4,4,4,4
2621,467950,A,4,1,1,1
2622,467954,A,4,4,4,4
2623,467958,A,4,1,1,1
2624,467960,A,4,1,1,1
2625,467961,A,3,3,3,3
2626,467968,A,4,1,1,1


In [119]:
create_submission_file(model_lgb , "lgb_2", df3_test)

       ID  Segmentation
0  458989             2
1  458994             3
2  458996             1
3  459000             2
4  459001             4


'File created successful'

In [120]:
submit_file = save_model_df = pd.read_csv("lgb_2"+'.csv')
submit_file.head()

Unnamed: 0,ID,Segmentation
0,458989,2
1,458994,3
2,458996,1
3,459000,2
4,459001,4


In [121]:
Counter(submit_file.Segmentation)

Counter({2: 339, 3: 704, 1: 844, 4: 740})

In [115]:
def create_submission_file(model, file_name, test_data):
    
    y_pred_final = model.predict(test_data)

    submission_1 = pd.read_csv("sample_submission.csv")
    submission_1["Segmentation"] = y_pred_final
    submission_1.to_csv(file_name+'.csv', index=False)
    
    
    save_model_df = pd.read_csv(file_name+'.csv')
    
    print(save_model_df.head())
    
    
    return "File created successful"

In [13]:
def get_accuracy(y_train_val , y_pred_val , dataset_type = "Default"):
    
    print(" Dataset type is : ", dataset_type)
    
    print("\n Accuracy Score     : ",round(accuracy_score(y_train_val, y_pred_val), 4) * 100)



In [99]:
%%time


lr_clf = LogisticRegression(multi_class='multinomial')
scores = cross_val_score(lr_clf, X, Y, cv=5, scoring=make_scorer(f1_score, average='weighted'), n_jobs=-1)
print('F1_score for LogReg : ', scores.mean())

F1_score for LogReg :  0.4573757935844253
CPU times: user 48.9 ms, sys: 36.6 ms, total: 85.6 ms
Wall time: 1.72 s


In [108]:
%%time

GBC_d = GradientBoostingClassifier()
GBC_d.fit(X_train,y_train)
pred_gbc = GBC_d.predict(X_test)
print(accuracy_score(y_test,pred_gbc))
print(classification_report(y_test,pred_gbc))

0.533457249070632
              precision    recall  f1-score   support

           0       0.43      0.45      0.44       402
           1       0.39      0.33      0.35       365
           2       0.63      0.60      0.61       382
           3       0.64      0.71      0.67       465

    accuracy                           0.53      1614
   macro avg       0.52      0.52      0.52      1614
weighted avg       0.53      0.53      0.53      1614

CPU times: user 2.08 s, sys: 1.35 ms, total: 2.08 s
Wall time: 2.08 s


In [110]:
create_submission_file(GBC_d , "gbc_1", df3_test)

'File created successful'

In [117]:
rows=[]

def addRandomStateForAlgorithm(x,y,names,algorithms,columns_name,random_state_list):    
    for j in range(len(algorithms)):
        model = algorithms[j]
        for i in random_state_list:
            
            x_train, x_test , y_train , y_test = train_test_split(x ,y , test_size = 0.02 , random_state = i)
            
            model.fit(x_train,y_train)
            
            y_pred_train = model.predict(x_train)
            y_pred = model.predict(x_test)
            
            train_acc = accuracy_score(y_train, y_pred_train)
            train_acc = round(train_acc, 4) * 100
            
            test_acc = accuracy_score(y_test, y_pred)
            test_acc = round(test_acc, 4) * 100
            
#             roc_auc_score_acc = roc_auc_score(y_test, y_pred)
#             roc_auc_score_acc = round(roc_auc_score_acc, 4) * 100
            
            row = [names[j],   i,   train_acc, test_acc]
        
    
            rows.append(row)
            
    models_df = pd.DataFrame(rows) 
    
    models_df.columns = columns_name
    print(models_df)
    
    


In [118]:
names = [ "LightGBM", "RF", "XGBoost" , "Logistic"]

algorithms = [ LGBMClassifier(  ), RandomForestClassifier(), XGBClassifier(), LogisticRegression(multi_class='multinomial')]


columns_name = ["Model",    "Random_state",   'Train_acc',     "Test_acc" ]

random_state_list_up_to_3 = [1,2,3]

addRandomStateForAlgorithm(X,Y,names,algorithms,columns_name,random_state_list_up_to_3)

       Model  Random_state  Train_acc  Test_acc
0   LightGBM             1      45.95     42.59
1   LightGBM             2      45.97     41.98
2   LightGBM             3      45.90     45.06
3         RF             1      45.95     42.59
4         RF             2      45.97     41.98
5         RF             3      45.90     45.06
6    XGBoost             1      45.95     42.59
7    XGBoost             2      45.97     41.98
8    XGBoost             3      45.90     45.06
9   Logistic             1      44.08     41.98
10  Logistic             2      44.23     34.57
11  Logistic             3      44.09     41.36
