In [266]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Read the files
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

# Add additonal column to differentiate between train and test before merging the two
train['Type'] = 'Train'
test['Type'] = 'Test'

# Combine both train and test data

allData = pd.concat([train, test], axis = 0)

# Data exploration on complete file

allData.columns

Index(['ID', 'Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1',
       'Segmentation', 'Type'],
      dtype='object')

Review the data

In [267]:
allData.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Type
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D,Train
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A,Train
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B,Train
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B,Train
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A,Train


In [268]:
allData.tail()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,Type
2622,467954,Male,No,29,No,Healthcare,9.0,Low,4.0,Cat_6,,Test
2623,467958,Female,No,35,Yes,Doctor,1.0,Low,1.0,Cat_6,,Test
2624,467960,Female,No,53,Yes,Entertainment,,Low,2.0,Cat_6,,Test
2625,467961,Male,Yes,47,Yes,Executive,1.0,High,5.0,Cat_4,,Test
2626,467968,Female,No,43,Yes,Healthcare,9.0,Low,3.0,Cat_7,,Test


In [269]:
allData.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,10695.0,10695.0,9597.0,10247.0
mean,463468.08864,43.511828,2.619777,2.844052
std,2600.966411,16.774158,3.39079,1.536427
min,458982.0,18.0,0.0,1.0
25%,461220.5,30.0,0.0,2.0
50%,463451.0,41.0,1.0,3.0
75%,465733.5,53.0,4.0,4.0
max,467974.0,89.0,14.0,9.0


In [270]:
allData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10695 entries, 0 to 2626
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               10695 non-null  int64  
 1   Gender           10695 non-null  object 
 2   Ever_Married     10505 non-null  object 
 3   Age              10695 non-null  int64  
 4   Graduated        10593 non-null  object 
 5   Profession       10533 non-null  object 
 6   Work_Experience  9597 non-null   float64
 7   Spending_Score   10695 non-null  object 
 8   Family_Size      10247 non-null  float64
 9   Var_1            10587 non-null  object 
 10  Segmentation     8068 non-null   object 
 11  Type             10695 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 1.1+ MB


In [271]:
allData.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
Segmentation        object
Type                object
dtype: object

In [272]:
# Identify the columns with null values
allData.isnull().any()

ID                 False
Gender             False
Ever_Married        True
Age                False
Graduated           True
Profession          True
Work_Experience     True
Spending_Score     False
Family_Size         True
Var_1               True
Segmentation        True
Type               False
dtype: bool

In [273]:
# This gives the count of null values in each column
allData.isna().sum()

ID                    0
Gender                0
Ever_Married        190
Age                   0
Graduated           102
Profession          162
Work_Experience    1098
Spending_Score        0
Family_Size         448
Var_1               108
Segmentation       2627
Type                  0
dtype: int64

In [274]:
target = allData['Segmentation']
type_col = allData['Type']

# Identify the categorical and numerical variables 
num_vars = [var for var in allData.columns if allData.dtypes[var] != 'object']
cat_vars = [var for var in allData.columns if allData.dtypes[var] == 'object']

# Remove ID and segmentation from numerical and categorical variables list as they are mapped separately

num_vars.remove('ID')
cat_vars.remove('Segmentation')
cat_vars.remove('Type')

print ('Numerical variables: \n', num_vars)
print ('Categorical variables: \n', cat_vars)

Numerical variables: 
 ['Age', 'Work_Experience', 'Family_Size']
Categorical variables: 
 ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1']


#### Replace the nan values for all columns where it exists

In [275]:
# Replace the NaN values in categorical columns with 'Unknown'
allData[cat_vars] = allData[cat_vars].fillna("Unknown")

In [276]:
# Replace the NaN values in numerical columns with 0 
allData[num_vars] = allData[num_vars].fillna(0)

In [277]:
# Check to see if all the null values are gone!
allData.isna().sum()

ID                    0
Gender                0
Ever_Married          0
Age                   0
Graduated             0
Profession            0
Work_Experience       0
Spending_Score        0
Family_Size           0
Var_1                 0
Segmentation       2627
Type                  0
dtype: int64

In [278]:
# Label encode all categorical features 
num = LabelEncoder()
for var in cat_vars:
    allData[var] = num.fit_transform(allData[var].astype("str"))

# Label encode the target variable 'Segmentation'

#allData['Segmentation'] = num.fit_transform(allData['Segmentation'].astype("str"))

In [279]:
# After data preprocessing, overwrite the train and test data based on the new column Type that we created
train = allData[allData['Type'] == 'Train']
# Remove the Type column added
train.drop(['Type'], axis=1)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,1,0,22,0,5,1.0,2,4.0,3,D
1,462643,0,2,38,2,2,0.0,0,3.0,3,A
2,466315,0,2,67,2,2,1.0,2,1.0,5,B
3,461735,1,2,67,2,7,0.0,1,2.0,5,B
4,462669,0,2,40,2,3,0.0,1,6.0,5,A
...,...,...,...,...,...,...,...,...,...,...,...
8063,464018,1,0,22,0,9,0.0,2,7.0,0,D
8064,464685,1,0,35,0,4,3.0,2,4.0,3,D
8065,465406,0,0,33,2,5,1.0,2,1.0,5,D
8066,467299,0,0,27,2,5,1.0,2,4.0,5,B


In [280]:
# After data preprocessing, overwrite the train and test data based on the new column Type that we created
test = allData[allData['Type'] == 'Test']

# Remove the Type column added
test.drop(['Segmentation'], axis=1)
test.drop(['Type'], axis=1)

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,458989,0,2,36,2,2,0.0,2,1.0,5,
1,458994,1,2,37,2,5,8.0,0,4.0,5,
2,458996,0,2,69,0,9,0.0,2,1.0,5,
3,459000,1,2,59,0,4,11.0,1,2.0,5,
4,459001,0,0,19,0,8,0.0,2,4.0,5,
...,...,...,...,...,...,...,...,...,...,...,...
2622,467954,1,0,29,0,5,9.0,2,4.0,5,
2623,467958,0,0,35,2,1,1.0,2,1.0,5,
2624,467960,0,0,53,2,3,0.0,2,2.0,5,
2625,467961,1,2,47,2,4,1.0,1,5.0,3,


In [281]:
train['Segmentation'].value_counts()

D    2268
A    1972
C    1970
B    1858
Name: Segmentation, dtype: int64

In [282]:
X = train
X.drop(['ID','Type'], axis=1, inplace=True)
y = X['Segmentation']
X.drop(['Segmentation'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [283]:
X.columns

Index(['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1'],
      dtype='object')

In [284]:
# Split the training set into training and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [285]:
print('X_train.shape', X_train.shape)
print('y_train.shape', y_train.shape)
print('X_test.shape', X_test.shape)
print('y_test.shape', y_test.shape)
X_train

X_train.shape (6454, 9)
y_train.shape (6454,)
X_test.shape (1614, 9)
y_test.shape (1614,)


Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
6810,1,2,78,0,3,0.0,2,1.0,5
5356,1,0,20,0,5,5.0,2,4.0,2
4969,1,2,63,2,0,1.0,0,0.0,5
678,0,0,18,0,5,1.0,2,3.0,5
7483,1,2,40,2,5,1.0,2,3.0,5
...,...,...,...,...,...,...,...,...,...
3915,1,0,25,2,8,8.0,2,3.0,5
5962,0,0,55,0,3,3.0,2,1.0,5
6624,0,0,22,0,5,1.0,2,8.0,3
2695,1,2,46,2,0,0.0,2,2.0,5


In [287]:
# Use the Random Forest Classifier algorithm
rf = RandomForestClassifier(n_estimators=500,max_depth=2, random_state=0)

In [288]:
from sklearn.metrics import accuracy_score
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(X_test.shape, y_pred.shape)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

(1614, 9) (1614,)
Accuracy: 51.30%


In [289]:
test_id_col = test['ID']
test.drop(['ID','Segmentation', 'Type'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [290]:
print(test.columns)
print(test.shape)

Index(['Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1'],
      dtype='object')
(2627, 9)


In [291]:
pred = rf.predict(test)
test['Segmentation'] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [292]:
test['ID'] = test_id_col
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation,ID
0,0,2,36,2,2,0.0,2,1.0,5,A,458989
1,1,2,37,2,5,8.0,0,4.0,5,C,458994
2,0,2,69,0,9,0.0,2,1.0,5,D,458996
3,1,2,59,0,4,11.0,1,2.0,5,C,459000
4,0,0,19,0,8,0.0,2,4.0,5,D,459001


In [293]:
# Copy the output with ID and Segmentation to a CSV file
test.to_csv("final_output.csv", columns=['ID', 'Segmentation'], index=False)