In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Loading Data

Train Data

In [130]:
df_train=pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,ID,Sex,Bachelor,Age,Graduated,Career,Work Experience,Family Expenses,Family Size,Variable,Segmentation,Description
0,1462220581,Male,No,41,Yes,Fashion Designer,2,Low,3,Dog_6,Akshat,The specimen is reliable and experienced.
1,1462220232,Female,Yes,38,Yes,HR,0,Average,2,Dog_6,Akshat,The specimen is reliable and experienced.
2,1462226544,Male,Yes,41,Yes,Fashion Designer,0,High,5,Dog_6,Bhavesh,The specimen is reliable and experienced.
3,1462224171,Male,No,21,No,Doctor,0,Low,4,Dog_6,Darsh,The specimen is responsible and hardworking.
4,1462226427,Female,No,37,Yes,Fashion Designer,0,Low,1,Dog_2,Bhavesh,The specimen is reliable and experienced.


Test Data

In [146]:
df_test=pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,ID,Sex,Bachelor,Age,Graduated,Career,Work Experience,Family Expenses,Family Size,Variable,Description
0,1462224759,Female,Yes,42.0,No,Scientist,4.0,Low,1.0,Dog_6,The specimen is reliable and experienced.
1,1462220081,Female,No,41.0,Yes,Fashion Designer,0.0,Low,1.0,Dog_6,The specimen is reliable and experienced.
2,1462226086,Female,No,18.0,No,Doctor,2.0,Low,5.0,Dog_6,The specimen is rather joyful than responsible.
3,1462224440,Male,No,30.0,Yes,Doctor,1.0,Low,4.0,Dog_6,The specimen is reliable and experienced.
4,1462222905,Male,Yes,81.0,Yes,Lawyer,1.0,Low,1.0,Dog_6,The specimen is reliable and experienced.


## Feature Engineering

In [132]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               5647 non-null   int64 
 1   Sex              5615 non-null   object
 2   Bachelor         5523 non-null   object
 3   Age              5625 non-null   object
 4   Graduated        5578 non-null   object
 5   Career           5534 non-null   object
 6   Work Experience  5074 non-null   object
 7   Family Expenses  5616 non-null   object
 8   Family  Size     5398 non-null   object
 9   Variable         5595 non-null   object
 10  Segmentation     5647 non-null   object
 11  Description      5647 non-null   object
dtypes: int64(1), object(11)
memory usage: 529.5+ KB


Standardize Column Name

In [133]:
df_train.columns=df_train.columns.str.strip()
df_test.columns=df_test.columns.str.strip()

In [134]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ID               5647 non-null   int64 
 1   Sex              5615 non-null   object
 2   Bachelor         5523 non-null   object
 3   Age              5625 non-null   object
 4   Graduated        5578 non-null   object
 5   Career           5534 non-null   object
 6   Work Experience  5074 non-null   object
 7   Family Expenses  5616 non-null   object
 8   Family  Size     5398 non-null   object
 9   Variable         5595 non-null   object
 10  Segmentation     5647 non-null   object
 11  Description      5647 non-null   object
dtypes: int64(1), object(11)
memory usage: 529.5+ KB


Data Type Conversion

In [135]:
numConvert=['Age','Work Experience','Family  Size']

# Converting to Numeric 
for col in numConvert:
    df_train[col]=pd.to_numeric(df_train[col],errors='coerce') #If 'coerce', then invalid parsing will be set as NaN.
    df_test[col]=pd.to_numeric(df_test[col],errors='coerce') 

In [136]:
df_train.dtypes

ID                   int64
Sex                 object
Bachelor            object
Age                float64
Graduated           object
Career              object
Work Experience    float64
Family Expenses     object
Family  Size       float64
Variable            object
Segmentation        object
Description         object
dtype: object

In [137]:
# Cleaning Family Expenses 
# NAN: None, Avege: Average, L0w:Low, 
temp={'NAN': None, 'Avege': 'Average', 'L0w': 'Low'}
df_train['Family Expenses']=df_train['Family Expenses'].replace(temp)
df_test['Family Expenses']=df_test['Family Expenses'].replace(temp)

In [138]:
# Clean 'Career' column
# Replace '1234', '_', 'Doktor' with NaN or corrected values
df_train['Career'] = df_train['Career'].replace({'1234': None, '_': None, 'Doktor': 'Doctor'})
df_test['Career'] = df_test['Career'].replace({'1234': None, '_': None, 'Doktor': 'Doctor'}) # Apply to test as well

In [139]:
df_train['Career'].unique()

array(['Fashion Designer', 'HR', 'Doctor', 'Lawyer', nan, 'Scientist',
       'Singer', 'Content Creation', 'Housewife', None], dtype=object)

Handling Missing Values

In [140]:
df_train.isnull().count()

ID                 5647
Sex                5647
Bachelor           5647
Age                5647
Graduated          5647
Career             5647
Work Experience    5647
Family Expenses    5647
Family  Size       5647
Variable           5647
Segmentation       5647
Description        5647
dtype: int64

In [141]:
numeric_column=['Age','Work Experience','Family  Size']
cat_column=['Sex','Bachelor','Graduated','Career','Variable','Family Expenses']

for col in numeric_column:
    train_median=df_train[col].median()
    print(col,'=',train_median)
    df_train[col].fillna(train_median,inplace=True)
    df_test[col].fillna(train_median,inplace=True)

for col in cat_column:
    train_mode=df_train[col].mode()[0]
    print(col,'=',train_mode)
    df_train.fillna(train_mode,inplace=True)
    df_test.fillna(train_mode,inplace=True)

Age = 40.0
Work Experience = 1.0
Family  Size = 2.0
Sex = Male
Bachelor = Yes
Graduated = Yes
Career = Fashion Designer
Variable = Dog_6
Family Expenses = Low


Feature Encoding

In [None]:
## Oridnal maping
family_expenses_mapping = {'Low': 0, 'Average': 1, 'High': 2}
df_train['Family Expenses'] = df_train['Family Expenses'].map(family_expenses_mapping)
df_test['Family Expenses'] = df_test['Family Expenses'].map(family_expenses_mapping)

In [144]:
# 2. Drop 'ID' and 'Description' columns
df_train.drop(columns=['ID', 'Description'], inplace=True)
df_test.drop(columns=['ID', 'Description'], inplace=True)

In [154]:
## One Hot encoding
nominal_cols = ['Sex', 'Bachelor', 'Graduated', 'Career', 'Variable']

# Combine train and test for consistent one-hot encoding
# Temporarily drop 'Segmentation' from train_df for concatenation
combined_df = pd.concat([df_train.drop(columns=['Segmentation']), df_test], ignore_index=True)
combined_df = pd.get_dummies(combined_df, columns=nominal_cols, drop_first=True)

# Separate back into train and test
# Use .copy() to avoid SettingWithCopyWarning later
df_train_scaled = combined_df.iloc[:len(df_train)].copy()
df_test_scaled = combined_df.iloc[len(df_train):].copy()

# Add 'Segmentation' back to the training dataframe
df_train_scaled['Segmentation'] = df_train['Segmentation']


# Display info and head of processed dataframes
print("\nProcessed Train DataFrame Info after encoding and dropping columns:")
df_train_scaled.info()
print("\nProcessed Train DataFrame Head after encoding and dropping columns:")
print(df_train_scaled.head())

print("\nProcessed Test DataFrame Info after encoding and dropping columns:")
df_test_scaled.info()
print("\nProcessed Test DataFrame Head after encoding and dropping columns:")
print(df_test_scaled.head())


Processed Train DataFrame Info after encoding and dropping columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      5647 non-null   float64
 1   Work Experience          5647 non-null   float64
 2   Family Expenses          5615 non-null   object 
 3   Family  Size             5647 non-null   float64
 4   ID                       0 non-null      float64
 5   Description              0 non-null      object 
 6   Sex_Male                 5647 non-null   bool   
 7   Bachelor_No              5647 non-null   bool   
 8   Bachelor_Yes             5647 non-null   bool   
 9   Graduated_No             5647 non-null   bool   
 10  Graduated_Yes            5647 non-null   bool   
 11  Career_Doctor            5647 non-null   bool   
 12  Career_Fashion Designer  5647 non-null   bool   
 13  Career_HR

In [156]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns for scaling. These are the ones that were converted to numeric and imputed.
numerical_cols_for_scaling = ['Age', 'Work Experience', 'Family  Size']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler ONLY on the training data's numerical columns and then transform them
df_train_scaled[numerical_cols_for_scaling] = scaler.fit_transform(df_train_scaled[numerical_cols_for_scaling])

# Transform the test data's numerical columns using the scaler fitted on the training data
df_test_scaled[numerical_cols_for_scaling] = scaler.transform(df_test_scaled[numerical_cols_for_scaling])

# Display info and head of processed dataframes after scaling
print("\nFinal Processed Train DataFrame Info:")
df_train_scaled.info()
print("\nFinal Processed Train DataFrame Head:")
print(df_train_scaled.head())

print("\nFinal Processed Test DataFrame Info:")
df_test_scaled.info()
print("\nFinal Processed Test DataFrame Head:")
print(df_test_scaled.head())

# Display value counts of the target variable 'Segmentation' (only in train_processed_df)
print("\nTarget Variable 'Segmentation' Value Counts (Train):")
print(df_train_scaled['Segmentation'].value_counts())


Final Processed Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5647 entries, 0 to 5646
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      5647 non-null   float64
 1   Work Experience          5647 non-null   float64
 2   Family Expenses          5615 non-null   object 
 3   Family  Size             5647 non-null   float64
 4   ID                       0 non-null      float64
 5   Description              0 non-null      object 
 6   Sex_Male                 5647 non-null   bool   
 7   Bachelor_No              5647 non-null   bool   
 8   Bachelor_Yes             5647 non-null   bool   
 9   Graduated_No             5647 non-null   bool   
 10  Graduated_Yes            5647 non-null   bool   
 11  Career_Doctor            5647 non-null   bool   
 12  Career_Fashion Designer  5647 non-null   bool   
 13  Career_HR                5647 non-null 

## DataSet Splitting

In [None]:
from sklearn.model_selection import train_test_split
x=df_train_scaled.drop('Segmentation',axis=1)
y=df_train_scaled['Segmentation']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

Final Dataset

In [159]:
df_train.head()

Unnamed: 0,Sex,Bachelor,Age,Graduated,Career,Work Experience,Family Expenses,Family Size,Variable,Segmentation
0,Male,No,41.0,Yes,Fashion Designer,2.0,0.0,3.0,Dog_6,Akshat
1,Female,Yes,38.0,Yes,HR,0.0,1.0,2.0,Dog_6,Akshat
2,Male,Yes,41.0,Yes,Fashion Designer,0.0,2.0,5.0,Dog_6,Bhavesh
3,Male,No,21.0,No,Doctor,0.0,0.0,4.0,Dog_6,Darsh
4,Female,No,37.0,Yes,Fashion Designer,0.0,0.0,1.0,Dog_2,Bhavesh


In [160]:
df_test.head()

Unnamed: 0,ID,Sex,Bachelor,Age,Graduated,Career,Work Experience,Family Expenses,Family Size,Variable,Description
0,1462224759,Female,Yes,42.0,No,Scientist,4.0,Low,1.0,Dog_6,The specimen is reliable and experienced.
1,1462220081,Female,No,41.0,Yes,Fashion Designer,0.0,Low,1.0,Dog_6,The specimen is reliable and experienced.
2,1462226086,Female,No,18.0,No,Doctor,2.0,Low,5.0,Dog_6,The specimen is rather joyful than responsible.
3,1462224440,Male,No,30.0,Yes,Doctor,1.0,Low,4.0,Dog_6,The specimen is reliable and experienced.
4,1462222905,Male,Yes,81.0,Yes,Lawyer,1.0,Low,1.0,Dog_6,The specimen is reliable and experienced.


In [161]:
df_train_scaled.head()

Unnamed: 0,Age,Work Experience,Family Expenses,Family Size,ID,Description,Sex_Male,Bachelor_No,Bachelor_Yes,Graduated_No,...,Career_Scientist,Career_Singer,Variable_Dog_2,Variable_Dog_3,Variable_Dog_4,Variable_Dog_5,Variable_Dog_6,Variable_Dog_7,Variable_Male,Segmentation
0,-0.146347,-0.048361,0.0,0.046993,,,True,True,False,False,...,False,False,False,False,False,False,True,False,False,Akshat
1,-0.325797,-0.196059,1.0,-0.28184,,,False,False,True,False,...,False,False,False,False,False,False,True,False,False,Akshat
2,-0.146347,-0.196059,2.0,0.704658,,,True,False,True,False,...,False,False,False,False,False,False,True,False,False,Bhavesh
3,-1.342676,-0.196059,0.0,0.375826,,,True,True,False,True,...,False,False,False,False,False,False,True,False,False,Darsh
4,-0.385613,-0.196059,0.0,-0.610673,,,False,True,False,False,...,False,False,True,False,False,False,False,False,False,Bhavesh


In [162]:
df_test_scaled.head()

Unnamed: 0,Age,Work Experience,Family Expenses,Family Size,ID,Description,Sex_Male,Bachelor_No,Bachelor_Yes,Graduated_No,...,Career_Male,Career_Scientist,Career_Singer,Variable_Dog_2,Variable_Dog_3,Variable_Dog_4,Variable_Dog_5,Variable_Dog_6,Variable_Dog_7,Variable_Male
5647,-0.086531,0.099337,Low,-0.610673,1462225000.0,The specimen is reliable and experienced.,False,False,True,True,...,False,True,False,False,False,False,False,True,False,False
5648,-0.146347,-0.196059,Low,-0.610673,1462220000.0,The specimen is reliable and experienced.,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
5649,-1.522125,-0.048361,Low,0.704658,1462226000.0,The specimen is rather joyful than responsible.,False,True,False,True,...,False,False,False,False,False,False,False,True,False,False
5650,-0.804328,-0.12221,Low,0.375826,1462224000.0,The specimen is reliable and experienced.,True,True,False,False,...,False,False,False,False,False,False,False,True,False,False
5651,2.24631,-0.12221,Low,-0.610673,1462223000.0,The specimen is reliable and experienced.,True,False,True,False,...,False,False,False,False,False,False,False,True,False,False


In [163]:
x_train.head()

Unnamed: 0,Age,Work Experience,Family Expenses,Family Size,ID,Description,Sex_Male,Bachelor_No,Bachelor_Yes,Graduated_No,...,Career_Male,Career_Scientist,Career_Singer,Variable_Dog_2,Variable_Dog_3,Variable_Dog_4,Variable_Dog_5,Variable_Dog_6,Variable_Dog_7,Variable_Male
95,-0.206164,-0.196059,0.0,-0.610673,,,False,False,True,False,...,False,True,False,True,False,False,False,False,False,False
5571,0.451817,-0.12221,1.0,0.375826,,,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False
842,-0.684695,0.468583,0.0,0.375826,,,True,True,False,True,...,False,False,True,False,False,False,False,True,False,False
1832,-1.043594,-0.12221,0.0,-0.610673,,,False,True,False,False,...,False,True,False,False,False,False,False,True,False,False
2409,0.870532,-0.196059,1.0,1.033491,,,False,False,True,True,...,False,True,False,False,False,True,False,False,False,False


In [164]:
y_train.head()

95       Akshat
5571    Bhavesh
842      Akshat
1832      Darsh
2409     Akshat
Name: Segmentation, dtype: object

In [165]:
y_train.head()

95       Akshat
5571    Bhavesh
842      Akshat
1832      Darsh
2409     Akshat
Name: Segmentation, dtype: object

In [166]:
y_test.head()

4996    Chaitanya
4351       Akshat
3080        Darsh
2835        Darsh
1197        Darsh
Name: Segmentation, dtype: object

In [168]:
df_train_scaled.to_csv('train_data_final_processed.csv', index=False)
print("Saved 'train_data_final_processed.csv' without the DataFrame index.")

# Save the processed test DataFrame without the index
df_test_scaled.to_csv('test_data_final_processed.csv', index=False)
print("Saved 'test_data_final_processed.csv' without the DataFrame index.")

Saved 'train_data_final_processed.csv' without the DataFrame index.
Saved 'test_data_final_processed.csv' without the DataFrame index.
