In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('/content/Train.csv')

In [4]:
df.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size
count,8068.0,8068.0,7239.0,7733.0
mean,463479.214551,43.466906,2.641663,2.850123
std,2595.381232,16.711696,3.406763,1.531413
min,458982.0,18.0,0.0,1.0
25%,461240.75,30.0,0.0,2.0
50%,463472.5,40.0,1.0,3.0
75%,465744.25,53.0,4.0,4.0
max,467974.0,89.0,14.0,9.0


In [12]:
def get_age_outlier_indices(df, factor=1.5):
    if 'Age' not in df.columns:
        raise ValueError("Column 'Age' not found in DataFrame.")

    Q1 = df['Age'].quantile(0.25)
    Q3 = df['Age'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    outlier_indices = df[(df['Age'] < lower_bound) | (df['Age'] > upper_bound)].index.tolist()

    return outlier_indices

In [13]:
outlier_indices = get_age_outlier_indices(df)
print("Outlier indices in 'Age':", outlier_indices)

Outlier indices in 'Age': [321, 395, 699, 753, 943, 1108, 1179, 1213, 1330, 1610, 1642, 1661, 1973, 1998, 2096, 2109, 2140, 2150, 2488, 2531, 2539, 2670, 2724, 2944, 3068, 3259, 3322, 3338, 3446, 3486, 3490, 3523, 3526, 3669, 3745, 3911, 4040, 4139, 4179, 4245, 4284, 4293, 4301, 4303, 4374, 4679, 4858, 4892, 4915, 5059, 5150, 5157, 5391, 5485, 5651, 5791, 5924, 6075, 6137, 6587, 6604, 6855, 6938, 6994, 7176, 7225, 7262, 7778, 7864, 7866, 7920]


In [14]:
def zscore_normalize_age(df):
    if 'Age' not in df.columns:
        raise ValueError("Column 'Age' not found in DataFrame.")

    mean_age = df['Age'].mean()
    std_age = df['Age'].std()

    df['Age_zscore'] = (df['Age'] - mean_age) / std_age

    return df

In [16]:
df = zscore_normalize_age(df)
print(df[['Age', 'Age_zscore']].head())

   Age  Age_zscore
0   22   -1.284544
1   38   -0.327131
2   67    1.408181
3   67    1.408181
4   40   -0.207454


In [17]:
df.drop('Age', axis=1, inplace=True)

In [18]:
df.drop('ID', axis=1, inplace=True)

In [20]:
# Check for duplicates in the entire DataFrame
num_duplicates = df.duplicated().sum()

print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 417


In [21]:
# Remove duplicates and keep the first occurrence
df_no_duplicates = df.drop_duplicates(keep='first')

In [23]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [24]:
encoded_columns = pd.get_dummies(df['Gender'], prefix='Gender')
df = pd.concat([df, encoded_columns], axis=1)
df.drop('Gender', axis=1, inplace=True)

In [26]:
df['Gender_Female'].head()

Unnamed: 0,Gender_Female
0,False
1,True
2,True
3,False
4,True


In [27]:
df['Graduated'].unique()

array(['No', 'Yes', nan], dtype=object)

In [28]:
num_missing = df['Graduated'].isnull().sum()
print(f"Number of missing values in 'Graduated': {num_missing}")

Number of missing values in 'Graduated': 78


In [29]:
# Calculate the mode of the 'Graduated' column
mode_graduated = df['Graduated'].mode()[0]  # [0] to get the first mode (in case of multiple modes)

# Impute missing values with the mode
df['Graduated'].fillna(mode_graduated, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Graduated'].fillna(mode_graduated, inplace=True)


In [30]:
df['Graduated'].unique()

array(['No', 'Yes'], dtype=object)

In [31]:
encoded_columns = pd.get_dummies(df['Graduated'], prefix='Graduated')
df = pd.concat([df, encoded_columns], axis=1)
df.drop('Graduated', axis=1, inplace=True)

In [32]:
df['Graduated_No'].head()

Unnamed: 0,Graduated_No
0,True
1,False
2,False
3,False
4,False


In [34]:
df['Profession'].unique()

array(['Healthcare', 'Engineer', 'Lawyer', 'Entertainment', 'Artist',
       'Executive', 'Doctor', 'Homemaker', 'Marketing', nan], dtype=object)

In [35]:
num_missing = df['Profession'].isnull().sum()
print(f"Number of missing values in 'Profession': {num_missing}")

Number of missing values in 'Profession': 124


In [36]:
# Calculate the mode of the 'Profession' column
mode_profession = df['Profession'].mode()[0]  # [0] to get the first mode (in case of multiple modes)

# Impute missing values with the mode
df['Profession'].fillna(mode_profession, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Profession'].fillna(mode_profession, inplace=True)


In [37]:
df['Profession'].unique()

array(['Healthcare', 'Engineer', 'Lawyer', 'Entertainment', 'Artist',
       'Executive', 'Doctor', 'Homemaker', 'Marketing'], dtype=object)

In [38]:
encoded_columns = pd.get_dummies(df['Profession'], prefix='Profession')
df = pd.concat([df, encoded_columns], axis=1)
df.drop('Profession', axis=1, inplace=True)

In [39]:
df['Profession_Healthcare'].head()

Unnamed: 0,Profession_Healthcare
0,True
1,False
2,False
3,False
4,False


In [41]:
num_missing = df['Age_zscore'].isnull().sum()
print(f"Number of missing values in 'Age_zscore': {num_missing}")

Number of missing values in 'Age_zscore': 0


In [43]:
num_missing = df['Work_Experience'].isnull().sum()
print(f"Number of missing values in 'Work_Experience': {num_missing}")

Number of missing values in 'Work_Experience': 829


In [44]:
max_work_experience = df['Work_Experience'].max()
min_work_experience = df['Work_Experience'].min()

print(f"Maximum Work Experience: {max_work_experience}")
print(f"Minimum Work Experience: {min_work_experience}")

Maximum Work Experience: 14.0
Minimum Work Experience: 0.0


In [45]:
df['Work_Experience'].fillna(df['Work_Experience'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Work_Experience'].fillna(df['Work_Experience'].median(), inplace=True)


In [46]:
df['Work_Experience'].unique()

array([ 1.,  0.,  4.,  9., 12.,  3., 13.,  5.,  8., 14.,  7.,  2.,  6.,
       10., 11.])

In [48]:
scaler = MinMaxScaler()
df['Work_Experience_scaled'] = scaler.fit_transform(df[['Work_Experience']])

In [49]:
df.drop('Work_Experience', axis=1, inplace=True)

In [50]:
df['Spending_Score'].unique()

array(['Low', 'Average', 'High'], dtype=object)

In [51]:
spending_score_mapping = {
    'Low': 0,
    'Average': 1,
    'High': 2
}

df['Spending_Score_Encoded'] = df['Spending_Score'].map(spending_score_mapping)

In [52]:
df.drop('Spending_Score', axis=1, inplace=True)

In [54]:
df['Spending_Score_Encoded'].head()

Unnamed: 0,Spending_Score_Encoded
0,0
1,1
2,0
3,2
4,2


In [56]:
df['Family_Size'].fillna(df['Family_Size'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Family_Size'].fillna(df['Family_Size'].median(), inplace=True)


In [57]:
df['Family_Size'].unique()

array([4., 3., 1., 2., 6., 5., 8., 7., 9.])

In [58]:
df['Family_Size'] = scaler.fit_transform(df[['Family_Size']])

In [59]:
df['Var_1'].unique()

array(['Cat_4', 'Cat_6', 'Cat_7', 'Cat_3', 'Cat_1', 'Cat_2', nan, 'Cat_5'],
      dtype=object)

In [60]:
mode_profession = df['Var_1'].mode()[0]  # [0] to get the first mode (in case of multiple modes)

# Impute missing values with the mode
df['Var_1'].fillna(mode_profession, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Var_1'].fillna(mode_profession, inplace=True)


In [61]:
df['Var_1'].unique()

array(['Cat_4', 'Cat_6', 'Cat_7', 'Cat_3', 'Cat_1', 'Cat_2', 'Cat_5'],
      dtype=object)

In [62]:
encoded_columns = pd.get_dummies(df['Var_1'], prefix='Var_1')
df = pd.concat([df, encoded_columns], axis=1)
df.drop('Var_1', axis=1, inplace=True)

In [63]:
df['Var_1_Cat_1'].head()

Unnamed: 0,Var_1_Cat_1
0,False
1,False
2,False
3,False
4,False


In [65]:
mode_profession = df['Ever_Married'].mode()[0]  # [0] to get the first mode (in case of multiple modes)

# Impute missing values with the mode
df['Ever_Married'].fillna(mode_profession, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Ever_Married'].fillna(mode_profession, inplace=True)


In [66]:
encoded_columns = pd.get_dummies(df['Ever_Married'], prefix='Ever_Married')
df = pd.concat([df, encoded_columns], axis=1)
df.drop('Ever_Married', axis=1, inplace=True)

In [67]:
df['Ever_Married_No'].head()

Unnamed: 0,Ever_Married_No
0,True
1,False
2,False
3,False
4,False


In [70]:
df.to_csv('train_updated_temp.csv', index=False)