In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
train_df = pd.read_csv('../train_test_files/train.csv')

In [3]:
train_df.head()

Unnamed: 0,response_id,age,gender,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,promotions_count,...,dependents_count,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,exit_status
0,8410,31,Male,19,Education,5390.0,Excellent,Medium,Average,2,...,0.0,Mid,Medium,89.0,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534.0,Poor,High,Low,3,...,3.0,Mid,Medium,21.0,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159.0,Good,High,Low,0,...,3.0,Mid,Medium,74.0,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989.0,Good,High,High,1,...,2.0,Mid,Small,50.0,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821.0,,,Average,0,...,0.0,Senior,Medium,68.0,No,No,No,Fair,Medium,Stayed


In [4]:
train_df.shape

(59611, 24)

In [5]:
train_df.drop_duplicates(inplace=True)

In [6]:
train_df.shape

(59598, 24)

In [7]:
null_value_percentages=(train_df.isna().sum()/train_df.shape[0])*100
null_value_percentages

response_id                  0.000000
age                          0.000000
gender                       0.000000
years_at_company             0.000000
job_role                     0.000000
monthly_income               2.998423
work_life_balance           16.998893
job_satisfaction            11.998725
performance_rating           0.000000
promotions_count             0.000000
overtime                     0.000000
distance_from_home           0.000000
education_level              0.000000
marital_status               0.000000
dependents_count             7.998591
job_level                    0.000000
company_size                 4.998490
company_tenure               6.998557
remote_work                  0.000000
leadership_opportunities     0.000000
innovation_opportunities     0.000000
company_reputation           0.000000
employee_recognition         0.000000
exit_status                  0.000000
dtype: float64

In [8]:
columns_with_null_values = null_value_percentages[null_value_percentages>0]
print(columns_with_null_values)

monthly_income        2.998423
work_life_balance    16.998893
job_satisfaction     11.998725
dependents_count      7.998591
company_size          4.998490
company_tenure        6.998557
dtype: float64


In [9]:
for columns in train_df.columns:
    print(columns , train_df[columns].dtype)

response_id int64
age int64
gender object
years_at_company int64
job_role object
monthly_income float64
work_life_balance object
job_satisfaction object
performance_rating object
promotions_count int64
overtime object
distance_from_home int64
education_level object
marital_status object
dependents_count float64
job_level object
company_size object
company_tenure float64
remote_work object
leadership_opportunities object
innovation_opportunities object
company_reputation object
employee_recognition object
exit_status object


In [10]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [11]:
train_df['exit_status'] = label_encoder.fit_transform(train_df['exit_status'])

In [12]:
for i in columns_with_null_values.index:
    if train_df[i].dtype != object:
        print(i,train_df[i].corr(train_df['exit_status']))

monthly_income 0.011012139419045604
dependents_count 0.07764193757796053
company_tenure 0.030896520388690493


In [13]:
train_df['dependents_count'].fillna(train_df['dependents_count'].mode()[0],inplace=True)

In [14]:
train_df['company_tenure'].fillna(train_df['company_tenure'].mean(),inplace=True)

In [15]:
rows_to_drop=null_value_percentages[(null_value_percentages<5) & (null_value_percentages>0)].sort_values(ascending=False)
rows_to_drop

company_size      4.998490
monthly_income    2.998423
dtype: float64

In [16]:
rows_to_drop=rows_to_drop.keys()
rows_to_drop

Index(['company_size', 'monthly_income'], dtype='object')

In [17]:
for row in rows_to_drop:
	train_df.drop(labels=train_df.index[train_df[row].isna()],inplace=True)

In [18]:
train_df.shape

(56619, 24)

In [19]:
def encode_categorical_column(df, column):
    """
    Encodes a categorical column in the given DataFrame using LabelEncoder,
    while preserving NaN values and keeping the original column name unchanged.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the column to encode.
    column (str): The name of the column to encode.

    Returns:
    pd.DataFrame: The DataFrame with the encoded column.
    """
    # Initialize LabelEncoder
    encoder = LabelEncoder()

    # Fit the encoder on the non-null values only
    encoder.fit(df[column].dropna())

    # Create an array to hold the encoded values
    encoded_values = np.full(df[column].shape, np.nan)  # Create an array filled with NaN

    # Transform non-null values only
    mask = df[column].notna()  # Boolean mask for non-null values
    encoded_values[mask] = encoder.transform(df[column][mask])  # Encode non-null values

    # Update the original column with the encoded values
    df[column] = encoded_values

    return df

In [20]:
train_df = encode_categorical_column(train_df,'work_life_balance')
train_df = encode_categorical_column(train_df,'job_satisfaction')

In [21]:
from sklearn.impute import KNNImputer


In [22]:
imputer = KNNImputer(n_neighbors=5)

In [23]:
train_df[['work_life_balance','job_satisfaction']]=imputer.fit_transform(train_df[['work_life_balance','job_satisfaction']])

In [24]:
for columns in train_df.columns :
    if train_df[columns].dtype==object:
        train_df[columns] = label_encoder.fit_transform(train_df[columns])

In [25]:
train_df.head()

Unnamed: 0,response_id,age,gender,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,promotions_count,...,dependents_count,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition,exit_status
0,8410,31,1,19,0,5390.0,0.0,2.0,0,2,...,0.0,1,1,89.0,0,0,0,0,2,1
1,64756,59,0,4,3,5534.0,3.0,0.0,3,3,...,3.0,1,1,21.0,0,0,0,1,1,1
2,30257,24,0,10,2,8159.0,2.0,0.0,3,0,...,3.0,1,1,74.0,0,0,0,3,1,1
3,65791,36,0,7,0,3989.0,2.0,0.0,2,1,...,2.0,1,2,50.0,1,0,0,2,2,1
4,65026,56,1,41,0,4821.0,1.478743,1.102561,0,0,...,0.0,2,1,68.0,0,0,0,1,2,1


In [26]:
for columns in train_df.columns:
    print(columns,train_df[columns].corr(train_df['exit_status']))

response_id -0.002183471645753873
age 0.048602294395804616
gender 0.1019788052093233
years_at_company 0.06411002325487215
job_role 0.007676660258304976
monthly_income 0.011216900128775393
work_life_balance -0.05719257886918651
job_satisfaction -0.04433339915174787
performance_rating -0.03236514930345137
promotions_count 0.08192952657415685
overtime -0.05832677444770884
distance_from_home -0.09420386617005841
education_level 0.050213533220125284
marital_status -0.2296170398684206
dependents_count 0.07381869541123076
job_level 0.3148909351567855
company_size -0.028216753870159202
company_tenure 0.030570706976841774
remote_work 0.21907188098342736
leadership_opportunities 0.009105769709558646
innovation_opportunities 0.021670400437270084
company_reputation -0.04040736868648639
employee_recognition 0.00010728095162131245
exit_status 1.0


In [27]:
train_df.drop(['leadership_opportunities','employee_recognition','job_role','response_id'],axis=1,inplace=True)

In [28]:
train_df.shape

(56619, 20)

In [29]:
train_df['dependents_count'].isna().sum()

0

In [30]:
test_df = pd.read_csv('../train_test_files/test.csv')

In [31]:
test_df.head()

Unnamed: 0,response_id,age,gender,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,promotions_count,...,marital_status,dependents_count,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition
0,52685,36,Male,13,Healthcare,8029.0,Excellent,High,Average,1,...,Married,1.0,Mid,Large,22.0,No,No,No,Poor,Medium
1,30585,35,Male,7,Education,4563.0,Good,High,Average,1,...,Single,4.0,Entry,Medium,27.0,No,No,No,Good,High
2,54656,50,Male,7,Education,5583.0,Fair,High,Average,3,...,Divorced,2.0,Senior,Medium,76.0,No,No,Yes,Good,Low
3,33442,58,Male,44,Media,5525.0,,,High,0,...,Single,4.0,Entry,Medium,96.0,No,No,No,Poor,Low
4,15667,39,Male,24,Education,4604.0,Good,High,Average,0,...,Married,6.0,Mid,Large,45.0,Yes,No,No,Good,High


In [32]:
for columns in test_df.columns :
    if test_df[columns].dtype==object:
        test_df[columns] = label_encoder.fit_transform(test_df[columns])

In [33]:
test_df.head()

Unnamed: 0,response_id,age,gender,years_at_company,job_role,monthly_income,work_life_balance,job_satisfaction,performance_rating,promotions_count,...,marital_status,dependents_count,job_level,company_size,company_tenure,remote_work,leadership_opportunities,innovation_opportunities,company_reputation,employee_recognition
0,52685,36,1,13,2,8029.0,0,0,0,1,...,1,1.0,1,0,22.0,0,0,0,3,2
1,30585,35,1,7,0,4563.0,2,0,0,1,...,2,4.0,0,1,27.0,0,0,0,2,0
2,54656,50,1,7,0,5583.0,1,0,0,3,...,0,2.0,2,1,76.0,0,0,1,2,1
3,33442,58,1,44,3,5525.0,4,4,2,0,...,2,4.0,0,1,96.0,0,0,0,3,1
4,15667,39,1,24,0,4604.0,2,0,0,0,...,1,6.0,1,0,45.0,1,0,0,2,0


In [34]:
null_value_percentages_test=(test_df.isna().sum()/train_df.shape[0])*100
null_value_percentages_test

response_id                 0.000000
age                         0.000000
gender                      0.000000
years_at_company            0.000000
job_role                    0.000000
monthly_income              0.789488
work_life_balance           0.000000
job_satisfaction            0.000000
performance_rating          0.000000
promotions_count            0.000000
overtime                    0.000000
distance_from_home          0.000000
education_level             0.000000
marital_status              0.000000
dependents_count            2.105300
job_level                   0.000000
company_size                0.000000
company_tenure              1.842138
remote_work                 0.000000
leadership_opportunities    0.000000
innovation_opportunities    0.000000
company_reputation          0.000000
employee_recognition        0.000000
dtype: float64

In [35]:
test_df['dependents_count'].fillna(test_df['dependents_count'].mode()[0],inplace=True)

In [36]:
test_df['company_tenure'].fillna(test_df['company_tenure'].mean(),inplace=True)

In [37]:
test_df['monthly_income'].fillna(test_df['monthly_income'].mean(),inplace=True)

In [38]:
null_value_percentages_test=(test_df.isna().sum()/train_df.shape[0])*100
null_value_percentages_test

response_id                 0.0
age                         0.0
gender                      0.0
years_at_company            0.0
job_role                    0.0
monthly_income              0.0
work_life_balance           0.0
job_satisfaction            0.0
performance_rating          0.0
promotions_count            0.0
overtime                    0.0
distance_from_home          0.0
education_level             0.0
marital_status              0.0
dependents_count            0.0
job_level                   0.0
company_size                0.0
company_tenure              0.0
remote_work                 0.0
leadership_opportunities    0.0
innovation_opportunities    0.0
company_reputation          0.0
employee_recognition        0.0
dtype: float64

In [39]:
# from sklearn.ensemble import RandomForestClassifier 
# from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
# from xgboost import XGBClassifier


In [40]:
# model = RandomForestClassifier(random_state=42)
# model_xg = XGBClassifier( random_state=42)

In [41]:
# # param_grid = {
# #     'n_estimators': [100,200],  # Number of trees in the forest
# #     'max_depth': [5,10],  # Maximum depth of the tree
# #     'min_samples_split': [2, 4,8],  # Minimum number of samples required to split a node
# #     'min_samples_leaf': [2, 4],  # Minimum number of samples required to be at a leaf node
# #     'max_features': ['sqrt', 'log2']  # Number of features to consider for the best split
# # }
# param_dist = {
#     'n_estimators': [100,150,200],
#     'max_depth': [5, 10, 15],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2'],
#     'criterion' : ['gini']
# }
# # param_grid_xg={
# #     'n_estimators': [50, 100, 150],
# #     'learning_rate': [0.01, 0.1, 0.2],
# #     'max_depth': [3, 4, 5],
# #     'subsample': [0.8, 1.0],
# #     'colsample_bytree': [0.8, 1.0]
# # } 
# #74.405
# param_grid_xg = {
#     'n_estimators': [100, 200],
#     'learning_rate': [0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0],
#     'gamma': [0, 0.1],
#     'min_child_weight': [1, 3],
#     'reg_alpha': [0, 0.1],
#     'reg_lambda': [1, 2]
# }
# #74.494


In [42]:
# grid_search = GridSearchCV(
#     estimator=model_xg, 
#     param_grid=param_grid_xg, 
#     scoring='f1',  # or another scoring metric you prefer
#     cv=5,                # Number of cross-validation folds
#     verbose=2,           # Level of verbosity
#     n_jobs=-1            # Use all available cores
# )


In [43]:
# grid_search.fit(train_df.drop(['exit_status'],axis=1), train_df['exit_status'])


In [44]:
x = test_df['response_id']
test_df.drop(['leadership_opportunities','employee_recognition','job_role'],axis=1,inplace=True)

In [45]:
# final = grid_search.predict(test_df)

In [46]:
# predictions = ["Stayed" if pred == 1 else "Left" for pred in final]

In [47]:
# output_df = pd.DataFrame({
#     'response_id': x,
#     'Predictions': predictions
# })

In [48]:
# output_df.to_csv('combined_output.csv', index=False)

In [49]:
train_df.to_csv('../train_test_files/processed_train.csv',index=False)

In [50]:
test_df.to_csv('../train_test_files/processed_test.csv',index=False)