In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

Load Datasets

In [2]:
train = pd.read_csv('/content/drive/MyDrive/MLAL/train_LZdllcl.csv')
test = pd.read_csv('/content/drive/MyDrive/MLAL/test_2umaH9m.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/MLAL/sample_submission_M0L0uXE.csv')

Exploratory Data Analysis

In [3]:
train.head()
train.info()
train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


Preprocessing

In [4]:
train.isnull().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,2409
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,4124
length_of_service,0


education and previous_year_rating have missing values

In [5]:
test.isnull().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,1034
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,1812
length_of_service,0


Fill missing values

In [6]:
train['education'].fillna(train['education'].mode()[0], inplace=True)
test['education'].fillna(test['education'].mode()[0], inplace=True)

train['previous_year_rating'].fillna(train['previous_year_rating'].median(), inplace=True)
test['previous_year_rating'].fillna(test['previous_year_rating'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['education'].fillna(train['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['education'].fillna(test['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [7]:
train.isnull().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,0
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,0
length_of_service,0


In [8]:
test.isnull().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,0
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,0
length_of_service,0


In [9]:
# Encode categorical variables using one-hot encoding
train = pd.get_dummies(train, columns=['department', 'region', 'education', 'gender', 'recruitment_channel'],drop_first=True)
test_encoded = pd.get_dummies(test, columns=['department', 'region', 'education', 'gender', 'recruitment_channel'],drop_first=True)

In [10]:
train.head()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,department_Finance,...,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,gender_m,recruitment_channel_referred,recruitment_channel_sourcing
0,65438,1,35,5.0,8,1,0,49,0,False,...,False,False,True,False,False,False,True,False,False,True
1,65141,1,30,5.0,4,0,0,60,0,False,...,False,False,False,False,False,False,False,True,False,False
2,7513,1,34,3.0,7,0,0,50,0,False,...,False,False,False,False,False,False,False,True,False,True
3,2542,2,39,1.0,10,0,0,50,0,False,...,False,False,False,False,False,False,False,True,False,False
4,48945,1,45,3.0,2,0,0,73,0,False,...,False,False,False,False,False,False,False,True,False,False


In [11]:
test_encoded.head()

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Finance,department_HR,...,region_region_5,region_region_6,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,gender_m,recruitment_channel_referred,recruitment_channel_sourcing
0,8724,1,24,3.0,1,1,0,77,False,False,...,False,False,False,False,False,False,False,True,False,True
1,74430,1,31,3.0,5,0,0,51,False,True,...,False,False,False,False,False,False,False,False,False,False
2,72255,1,31,1.0,4,0,0,47,False,False,...,False,False,False,False,False,False,False,True,False,False
3,38562,3,31,2.0,9,0,0,65,False,False,...,False,False,False,False,False,False,False,False,False,False
4,64486,1,30,4.0,7,0,0,61,True,False,...,False,False,False,False,False,False,False,True,False,True


In [12]:
# Align test with train (to ensure matching columns)
test_encoded = test_encoded.reindex(columns=train.columns, fill_value=0)

Split Data for Training

In [13]:
X = train.drop(columns=['employee_id', 'is_promoted'])
y = train['is_promoted']

# Modeling

In [14]:
from sklearn.metrics import accuracy_score, f1_score

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_model.fit(X_train, y_train)

In [18]:
# Validation predictions
y_pred = rf_model.predict(X_val)

In [19]:
# Evaluation
print('Accuracy:', accuracy_score(y_val, y_pred))
print('F1 Score:', f1_score(y_val, y_pred))

Accuracy: 0.9374201787994891
F1 Score: 0.43211920529801323


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
log_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print('F1 Score:', f1_score(y_val, y_pred))


Accuracy: 0.7700237182995804
F1 Score: 0.377377130155594


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
gradient_model = GradientBoostingClassifier(random_state=42)
gradient_model.fit(X_train, y_train)
y_pred = gradient_model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print('F1 Score:', f1_score(y_val, y_pred))

Accuracy: 0.9406130268199234
F1 Score: 0.45885286783042395


In [23]:
from xgboost import XGBClassifier

In [24]:
XGB_model = XGBClassifier(random_state=42)
XGB_model.fit(X_train, y_train)
y_pred = XGB_model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print('F1 Score:', f1_score(y_val, y_pred))

Accuracy: 0.9433497536945813
F1 Score: 0.5059665871121718


In [25]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# GridSearch
grid_search = GridSearchCV(XGB_model,
                           param_grid,cv=3, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_val)

print('Best Parameters:', grid_search.best_params_)
print('Tuned Accuracy:', accuracy_score(y_val, y_pred_best))
print('Tuned F1 Score:', f1_score(y_val, y_pred_best))


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Tuned Accuracy: 0.942893632548805
Tuned F1 Score: 0.504746835443038


Generate Prediction for Test Data

In [33]:
test_data = pd.get_dummies(test)

In [34]:
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

In [35]:
test_predictions = best_model.predict(test_data)

In [36]:
test_predictions

array([0, 0, 0, ..., 0, 0, 1])

In [37]:
sample_submission['is_promoted'] = test_predictions
sample_submission.to_csv('submission.csv', index=False)