In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
train_data = pd.read_csv(r'C:\Users\msi!\OneDrive\Documents\Simi Docs\train_LZdllcl.csv')
test_data = pd.read_csv(r'C:\Users\msi!\OneDrive\Documents\Simi Docs\test_2umaH9m.csv')
sample_submission = pd.read_csv(r'C:\Users\msi!\OneDrive\Documents\Simi Docs\sample_submission_M0L0uXE.csv')

### 1. Exploratory Data Analysis (EDA)

In [3]:
print("Training Data Overview:\n", train_data.head())
print("Test Data Overview:\n", test_data.head())
print("\nTraining Data Info:\n")
train_data.info()
print("\nMissing Values in Training Data:\n", train_data.isnull().sum())
print("\nMissing Values in Test Data:\n", test_data.isnull().sum())

Training Data Overview:
    employee_id         department     region         education gender  \
0        65438  Sales & Marketing   region_7  Master's & above      f   
1        65141         Operations  region_22        Bachelor's      m   
2         7513  Sales & Marketing  region_19        Bachelor's      m   
3         2542  Sales & Marketing  region_23        Bachelor's      m   
4        48945         Technology  region_26        Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   35                   5.0   
1               other                1   30                   5.0   
2            sourcing                1   34                   3.0   
3               other                2   39                   1.0   
4               other                1   45                   3.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  \
0                  8              1            0 

### 2. Pre-processing for Training Data

In [4]:
train_data['education'].fillna(train_data['education'].mode()[0], inplace=True)
train_data['previous_year_rating'].fillna(train_data['previous_year_rating'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['education'].fillna(train_data['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['previous_year_rating'].fillna(train_data['previous_year_rating'].median(), inplace=True)


In [5]:
train_data_encoded = pd.get_dummies(train_data, columns=['department', 'region', 'education', 'gender', 'recruitment_channel'], drop_first=True)

In [6]:
X = train_data_encoded.drop(['employee_id', 'is_promoted'], axis=1)
y = train_data_encoded['is_promoted']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### 3. Model Training

In [8]:
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

In [9]:
best_f1 = 0
best_model = None
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_val_pred)
    print(f"{model_name} F1 Score: {f1}")

RandomForest F1 Score: 0.42924528301886794
GradientBoosting F1 Score: 0.4137353433835846


Parameters: { "use_label_encoder" } are not used.



XGBoost F1 Score: 0.5168195718654435


In [10]:
 if f1 > best_f1:
        best_f1 = f1
        best_model = model

In [11]:
print("\nBest Model:", best_model)
print("Best F1 Score on Validation Set:", best_f1)


Best Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
Best F1 Score on Validation Set: 0.5168195718654435


### 4. Hyperparameter Tuning 

In [12]:
if isinstance(best_model, RandomForestClassifier):
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
    }
    grid_search = GridSearchCV(best_model, param_grid, scoring='f1', cv=3)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print("Tuned Best Model:", best_model)

### 5. Pre-processing for Test Data

In [13]:
test_data['education'].fillna(test_data['education'].mode()[0], inplace=True)
test_data['previous_year_rating'].fillna(test_data['previous_year_rating'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['education'].fillna(test_data['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['previous_year_rating'].fillna(test_data['previous_year_rating'].median(), inplace=True)


In [14]:
test_data_encoded = pd.get_dummies(test_data, columns=['department', 'region', 'education', 'gender', 'recruitment_channel'], drop_first=True)
test_data_encoded = test_data_encoded.reindex(columns=X.columns, fill_value=0)

### 6. Prediction and Submission

In [15]:
test_predictions = best_model.predict(test_data_encoded)

In [16]:
submission = sample_submission.copy()
submission['is_promoted'] = test_predictions

In [17]:
submission.to_csv('final_submission.csv', index=False)
print("Submission file saved as 'final_submission.csv'")

Submission file saved as 'final_submission.csv'
