In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Load data
train = pd.read_csv('/content/train_LZdllcl.csv')
test = pd.read_csv('/content/test_2umaH9m.csv')
sample_submission = pd.read_csv('/content/sample_submission_M0L0uXE.csv')

# Exploratory Data Analysis (print summaries)
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print(train.head())
print(train.info())

# Check if target column exists
target_column = 'is_promoted'
if target_column not in train.columns:
    raise ValueError(f"Target column '{target_column}' not found in training data.")

# Separate target variable
X = train.drop(columns=[target_column])
y = train[target_column]

# Ensure test has same features as X
test_features = X.columns.intersection(test.columns).tolist()
test = test[X.columns]  # Keep only training features in same order

# Store test IDs for final submission
test_ids = test['UniqueID'] if 'UniqueID' in test.columns else test.index

# Combine for consistent preprocessing
combined = pd.concat([X, test], axis=0, ignore_index=True)

# Fill missing values
combined.fillna(-999, inplace=True)

# Encode categorical features
label_encoders = {}
for col in combined.select_dtypes(include='object').columns:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))
    label_encoders[col] = le

# Split back
X_processed = combined.iloc[:len(X)].copy()
test_processed = combined.iloc[len(X):].copy()

# Split training data for validation
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
val_preds_rf = rf_model.predict(X_val)
print("Validation Accuracy (Random Forest):", accuracy_score(y_val, val_preds_rf))

# Train XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
val_preds_xgb = xgb_model.predict(X_val)
print("Validation Accuracy (XGBoost):", accuracy_score(y_val, val_preds_xgb))

# Use the better model (you can switch manually)
final_model = xgb_model  # or rf_model

# Predict on test set
final_preds = final_model.predict(test_processed)

# Create submission
submission = sample_submission.copy()

# Ensure the correct column name exists in sample submission
if 'is_promoted' not in submission.columns:
    raise ValueError("Column 'is_promoted' not found in sample submission file.")

submission['is_promoted'] = final_preds
submission.to_csv('final_submission.csv', index=False)
print("✅ Submission file saved as 'final_submission.csv'")

Train shape: (54808, 14)
Test shape: (23490, 13)
   employee_id         department     region         education gender  \
0        65438  Sales & Marketing   region_7  Master's & above      f   
1        65141         Operations  region_22        Bachelor's      m   
2         7513  Sales & Marketing  region_19        Bachelor's      m   
3         2542  Sales & Marketing  region_23        Bachelor's      m   
4        48945         Technology  region_26        Bachelor's      m   

  recruitment_channel  no_of_trainings  age  previous_year_rating  \
0            sourcing                1   35                   5.0   
1               other                1   30                   5.0   
2            sourcing                1   34                   3.0   
3               other                2   39                   1.0   
4               other                1   45                   3.0   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  \
0                  8     

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy (XGBoost): 0.9417989417989417
✅ Submission file saved as 'final_submission.csv'


In [2]:
from sklearn.metrics import classification_report

# Generate and print classification report for Random Forest model
print("Classification Report (Random Forest):")
print(classification_report(y_val, val_preds_rf))

# Generate and print classification report for XGBoost model
print("\nClassification Report (XGBoost):")
print(classification_report(y_val, val_preds_xgb))

Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     10054
           1       0.92      0.26      0.40       908

    accuracy                           0.94     10962
   macro avg       0.93      0.63      0.68     10962
weighted avg       0.94      0.94      0.92     10962


Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     10054
           1       0.87      0.35      0.50       908

    accuracy                           0.94     10962
   macro avg       0.91      0.67      0.73     10962
weighted avg       0.94      0.94      0.93     10962



In [13]:
# Create new features
combined['department_region'] = combined['department'].astype(str) + '_' + combined['region'].astype(str)
combined['training_interaction'] = combined['no_of_trainings'] * combined['avg_training_score']
combined['age_group'] = pd.cut(combined['age'], bins=[0, 25, 35, 45, 55, 65, 100], labels=['<25', '25-34', '35-44', '45-54', '55-64', '65+'], right=False).astype(str) # Ensure string type before encoding
combined['service_age_ratio'] = combined['length_of_service'] / combined['age']

# Drop original 'employee_id' column
combined = combined.drop(columns=['employee_id'])

# Re-encode categorical features including the new ones
for col in combined.select_dtypes(include='object').columns:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))
    label_encoders[col] = le # Update the label_encoders dictionary

# Split back
X_processed = combined.iloc[:len(X)].copy()
test_processed = combined.iloc[len(X):].copy()

display(X_processed.head())
display(test_processed.head())

KeyError: "['employee_id'] not found in axis"

In [4]:
print("Missing values in train dataset:")
print(train.isnull().sum())

print("\nMissing values in test dataset:")
print(test.isnull().sum())

# Analyze distribution of columns with missing values
print("\nValue counts for 'education' in train:")
print(train['education'].value_counts(dropna=False))

print("\nValue counts for 'previous_year_rating' in train:")
print(train['previous_year_rating'].value_counts(dropna=False))

print("\nValue counts for 'education' in test:")
print(test['education'].value_counts(dropna=False))

print("\nValue counts for 'previous_year_rating' in test:")
print(test['previous_year_rating'].value_counts(dropna=False))

Missing values in train dataset:
employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

Missing values in test dataset:
employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

Value counts for 'education' in train:
education
Bachelor's          36669
Master's & above    14925
NaN                  24

In [5]:
# Impute 'education' with the mode
train['education'].fillna(train['education'].mode()[0], inplace=True)
test['education'].fillna(test['education'].mode()[0], inplace=True)

# Impute 'previous_year_rating' with the median
train['previous_year_rating'].fillna(train['previous_year_rating'].median(), inplace=True)
test['previous_year_rating'].fillna(test['previous_year_rating'].median(), inplace=True)

# Verify missing values are handled
print("\nMissing values in train dataset after imputation:")
print(train.isnull().sum())

print("\nMissing values in test dataset after imputation:")
print(test.isnull().sum())


Missing values in train dataset after imputation:
employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

Missing values in test dataset after imputation:
employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['education'].fillna(train['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['education'].fillna(test['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [17]:
print("X_train shape:", X_train.shape)
print("X_train dtypes:\n", X_train.dtypes)
print("Any NaNs in X_train?", X_train.isnull().values.any())
print("y_train unique values:", y_train.unique())



X_train shape: (43846, 16)
X_train dtypes:
 department                 int64
region                     int64
education                  int64
gender                     int64
recruitment_channel        int64
no_of_trainings            int64
age                        int64
previous_year_rating     float64
length_of_service          int64
KPIs_met >80%              int64
awards_won?                int64
avg_training_score         int64
department_region          int64
training_interaction       int64
age_group               category
service_age_ratio        float64
dtype: object
Any NaNs in X_train? False
y_train unique values: [0 1]


In [18]:
import numpy as np

# Ensure X_train and y_train contain no NaN and are numeric
if X_train.isnull().values.any() or y_train.isnull().values.any():
    raise ValueError("Training data contains missing values.")

# Ensure target is in integer format
y_train = y_train.astype(int)
y_val = y_val.astype(int)

# Convert to numpy arrays (optional but safe for XGBoost)
X_train_np = X_train.values
X_val_np = X_val.values
test_np = test_processed.values

# Fit the XGBoost model
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)

xgb_model.fit(X_train_np, y_train)
val_preds_xgb = xgb_model.predict(X_val_np)
print("Validation Accuracy (XGBoost):", accuracy_score(y_val, val_preds_xgb))


ValueError: could not convert string to float: '25-34'