In [28]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier


In [7]:
train_df = pd.read_csv('/content/train_LZdllcl.csv')
test_df = pd.read_csv('/content/test_2umaH9m.csv')

train_df.head()
test_df.head()


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [8]:
train_df.shape

(54808, 14)

In [9]:
test_df.shape

(23490, 13)

In [10]:
train_df.isnull().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,2409
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,4124
length_of_service,0


In [12]:
# Fill missing education with mode
train_df['education'].fillna(train_df['education'].mode()[0])

Unnamed: 0,education
0,Master's & above
1,Bachelor's
2,Bachelor's
3,Bachelor's
4,Bachelor's
...,...
54803,Bachelor's
54804,Master's & above
54805,Bachelor's
54806,Bachelor's


In [13]:
# Fill missing ratings with median
train_df['previous_year_rating'].fillna(train_df['previous_year_rating'].median())

Unnamed: 0,previous_year_rating
0,5.0
1,5.0
2,3.0
3,1.0
4,3.0
...,...
54803,3.0
54804,2.0
54805,5.0
54806,1.0


In [14]:
test_df.isnull().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,1034
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,1812
length_of_service,0


In [15]:
# Fill missing education with mode
test_df['education'].fillna(test_df['education'].mode()[0])

Unnamed: 0,education
0,Bachelor's
1,Bachelor's
2,Bachelor's
3,Bachelor's
4,Bachelor's
...,...
23485,Below Secondary
23486,Bachelor's
23487,Bachelor's
23488,Bachelor's


In [16]:
# Fill missing ratings with median
test_df['previous_year_rating'].fillna(test_df['previous_year_rating'].median())

Unnamed: 0,previous_year_rating
0,3.0
1,3.0
2,1.0
3,2.0
4,4.0
...,...
23485,3.0
23486,3.0
23487,4.0
23488,3.0


In [19]:
#Encode Categorical Variables
combined = pd.concat([train_df.drop('is_promoted', axis=1), test_df], axis=0)
categorical = ['department', 'region', 'education', 'gender', 'recruitment_channel']

combined_encoded = pd.get_dummies(combined, columns=categorical, drop_first=True)

# Split back to train/test
X_train = combined_encoded.iloc[:len(train_df), :]
X_test = combined_encoded.iloc[len(train_df):, :]
y_train = train_df['is_promoted']

In [20]:
#splitting
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [21]:
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_tr, y_tr)

In [22]:
#Evaluate Model
val_preds = model.predict(X_val)
print("Validation F1 Score:", f1_score(y_val, val_preds))

Validation F1 Score: 0.416597510373444


In [23]:
#Predict on Test Set
test_preds = model.predict(X_test)
submission = pd.DataFrame({
    'employee_id': test_df['employee_id'],
    'is_promoted': test_preds
})
submission.to_csv('submission.csv', index=False)


In [None]:
#Train XGBoost Model
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5,
                          use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_tr, y_tr)

#Evaluating Model
val_preds = xgb_model.predict(X_val)
f1x = f1_score(y_val, val_preds)
print(f"Validation F1 Score: {f1x:.4f}")

#Predict
test_preds = xgb_model.predict(X_test)
submission = pd.DataFrame({
    'employee_id': test_df['employee_id'],
    'is_promoted': test_preds
})

#Submission
submission.to_csv("submission1.csv", index=False)
print("Saved submission.csv")