<a href="https://colab.research.google.com/github/Rashin-Rafeeq/AI_Assignments/blob/main/Third_Intermediate_Assessment_Supervised_Learning(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

Load data

In [9]:
train_df = pd.read_csv('train.csv')
test_df  = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission1.csv')

In [10]:
train_df.describe()


Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,39195.830627,1.253011,34.803915,3.329256,5.865512,0.351974,0.023172,63.38675,0.08517
std,22586.581449,0.609264,7.660169,1.259993,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,19669.75,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,39225.5,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,58730.5,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,78298.0,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [11]:
train_df.isnull().sum()


Unnamed: 0,0
employee_id,0
department,0
region,0
education,2409
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,4124
length_of_service,0


In [12]:
train_df['is_promoted'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
is_promoted,Unnamed: 1_level_1
0,0.91483
1,0.08517


In [13]:
def create_features(df):
    df = df.copy()

    # 1. KPIs and awards interaction (strong signal)
    df['kpi_award'] = df['KPIs_met >80%'] * df['awards_won?']

    # 2. Rating × KPIs
    df['rating_kpi'] = df['previous_year_rating'] * df['KPIs_met >80%']

    # 3. Training score relative to department average
    dept_avg = df.groupby('department')['avg_training_score'].transform('mean')
    df['score_vs_dept'] = df['avg_training_score'] / (dept_avg + 1e-6)

    # 4. Age / service ratio (experience maturity)
    df['age_service_ratio'] = df['age'] / (df['length_of_service'] + 1)

    # 5. Training effort × performance
    df['train_effort_score'] = df['no_of_trainings'] * df['avg_training_score']

    # 6. Binned training score
    df['score_bin'] = pd.cut(df['avg_training_score'],
                             bins=[0, 50, 60, 70, 80, 90, 101],
                             labels=[0,1,2,3,4,5], include_lowest=True)

    return df

train_df = create_features(train_df)
test_df  = create_features(test_df)

Pre-processing

In [None]:
imputer_rating = SimpleImputer(strategy='median')
train_df['previous_year_rating'] = imputer_rating.fit_transform(train_df[['previous_year_rating']]).ravel()
test_df['previous_year_rating']  = imputer_rating.transform(test_df[['previous_year_rating']]).ravel()

imputer_edu = SimpleImputer(strategy='most_frequent')
train_df['education'] = imputer_edu.fit_transform(train_df[['education']]).ravel()
test_df['education']  = imputer_edu.transform(test_df[['education']]).ravel()

# Label Encoding
cat_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']

for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    # Handle unseen categories in test
    test_df[col] = test_df[col].astype(str).map(lambda s: '<unknown>' if s not in le.classes_ else s)
    le.classes_ = np.append(le.classes_, '<unknown>')
    test_df[col] = le.transform(test_df[col])

# Features list (now includes new ones)
features = [
    'department', 'region', 'education', 'gender', 'recruitment_channel',
    'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
    'KPIs_met >80%', 'awards_won?', 'avg_training_score',
    'kpi_award', 'rating_kpi', 'score_vs_dept', 'age_service_ratio',
    'train_effort_score', 'score_bin'
]

X = train_df[features]
y = train_df['is_promoted']

# Scaling numerical columns
num_cols = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
            'avg_training_score', 'score_vs_dept', 'age_service_ratio', 'train_effort_score']

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# ────────────────────────────────────────────────
# LightGBM Model with Randomized Search
# ────────────────────────────────────────────────
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_unbalance': True,
    'boosting_type': 'gbdt',
    'random_state': 42,
    'verbosity': -1
}

model = lgb.LGBMClassifier(**lgb_params)

param_dist = {
    'n_estimators': [200, 400, 600, 800],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'num_leaves': [21, 31, 41, 51],
    'max_depth': [5, 7, 9, -1],
    'min_child_samples': [20, 40, 60],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

random_search = RandomizedSearchCV(
    model, param_distributions=param_dist,
    n_iter=20, cv=5, scoring='f1', random_state=42,
    n_jobs=-1, verbose=1
)

random_search.fit(X, y)

print("Best parameters:", random_search.best_params_)
print("Best CV F1 score:", random_search.best_score_)

best_model = random_search.best_estimator_

# ────────────────────────────────────────────────
# Threshold tuning on OOF predictions (optional but recommended)
# ────────────────────────────────────────────────
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(y))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

    best_model.fit(X_tr, y_tr)
    oof_preds[val_idx] = best_model.predict_proba(X_va)[:, 1]

# Find best threshold
best_thresh, best_f1 = 0.5, 0
for thresh in np.arange(0.30, 0.70, 0.005):
    pred = (oof_preds >= thresh).astype(int)
    f1 = f1_score(y, pred)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print(f"Best threshold: {best_thresh:.3f} → CV F1: {best_f1:.4f}")

# ────────────────────────────────────────────────
# Final prediction on test set
# ────────────────────────────────────────────────
X_test = test_df[features]
X_test[num_cols] = scaler.transform(X_test[num_cols])

test_proba = best_model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_thresh).astype(int)

# ────────────────────────────────────────────────
# Create submission
# ────────────────────────────────────────────────
submission_df['is_promoted'] = test_pred
submission_df.to_csv('submission_improved.csv', index=False)

print("Submission saved as 'submission_improved.csv'")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = scaler.fit_transform(X[num_cols])


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'subsample': 0.7, 'num_leaves': 51, 'n_estimators': 800, 'min_child_samples': 20, 'max_depth': 9, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Best CV F1 score: 0.4515051751903821
Best threshold: 0.695 → CV F1: 0.4925


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[num_cols] = scaler.transform(X_test[num_cols])


Find Best Threshold

In [None]:
# Cell 7: Predict on test set and save submission
print("Making final predictions...")

test_proba = lgb_model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_thresh).astype(int)

submission_df['is_promoted'] = test_pred
submission_df.to_csv('submission_final.csv', index=False)

print("Submission saved as 'submission_final.csv'")
print("Ready to download and submit!")

# Download in Colab
from google.colab import files
files.download('submission_final.csv')