<a href="https://colab.research.google.com/github/Rashin-Rafeeq/AI_Assignments/blob/main/Third_Intermediate_Assessment_Supervised_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

Load data

In [5]:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
submission_df = pd.read_csv('/content/sample_submission1.csv')

In [6]:
print(train_df.describe())


        employee_id  no_of_trainings           age  previous_year_rating  \
count  54808.000000     54808.000000  54808.000000          50684.000000   
mean   39195.830627         1.253011     34.803915              3.329256   
std    22586.581449         0.609264      7.660169              1.259993   
min        1.000000         1.000000     20.000000              1.000000   
25%    19669.750000         1.000000     29.000000              3.000000   
50%    39225.500000         1.000000     33.000000              3.000000   
75%    58730.500000         1.000000     39.000000              4.000000   
max    78298.000000        10.000000     60.000000              5.000000   

       length_of_service  KPIs_met >80%   awards_won?  avg_training_score  \
count       54808.000000   54808.000000  54808.000000        54808.000000   
mean            5.865512       0.351974      0.023172           63.386750   
std             4.265094       0.477590      0.150450           13.371559   
min    

In [7]:
print(train_df.isnull().sum())


employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64


In [8]:
print(train_df['is_promoted'].value_counts(normalize=True))

is_promoted
0    0.91483
1    0.08517
Name: proportion, dtype: float64


Pre-processing

In [9]:
imputer_rating = SimpleImputer(strategy='median')
train_df['previous_year_rating'] = imputer_rating.fit_transform(train_df[['previous_year_rating']]).ravel()
test_df['previous_year_rating'] = imputer_rating.transform(test_df[['previous_year_rating']]).ravel()

imputer_edu = SimpleImputer(strategy='most_frequent')
train_df['education'] = imputer_edu.fit_transform(train_df[['education']]).ravel()
test_df['education'] = imputer_edu.transform(test_df[['education']]).ravel()

cat_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']

for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    # Handle unknown in test
    test_df[col] = test_df[col].apply(lambda x: x if x in le.classes_ else '<Unknown>')
    if '<Unknown>' not in le.classes_:
        le.classes_ = np.append(le.classes_, '<Unknown>')
    test_df[col] = le.transform(test_df[col])

features = ['department', 'region', 'education', 'gender', 'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score']

X = train_df[features]
y = train_df['is_promoted']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

num_cols = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'avg_training_score']
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])

# Modeling and fine tuning
scale_pos = (y_train == 0).sum() / (y_train == 1).sum()
xgb = XGBClassifier(random_state=42, scale_pos_weight=scale_pos)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

grid = GridSearchCV(xgb, param_grid, cv=3, scoring='f1')
grid.fit(X_train, y_train)

best_xgb = grid.best_estimator_
y_pred = best_xgb.predict(X_val)
print("Validation F1 Score:", f1_score(y_val, y_pred))

# Predict on test
X_test = test_df[features]
X_test[num_cols] = scaler.transform(X_test[num_cols])
predictions = best_xgb.predict(X_test)

# Update and save submission
submission_df['is_promoted'] = predictions
submission_df.to_csv('submission.csv', index=False)



Validation F1 Score: 0.392204628501827


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[num_cols] = scaler.transform(X_test[num_cols])


In [10]:
print("Submission saved to submission.csv")

Submission saved to submission.csv
