In [14]:
# Resolve the project directories used by the rest of the notebook
import os

# BASE_DIR is the parent of the current working directory, as an absolute path
BASE_DIR = os.path.abspath(
    os.path.join(os.getcwd(), "..")
)
# Processed datasets are read from <BASE_DIR>/Data/processed
DATA2_DIR = os.path.join(os.path.join(BASE_DIR, "Data"), "processed")

In [15]:
# Read the engineered placement dataset from the processed-data directory
import pandas as pd

# NOTE: despite its name, `df` holds the CSV *path* (a string), not a DataFrame
df = os.path.join(DATA2_DIR, "placement_engineered.csv")
placement_eng_df = pd.read_csv(df)

# This preview is computed but not rendered: only the cell's last expression is shown
placement_eng_df.head()
# Cell output: the column index of the loaded frame
placement_eng_df.columns

Index(['cgpa', 'college_tier', 'branch', 'internship_count', 'project_count',
       'skills_score', 'communication_score', 'certifications', 'dsa_score',
       'placed', 'cgpa_norm', 'internship_count_norm', 'project_count_norm',
       'skills_score_norm', 'communication_score_norm', 'certifications_norm',
       'dsa_score_norm', 'profile_strength_index', 'technical_strength',
       'cgpa_internship', 'project_skill_interaction',
       'internship_communication', 'psi_tech_interaction', 'college_weight'],
      dtype='object')

In [16]:
# Feature selection: split the frame into target (Y) and predictors (X)
Y = placement_eng_df["placed"]

# Every column except the target becomes a predictor. get_dummies one-hot encodes
# the non-numeric columns (presumably `branch` / `college_tier` -- confirm dtypes),
# and drop_first=True removes the first level of each encoded column.
X = pd.get_dummies(
    placement_eng_df.drop(columns=["placed"]),
    drop_first=True,
)

In [17]:
# Train-test splitting: hold out 20% of the rows, stratified on the target
from sklearn.model_selection import train_test_split

# random_state is fixed so the same split is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=23
)

In [18]:
# Scaling for the logistic-regression model
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only ...
scaler = StandardScaler().fit(X_train)
# ... then apply that same fitted transform to both splits
X_train_Scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Baseline model: logistic regression trained on the standardized features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Default hyperparameters; fit() returns the fitted estimator
log = LogisticRegression().fit(X_train_Scaled, Y_train)

# Hard labels feed the classification report; the second predict_proba column
# (the probability of class 1) feeds ROC-AUC
Y_predict_log = log.predict(X_test_scaled)
Y_prob_log = log.predict_proba(X_test_scaled)[:, 1]

print(classification_report(Y_test, Y_predict_log))
print(roc_auc_score(Y_test, Y_prob_log))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88      6375
           1       0.79      0.77      0.78      3625

    accuracy                           0.84     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.84      0.84      0.84     10000

0.9206376849222447


In [20]:
# Second candidate model: random forest, for comparison against the logistic baseline
from sklearn.ensemble import RandomForestClassifier

# Trained on the raw (unscaled) training features, unlike the logistic model
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, Y_train)

# Evaluate on the same held-out split with the same metrics as the baseline
Y_pred_rf = rf.predict(X_test)
Y_prob_rf = rf.predict_proba(X_test)[:, 1]

print(classification_report(Y_test, Y_pred_rf))
print("ROC-AUC:", roc_auc_score(Y_test, Y_prob_rf))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94      6375
           1       0.87      0.93      0.90      3625

    accuracy                           0.92     10000
   macro avg       0.91      0.93      0.92     10000
weighted avg       0.93      0.92      0.92     10000

ROC-AUC: 0.9700994942528736


In [22]:
# Persist the final model and the scaler under <BASE_DIR>/models
import pickle
import os

Model_Path = os.path.join(BASE_DIR, "models")
os.makedirs(Model_Path, exist_ok=True)  # create the folder on first run, reuse it afterwards

# NOTE(review): `rf` was fitted on the unscaled X_train, while `scaler` was fitted for
# the logistic baseline. Confirm that whatever loads these files does not apply
# scaler.pkl to inputs before calling the forest in model1.pkl.
for file_name, artifact in (("model1.pkl", rf), ("scaler.pkl", scaler)):
    with open(os.path.join(Model_Path, file_name), "wb") as f:
        pickle.dump(artifact, f)