In [1]:
import sklearn
print(sklearn.__version__)


1.3.2


In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [3]:
file_path = 'updated_placement_by_aptitude.csv'
df = pd.read_csv(file_path)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Roll No.                219 non-null    int64  
 1   Student code            219 non-null    object 
 2   Name                    219 non-null    object 
 3   marks_10                219 non-null    float64
 4   marks_12                219 non-null    float64
 5   Marks_sem1              213 non-null    float64
 6   Marks_sem2              202 non-null    float64
 7   Marks_sem3              197 non-null    float64
 8   Marks_sem4              190 non-null    float64
 9   Marks_sem5              189 non-null    float64
 10  Marks_sem6              193 non-null    float64
 11  Marks_sem7              199 non-null    float64
 12  Marks_sem8              208 non-null    float64
 13  CGPA_Final              208 non-null    object 
 14  Backlog_Count           0 non-null      fl

In [5]:
new_features = ['Marks_sem1 ', 'Marks_sem2', 'Marks_sem3', 'Marks_sem4',
                 'Marks_sem5', 'Marks_sem6', 'Marks_sem7', 'Marks_sem8', 'Soft_Skills_Score', 'Aptitude_Score']

In [6]:
# train ML model
X = df[new_features]

# Handle any missing values (filling with mean for simplicity)
X = X.fillna(X.median())

'''le = LabelEncoder()'''
y = df['Placement_domain']

In [7]:
y.head(10)

0        tech
1    non-tech
2    non-tech
3    non-tech
4    non-tech
5    non-tech
6    non-tech
7        tech
8    non-tech
9        tech
Name: Placement_domain, dtype: object

In [8]:
# Adjust k_neighbors for SMOTE to be less than the number of minority samples (which is 5 for 'Advanced')
smote = SMOTE(sampling_strategy='minority', k_neighbors=4)
x_sm, y_sm = smote.fit_resample(X, y)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    x_sm, y_sm, test_size=0.2, random_state=16, stratify=y_sm
)

In [10]:
clf = RandomForestClassifier(n_estimators = 500)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
# Fix: Use target_names appropriate for the binary classification (Placement_domain)
report = classification_report(y_test, pred, target_names=['Not Placed', 'Placed'])
print(report)

              precision    recall  f1-score   support

  Not Placed       0.87      0.96      0.91        27
      Placed       0.96      0.86      0.91        28

    accuracy                           0.91        55
   macro avg       0.91      0.91      0.91        55
weighted avg       0.91      0.91      0.91        55



In [11]:
clf.predict([[20, 70, 60, 70, 60, 65, 50, 45, 40, 68]])



array(['non-tech'], dtype=object)

In [12]:
# --- Save model (pickle) ---
with open('Interview_Preparedness.pkl', 'wb') as f:
    pickle.dump({'model': clf}, f)

In [13]:
# clf = MLPClassifier(hidden_layer_sizes=(500, 100), max_iter=1000, activation = 'relu', solver = 'adam', batch_size = 32, learning_rate_init = 0.0001)
# clf.fit(X_train, y_train)

# pred = clf.predict(X_test)
# report = classification_report(y_test, pred)
# print(report)

In [14]:
# import numpy as np
# from sklearn.metrics import log_loss

# def flatten_params(coefs, intercepts):
#     return np.concatenate(
#         [w.ravel() for w in coefs] +
#         [b.ravel() for b in intercepts]
#     )

# def unflatten_params(flat, shapes):
#     params = []
#     idx = 0
#     for shape in shapes:
#         size = np.prod(shape)
#         params.append(flat[idx:idx+size].reshape(shape))
#         idx += size
#     return params

# original_params = flatten_params(clf.coefs_, clf.intercepts_)
# shapes = [w.shape for w in clf.coefs_] + [b.shape for b in clf.intercepts_]


In [15]:
# np.random.seed(42)

# d1 = np.random.randn(original_params.size)
# d2 = np.random.randn(original_params.size)

# d1 /= np.linalg.norm(d1)
# d2 /= np.linalg.norm(d2)


In [16]:
# alphas = np.linspace(-1, 1, 25)
# betas = np.linspace(-1, 1, 25)

# loss_surface = np.zeros((len(alphas), len(betas)))

# for i, a in enumerate(alphas):
#     for j, b in enumerate(betas):
#         new_params = original_params + a * d1 + b * d2
#         new_params_list = unflatten_params(new_params, shapes)

#         # Assign perturbed parameters back to clf
#         clf.coefs_ = new_params_list[:len(clf.coefs_)]
#         clf.intercepts_ = new_params_list[len(clf.coefs_):]

#         probs = clf.predict_proba(X)
#         loss_surface[i, j] = log_loss(y, probs)


In [17]:
# import matplotlib.pyplot as plt

# A, B = np.meshgrid(alphas, betas)

# plt.figure(figsize=(8, 6))
# plt.contourf(A, B, loss_surface.T, levels=30)
# plt.colorbar()
# plt.xlabel("Direction 1")
# plt.ylabel("Direction 2")
# plt.title("Loss Surface Slice of sklearn MLPClassifier (clf)")
# plt.show()
