In [61]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import joblib


In [64]:
DATA_PATH = "students.csv"
df = pd.read_csv(DATA_PATH)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (2000, 24)


Unnamed: 0,Student ID,Name,Age,Gender,Branch,Average GPA,Backlogs,Attendance (%),Clubs,Skills,...,CTC (LPA),Alumni Path,Sem1 GPA,Sem2 GPA,Sem3 GPA,Sem4 GPA,Sem5 GPA,Sem6 GPA,Sem7 GPA,Sem8 GPA
0,b37412a3-fa5c-40ca-b087-7109a47482d7,Amit,21.0,Male,MECH,6.99,0.0,70.45,"Literary Society, Robotics","C++, Machine Learning, Python, Java",...,0.0,,8.34,5.24,6.06,7.81,7.18,7.04,8.26,5.99
1,65001423-faeb-4d19-a550-8ea26133a34d,Tina Ballard,19.0,Male,ECE,6.46,0.0,65.45,"Literary Society, Coding Club","C++, SQL, Web Development",...,0.0,,7.37,6.37,6.45,5.99,6.55,6.99,6.46,5.52
2,228aaf1d-8710-4c8d-bfec-001b0df01f34,Meena,19.0,Female,CIVIL,7.08,0.0,84.79,Literary Society,"Data Science, SQL, Machine Learning",...,0.0,,5.79,8.82,7.03,6.92,7.67,6.01,7.75,6.62
3,1aa67474-e200-4157-ad54-c95aaa4d52a8,Christopher Morales,23.0,Male,ECE,6.92,2.0,62.26,"Coding Club, Entrepreneurship Cell","Python, SQL, Java",...,5.89,Research,7.9,4.26,8.19,7.52,8.32,6.8,4.94,7.43
4,5122db4a-14c8-44d7-b2f4-e869f69d3dd7,Siddharth,20.0,Male,EEE,7.09,0.0,97.12,Robotics,"C++, SQL, Java, Python",...,0.0,,6.86,6.24,7.64,7.15,5.91,6.18,8.19,8.54


In [65]:
TARGET_COL = "Placement Status"

df[TARGET_COL] = df[TARGET_COL].map({
    "Placed": 1,
    "Not Placed": 0
})

df = df.dropna(subset=[TARGET_COL]).reset_index(drop=True)

y = df[TARGET_COL].values

assert not np.isnan(y).any(), "❌ Target still has NaN"
print("Target distribution:")
print(pd.Series(y).value_counts())


Target distribution:
1.0    1032
0.0     967
Name: count, dtype: int64


In [66]:
NUMERIC_COLS = [
    "Age",
    "Average GPA",
    "Backlogs",
    "Attendance (%)",
    "Sem1 GPA", "Sem2 GPA", "Sem3 GPA", "Sem4 GPA",
    "Sem5 GPA", "Sem6 GPA", "Sem7 GPA", "Sem8 GPA"
]

df[NUMERIC_COLS] = df[NUMERIC_COLS].fillna(df[NUMERIC_COLS].median())

X_numeric = df[NUMERIC_COLS].values
feature_names = NUMERIC_COLS.copy()


In [67]:
df["Internship Done"] = df["Internship Done"].map({"Yes": 1, "No": 0}).fillna(0)

X_binary = df[["Internship Done"]].values
feature_names.append("Internship Done")


In [None]:
branch_ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
branch_encoded = branch_ohe.fit_transform(df[["Branch"]])

branch_feature_names = branch_ohe.get_feature_names_out(["Branch"])
feature_names.extend(branch_feature_names.tolist())



['branch_ohe.joblib']

In [69]:
sbert = SentenceTransformer("all-MiniLM-L6-v2")


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 345.16it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [70]:
SKILLS_COL = "Skills"
df[SKILLS_COL] = df[SKILLS_COL].fillna("")

skill_texts = df[SKILLS_COL].tolist()

print("Embedding skills...")
skill_embeddings = sbert.encode(
    skill_texts,
    show_progress_bar=True
)

feature_names.extend([f"skill_emb_{i}" for i in range(skill_embeddings.shape[1])])


Embedding skills...


Batches: 100%|██████████| 63/63 [00:06<00:00,  9.89it/s]


In [71]:
CLUBS_COL = "Clubs"
df[CLUBS_COL] = df[CLUBS_COL].fillna("")

club_texts = df[CLUBS_COL].tolist()

print("Embedding clubs...")
club_embeddings = sbert.encode(
    club_texts,
    show_progress_bar=True
)

feature_names.extend([f"club_emb_{i}" for i in range(club_embeddings.shape[1])])


Embedding clubs...


Batches: 100%|██████████| 63/63 [00:13<00:00,  4.74it/s]


In [72]:
INTERNSHIP_DOMAIN_COL = "Internship Domain"
df[INTERNSHIP_DOMAIN_COL] = df[INTERNSHIP_DOMAIN_COL].fillna("")

internship_domain_embeddings = sbert.encode(
    df[INTERNSHIP_DOMAIN_COL].tolist(),
    show_progress_bar=True
)

feature_names.extend([
    f"internship_domain_emb_{i}"
    for i in range(internship_domain_embeddings.shape[1])
])


Batches: 100%|██████████| 63/63 [00:02<00:00, 24.33it/s]


In [73]:
X = np.hstack([
    X_numeric,
    X_binary,
    branch_encoded,
    skill_embeddings,
    club_embeddings,
    internship_domain_embeddings
])

print("Final X shape:", X.shape)
print("Final y shape:", y.shape)

assert X.shape[0] == y.shape[0]
assert len(feature_names) == X.shape[1]


Final X shape: (1999, 1171)
Final y shape: (1999,)


In [74]:
joblib.dump(X, "X_features.pkl")
joblib.dump(y, "y_labels.pkl")
joblib.dump(feature_names, "feature_names.pkl")
joblib.dump(branch_ohe, "branch_encoder.pkl")
joblib.dump(sbert, "sbert_model.pkl")

print("✅ All preprocessing artifacts saved")


✅ All preprocessing artifacts saved
