In [40]:
# Placement Cell – End-to-End Preprocessing + Predictive Model (Jupyter-ready)

# ==========
# 1) Imports
# ==========
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier



In [42]:
# ====================================
# 2) Load dataset (update path if needed)
# ====================================
df = pd.read_csv("placement_data_30.csv")

print("Raw shape:", df.shape)
display(df.head())



Raw shape: (31, 10)


Unnamed: 0,Name,Tenth_Percent,Twelfth_Percent,FE_Percent,SE_Percent,TE_Percent,Certifications,Projects_Completed,Internships,Placed
0,Aarav Joshi,88.5,82.3,70.2,72.1,75.4,2,3,1,Yes
1,Diya Mehta,92.1,85.4,78.5,80.1,82.6,4,4,2,Yes
2,Kabir Shah,75.0,70.2,65.3,66.7,69.5,1,2,1,No
3,Ishaan Verma,81.2,79.3,69.4,71.0,73.2,2,2,1,Yes
4,Anaya Rao,89.8,84.1,76.8,78.9,80.3,3,3,2,Yes


In [44]:
# =============================================================
# 3) Basic housekeeping – target encoding + duplicate handling
# =============================================================
# Expect 'Placed' column with values like 'Yes'/'No'.
# Convert to binary 1/0.
if 'Placed' not in df.columns:
    raise ValueError("Expected a 'Placed' column in the dataset.")

df['Placed'] = df['Placed'].astype(str).str.strip().str.lower().map({'yes':1, 'no':0})
if df['Placed'].isna().any():
    raise ValueError("Unable to map some 'Placed' values to 1/0. Please normalize the labels to Yes/No.")

# Drop exact duplicate rows (if any)
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"Duplicates removed: {before - after}")



Duplicates removed: 0


In [46]:
# ============================================
# 4) Separate features / target, find col types
# ============================================
target = 'Placed'
feature_cols = [c for c in df.columns if c != target]

# Identify numeric vs categorical (Name is categorical; the rest look numeric)
numeric_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
categorical_cols = [c for c in feature_cols if c not in numeric_cols]

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)



Numeric columns: ['Tenth_Percent', 'Twelfth_Percent', 'FE_Percent', 'SE_Percent', 'TE_Percent', 'Certifications', 'Projects_Completed', 'Internships']
Categorical columns: ['Name']


In [48]:
# =======================================
# 5) Manage missing values (imputation)
# =======================================
# We'll impute numerics with median, categoricals with most_frequent later in the pipeline.

# =========================================
# 6) Handle outliers (IQR clipping on numerics)
# =========================================
def iqr_clip(frame, cols, k=1.5):
    """Clip numeric columns by IQR to reduce effect of extreme outliers."""
    frame = frame.copy()
    for c in cols:
        q1 = frame[c].quantile(0.25)
        q3 = frame[c].quantile(0.75)
        iqr = q3 - q1
        low = q1 - k * iqr
        high = q3 + k * iqr
        frame[c] = frame[c].clip(lower=low, upper=high)
    return frame

df = iqr_clip(df, numeric_cols, k=1.5)



In [50]:
# ======================================================
# 7) Scaling / Standardization / Normalization (all done)
# ======================================================
# We'll demonstrate both:
# - Standardization (StandardScaler) for model training (Logistic Regression)
# - Normalization (MinMaxScaler) available as an alternate toggle if needed
#
# To keep things simple and optimal, we’ll train two models:
#   a) Logistic Regression (with StandardScaler) – simple, interpretable
#   b) Random Forest (robust to outliers & scaling) – often strong baseline

# ColumnTransformer for preprocessing
standard_numeric = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())  # standardization
])

# If you want strict normalization instead of standardization, switch StandardScaler -> MinMaxScaler:
normalized_numeric = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("normalize", MinMaxScaler())  # normalization
])

categorical_proc = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", drop=None))
])

standard_preprocessor = ColumnTransformer(
    transformers=[
        ("num", standard_numeric, numeric_cols),
        ("cat", categorical_proc, categorical_cols),
    ],
    remainder="drop"
)

normalized_preprocessor = ColumnTransformer(
    transformers=[
        ("num", normalized_numeric, numeric_cols),
        ("cat", categorical_proc, categorical_cols),
    ],
    remainder="drop"
)



In [52]:
# ================================
# 8) Train/test split & model fit
# ================================
X = df[feature_cols]
y = df[target].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=max(0.2, min(0.3, 1.0 - 20/len(df))) if len(df) > 25 else 0.25, random_state=42, stratify=y
)

# Pipelines
logreg_pipeline = Pipeline(steps=[
    ("pre", standard_preprocessor),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=None))
])

rf_pipeline = Pipeline(steps=[
    ("pre", standard_preprocessor),  # scaling not required for RF but harmless
    ("clf", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        random_state=42,
        class_weight="balanced_subsample"
    ))
])

# Fit
logreg_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)



In [54]:
# ===============
# 9) Evaluation
# ===============
def evaluate(model, name):
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, zero_division=0)
    print(f"\n{name} → Accuracy: {acc:.3f} | F1: {f1:.3f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds, zero_division=0))

evaluate(logreg_pipeline, "Logistic Regression")
evaluate(rf_pipeline, "Random Forest")

# Pick the better model by F1 (you can switch metric if you prefer)
def pick_best(m1, m2, names=("Model1","Model2")):
    p1 = m1.predict(X_test)
    p2 = m2.predict(X_test)
    f1_1 = f1_score(y_test, p1, zero_division=0)
    f1_2 = f1_score(y_test, p2, zero_division=0)
    return (m1, names[0], f1_1) if f1_1 >= f1_2 else (m2, names[1], f1_2)

best_model, best_name, best_f1 = pick_best(logreg_pipeline, rf_pipeline, ("Logistic Regression", "Random Forest"))
print(f"\nSelected model: {best_name} (F1={best_f1:.3f})")




Logistic Regression → Accuracy: 1.000 | F1: 1.000
Confusion Matrix:
 [[3 0]
 [0 7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         7

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Random Forest → Accuracy: 1.000 | F1: 1.000
Confusion Matrix:
 [[3 0]
 [0 7]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         7

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Selected model: Logistic Regression (F1=1.000)


In [56]:
# ====================================================
# 10) Predict for “2024–25 batch” sample candidate(s)
# ====================================================
# Your dataset doesn't contain an explicit 'batch' column, so we treat new candidates
# as 2024–25 batch samples by directly providing their features here.

sample_cases = pd.DataFrame([
    {
        # Sample Candidate 1 (strong profile)
        "Name": "Sample_2025_A",
        "Tenth_Percent": 88.0,
        "Twelfth_Percent": 85.0,
        "FE_Percent": 78.0,
        "SE_Percent": 80.0,
        "TE_Percent": 82.0,
        "Certifications": 3,
        "Projects_Completed": 4,
        "Internships": 1
    },
    {
        # Sample Candidate 2 (average profile)
        "Name": "Sample_2025_B",
        "Tenth_Percent": 70.0,
        "Twelfth_Percent": 68.0,
        "FE_Percent": 62.0,
        "SE_Percent": 60.0,
        "TE_Percent": 61.0,
        "Certifications": 1,
        "Projects_Completed": 2,
        "Internships": 0
    }
])

# Ensure columns match the training features
missing_cols = [c for c in feature_cols if c not in sample_cases.columns]
for c in missing_cols:
    sample_cases[c] = np.nan  # will be imputed

sample_cases = sample_cases[feature_cols]

sample_pred = best_model.predict(sample_cases)
sample_prob = (best_model.predict_proba(sample_cases)[:, 1]
               if hasattr(best_model, "predict_proba") else np.full(len(sample_cases), np.nan))

out = sample_cases.copy()
out["Predicted_Placement"] = np.where(sample_pred==1, "Placed", "Not Placed")
out["Confidence(Prob)"] = np.round(sample_prob, 3)
display(out)

# ==========================
# 11) (Optional) Save model
# ==========================
# from joblib import dump
# dump(best_model, "/mnt/data/placement_model.joblib")
# print("Saved model to /mnt/data/placement_model.joblib")

Unnamed: 0,Name,Tenth_Percent,Twelfth_Percent,FE_Percent,SE_Percent,TE_Percent,Certifications,Projects_Completed,Internships,Predicted_Placement,Confidence(Prob)
0,Sample_2025_A,88.0,85.0,78.0,80.0,82.0,3,4,1,Placed,0.993
1,Sample_2025_B,70.0,68.0,62.0,60.0,61.0,1,2,0,Not Placed,0.014
