In [2]:
import sys
import os
sys.path.append(os.path.abspath("../.."))  # allow app imports

import numpy as np
import pickle
from sentence_transformers import SentenceTransformer

from app.database import SessionLocal
from app.models.student import StudentProfile
from app.models.job import Job

In [4]:
MODEL_PATH = "xgboost_placement_model.pkl"
FEATURE_NAMES_PATH = "feature_names.pkl"

with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

with open(FEATURE_NAMES_PATH, "rb") as f:
    FEATURE_NAMES = pickle.load(f)

print("Model expects features:", model.n_features_in_)
print("Feature names loaded:", len(FEATURE_NAMES))

Model expects features: 1171
Feature names loaded: 1171


In [5]:
sbert = SentenceTransformer("all-MiniLM-L6-v2")

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 167.42it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [6]:
db = SessionLocal()

students = db.query(StudentProfile).all()
jobs = db.query(Job).all()

print("Students in DB:", len(students))
print("Jobs in DB:", len(jobs))

Students in DB: 4
Jobs in DB: 2


In [11]:
def build_student_feature_vector(student):
    # ---------- Numeric features (EXACT training order) ----------
    numeric = np.array([
        student.age or 0.0,
        student.cgpa or 0.0,
        student.backlogs or 0.0,
        student.attendance or 0.0,

        student.sem1_gpa or 0.0,
        student.sem2_gpa or 0.0,
        student.sem3_gpa or 0.0,
        student.sem4_gpa or 0.0,
        student.sem5_gpa or 0.0,
        student.sem6_gpa or 0.0,
        student.sem7_gpa or 0.0,
        student.sem8_gpa or 0.0,

        1.0 if (student.internships or 0) > 0 else 0.0
    ], dtype=float)

    # ---------- Branch one-hot (ORDER MATTERS) ----------
    branches = ["CIVIL", "CSE", "ECE", "EEE", "IT", "MECH"]
    branch_vec = np.array(
        [1.0 if student.department == b else 0.0 for b in branches],
        dtype=float
    )

    # ---------- Skill embedding ----------
    skill_text = student.skills or ""
    skill_emb = sbert.encode([skill_text])  # (1, 384)

    # ---------- Club embedding ----------
    club_text = student.clubs or ""
    club_emb = sbert.encode([club_text])    # (1, 384)

    # ---------- Internship domain embedding ----------
    internship_domain_text = student.internship_domain or ""
    internship_domain_emb = sbert.encode([internship_domain_text])  # (1, 384)

    # ---------- Final vector ----------
    X = np.hstack([
        numeric.reshape(1, -1),      # (1, 13)
        branch_vec.reshape(1, -1),   # (1, 6)
        skill_emb,                   # (1, 384)
        club_emb,                    # (1, 384)
        internship_domain_emb        # (1, 384)
    ])

    return X


In [12]:
X_student = build_student_feature_vector(students[0])

print("Student vector size:", X_student.shape[1])
print("Model expects:", model.n_features_in_)

assert X_student.shape[1] == model.n_features_in_
print("✅ Feature alignment PERFECT")

Student vector size: 1171
Model expects: 1171
✅ Feature alignment PERFECT


In [14]:
placement_probs = []

for student in students:
    X_student = build_student_feature_vector(student)

    # Sanity check (keep this while developing)
    assert X_student.shape[1] == model.n_features_in_

    prob = model.predict_proba(X_student)[0][1]  # probability of "Placed"
    
    placement_probs.append({
        "student_id": student.id,
        "name": student.full_name,
        "placement_probability": round(float(prob), 4)
    })

placement_probs[:5]

[{'student_id': 5,
  'name': 'Vaishnavi Kamthe',
  'placement_probability': 0.5346},
 {'student_id': 6, 'name': 'Amit Sharma', 'placement_probability': 0.3741},
 {'student_id': 7, 'name': 'Sneha Patil', 'placement_probability': 0.5615},
 {'student_id': 8, 'name': 'Rahul Verma', 'placement_probability': 0.3053}]

In [15]:
import numpy as np

probs = np.array([p["placement_probability"] for p in placement_probs])

min_p, max_p = probs.min(), probs.max()

for p in placement_probs:
    p["placement_score"] = (
        (p["placement_probability"] - min_p) / (max_p - min_p)
        if max_p != min_p else 0.5
    )

placement_probs

[{'student_id': 5,
  'name': 'Vaishnavi Kamthe',
  'placement_probability': 0.5346,
  'placement_score': np.float64(0.8950039032006244)},
 {'student_id': 6,
  'name': 'Amit Sharma',
  'placement_probability': 0.3741,
  'placement_score': np.float64(0.2685402029664324)},
 {'student_id': 7,
  'name': 'Sneha Patil',
  'placement_probability': 0.5615,
  'placement_score': np.float64(1.0)},
 {'student_id': 8,
  'name': 'Rahul Verma',
  'placement_probability': 0.3053,
  'placement_score': np.float64(0.0)}]

In [18]:
job = db.query(Job).first()
job_text = " ".join([
    job.title or "",
    job.required_skills or "",
    job.description or ""
]).strip()

job_text


'Data Analyst python,sql,data analysis Analytics, SQL, Python'

In [19]:
job_embedding = sbert.encode([job_text], normalize_embeddings=True)
job_embedding.shape

(1, 384)

In [20]:
def build_student_text(student):
    parts = [
        student.skills or "",
        student.internship_domain or "",
        student.clubs or ""
    ]
    return " ".join(parts).strip()


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

job_matches = []

for student in students:
    student_text = build_student_text(student)
    student_emb = sbert.encode([student_text], normalize_embeddings=True)

    similarity = cosine_similarity(job_embedding, student_emb)[0][0]

    job_matches.append({
        "student_id": student.id,
        "name": student.full_name,
        "job_fit_score": float(similarity)
    })

job_matches


[{'student_id': 5,
  'name': 'Vaishnavi Kamthe',
  'job_fit_score': 0.653712272644043},
 {'student_id': 6, 'name': 'Amit Sharma', 'job_fit_score': 0.2357364147901535},
 {'student_id': 7, 'name': 'Sneha Patil', 'job_fit_score': 0.5401564836502075},
 {'student_id': 8,
  'name': 'Rahul Verma',
  'job_fit_score': -0.027820546180009842}]

In [22]:
import numpy as np

job_scores = np.array([j["job_fit_score"] for j in job_matches])

min_j, max_j = job_scores.min(), job_scores.max()

for j in job_matches:
    j["job_fit_norm"] = (
        (j["job_fit_score"] - min_j) / (max_j - min_j)
        if max_j != min_j else 0.5
    )

job_matches

[{'student_id': 5,
  'name': 'Vaishnavi Kamthe',
  'job_fit_score': 0.653712272644043,
  'job_fit_norm': np.float64(1.0)},
 {'student_id': 6,
  'name': 'Amit Sharma',
  'job_fit_score': 0.2357364147901535,
  'job_fit_norm': np.float64(0.38671206094655325)},
 {'student_id': 7,
  'name': 'Sneha Patil',
  'job_fit_score': 0.5401564836502075,
  'job_fit_norm': np.float64(0.8333817743512782)},
 {'student_id': 8,
  'name': 'Rahul Verma',
  'job_fit_score': -0.027820546180009842,
  'job_fit_norm': np.float64(0.0)}]

In [23]:
final_matches = []

for jm in job_matches:
    student_id = jm["student_id"]

    placement_score = next(
        p["placement_score"] for p in placement_probs
        if p["student_id"] == student_id
    )

    final_score = (
        0.6 * jm["job_fit_norm"] +
        0.4 * placement_score
    )

    final_matches.append({
        "student_id": student_id,
        "name": jm["name"],
        "job_fit_norm": round(float(jm["job_fit_norm"]), 4),
        "placement_score": round(float(placement_score), 4),
        "final_score": round(float(final_score), 4)
    })

sorted(final_matches, key=lambda x: x["final_score"], reverse=True)

[{'student_id': 5,
  'name': 'Vaishnavi Kamthe',
  'job_fit_norm': 1.0,
  'placement_score': 0.895,
  'final_score': 0.958},
 {'student_id': 7,
  'name': 'Sneha Patil',
  'job_fit_norm': 0.8334,
  'placement_score': 1.0,
  'final_score': 0.9},
 {'student_id': 6,
  'name': 'Amit Sharma',
  'job_fit_norm': 0.3867,
  'placement_score': 0.2685,
  'final_score': 0.3394},
 {'student_id': 8,
  'name': 'Rahul Verma',
  'job_fit_norm': 0.0,
  'placement_score': 0.0,
  'final_score': 0.0}]