In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import joblib
import os
import numpy as np


In [18]:
# Input CSV and output model locations.
# The defaults are the original local paths; set COLLEGE_CSV_PATH and/or
# COLLEGE_MODEL_PATH in the environment to run the notebook on another machine
# without editing this cell.
CSV_PATH = os.environ.get(
    "COLLEGE_CSV_PATH",
    r"D:\fastapi\data\New folder\Engineering.csv",  # <-- change filename
)
MODEL_PATH = os.environ.get(
    "COLLEGE_MODEL_PATH",
    r"D:\fastapi\college_predictor.pkl",
)

In [19]:
# Read the source table from CSV_PATH and report how many rows were loaded.
df = pd.read_csv(CSV_PATH)
n_loaded = len(df)
print("Loaded", n_loaded, "rows")


Loaded 64958 rows


In [21]:
# Build synthetic training examples: for every program row, sample a handful of
# student ranks between 1 and RANK_UPPER_FACTOR * closing_rank and label each
# one eligible (1) when it does not exceed the closing rank, else 0.
N_RANK_SAMPLES = 5        # ranks generated per program row
RANK_UPPER_FACTOR = 1.5   # upper end of the sampled range, as a multiple of closing_rank

# Rows whose closing rank is missing or non-numeric cannot be labelled
# meaningfully (they would produce NaN ranks that are all marked ineligible),
# so they are dropped here.
closing_rank = pd.to_numeric(df['closing_rank'], errors='coerce')
has_rank = closing_rank.notna()
programs = df.loc[has_rank, ['program_name', 'category']]
max_rank = closing_rank[has_rank].to_numpy(dtype=float)

# Vectorised replacement for the per-row loop: one evenly spaced grid per
# program, shape (n_programs, N_RANK_SAMPLES).
sample_ranks = np.linspace(1, max_rank * RANK_UPPER_FACTOR, N_RANK_SAMPLES, axis=-1)

# Flatten row-major so the samples of each program stay contiguous and in the
# original row order; program/category values are repeated to match.
synthetic = pd.DataFrame({
    'student_rank': sample_ranks.ravel(),
    'program_name': np.repeat(programs['program_name'].to_numpy(), N_RANK_SAMPLES),
    'category': np.repeat(programs['category'].to_numpy(), N_RANK_SAMPLES),
    'eligible': (sample_ranks <= max_rank[:, None]).ravel().astype('int64'),
})
print("Synthetic training examples:", len(synthetic))

Synthetic training examples: 324790


In [23]:
# Split the synthetic table into model inputs (rank plus the two categorical
# descriptors) and the binary target.
feature_cols = ['student_rank', 'program_name', 'category']
X = synthetic[feature_cols]
y = synthetic['eligible']

In [24]:
# Hold out 20% of the examples for validation; stratify on the label so both
# splits keep the same eligible/ineligible ratio, and fix the seed for
# repeatable splits.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

In [28]:
# Column groups handled by the preprocessing step.
categorical_cols = ['program_name', 'category']
numeric_cols = ['student_rank']

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at predict time) and pass the rank through unchanged.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
preproc = ColumnTransformer(transformers=[
    ('cat', encoder, categorical_cols),
    ('num', 'passthrough', numeric_cols),
])

# Random forest with a fixed seed, using all available cores.
forest = RandomForestClassifier(
    n_estimators=100,    # fewer trees to save memory
    max_depth=12,
    random_state=42,
    n_jobs=-1,
)

# Full pipeline: preprocessing followed by the classifier.
model = Pipeline(steps=[
    ('prep', preproc),
    ('rf', forest),
])


In [30]:
# Train the whole pipeline on the training split, then report the score
# returned by the pipeline on the held-out split.
model.fit(X_train, y_train)
val_accuracy = model.score(X_test, y_test)
print("Validation accuracy:", val_accuracy)

Validation accuracy: 0.7260383632501001


In [11]:
# Persist the fitted pipeline to MODEL_PATH and confirm where it was written.
joblib.dump(value=model, filename=MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")

Model saved to D:\fastapi\college_predictor.pkl


In [33]:
# Point MODEL_PATH at the eligibility-predictor file and save the fitted
# pipeline there; later cells that read MODEL_PATH see this new value.
MODEL_PATH = r"D:\fastapi\college_eligibility_predictor.pkl"
joblib.dump(value=model, filename=MODEL_PATH)
print("Model saved to", MODEL_PATH)

Model saved to D:\fastapi\college_eligibility_predictor.pkl


In [34]:
# Reload the pipeline from the same MODEL_PATH the save cell wrote to, rather
# than repeating the path literal (the two could silently drift apart).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
model = joblib.load(MODEL_PATH)

In [35]:
def recommend_programs(student_rank, program_interest, top_k=3, data=None, predictor=None):
    """Return the top-k programs matching a keyword, ranked by predicted eligibility.

    Parameters
    ----------
    student_rank : number
        The student's rank; used as the ``student_rank`` feature for every
        candidate row.
    program_interest : str
        Keyword matched against ``program_name`` as a case-insensitive literal
        substring (not a regular expression, so inputs such as "C++" are safe).
    top_k : int, default 3
        Maximum number of recommendations to return.
    data : pandas.DataFrame, optional
        Table with ``program_name`` and ``category`` columns. Defaults to the
        notebook-level ``df``.
    predictor : optional
        Fitted classifier exposing ``predict_proba`` and ``classes_``.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    list of dict
        The matching rows of ``data`` as records, each with an added
        ``eligibility_prob`` key, sorted from most to least likely eligible.
        An empty list when nothing matches or ``top_k`` is not positive.
    """
    # Fall back to the notebook globals only when no explicit inputs are given.
    if data is None:
        data = df
    if predictor is None:
        predictor = model

    # Filter dataset by interest keyword (literal match, missing names excluded).
    matches = data['program_name'].str.contains(
        program_interest, case=False, na=False, regex=False
    )
    subset = data[matches].copy()
    if subset.empty or top_k <= 0:
        return []

    # One feature row per candidate program; the scalar rank is broadcast
    # across the subset's index.
    X_new = pd.DataFrame({
        'student_rank': student_rank,
        'program_name': subset['program_name'],
        'category': subset['category'],
    })

    # Probability of the positive class (label 1). Look the column up through
    # classes_ instead of assuming its position.
    proba = predictor.predict_proba(X_new)
    classes = list(predictor.classes_)
    if 1 in classes:
        subset['eligibility_prob'] = proba[:, classes.index(1)]
    else:
        # The predictor never saw a positive example; nothing is predicted eligible.
        subset['eligibility_prob'] = 0.0

    # Stable sort keeps the original row order among equal probabilities.
    ranked = subset.sort_values('eligibility_prob', ascending=False, kind='stable')
    return ranked.head(top_k).to_dict('records')