In [None]:
# AP EAMCET, TS EAMCET, JEE - Unified EDA, Visualization, and Model Comparison Notebook
# Description: Handles three separate datasets (AP, TS, JEE). Performs cleaning, EDA,
# visualizations, and compares multiple ML models (classification + regression) to
# find the best predictor of admission using ranks and cutoffs.

# -----------------------------------------------------------------------------
# Cell 1: Imports and Config
# -----------------------------------------------------------------------------
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Try to import xgboost (install if missing)
try:
    import xgboost as xgb
    from xgboost import XGBClassifier
except Exception:
    print('xgboost not found. Attempting to install xgboost...')
    try:
        os.system(f'{sys.executable} -m pip install xgboost --quiet')
        import xgboost as xgb
        from xgboost import XGBClassifier
        print('xgboost installed.')
    except Exception as e:
        print('Could not install xgboost:', e)
        XGBClassifier = None

# Notebook display settings
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
plt.rcParams['figure.figsize'] = (10,6)

# File paths (update if needed)
AP_FILE = "/home/me3-lab/Project/data/AP_EAMCET_Cleaned_Merged.csv"
TS_FILE = "/home/me3-lab/Project/data/TSEAMCET_2021_2022_2023_merged_clean.csv"
JEE_FILE = "/home/me3-lab/Project/data/JEE_data.csv"
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -----------------------------------------------------------------------------
# Cell 2: Load Data
# -----------------------------------------------------------------------------
def load_csv(path):
    try:
        return pd.read_csv(path)
    except Exception as e:
        print(f"Error loading {path}: {e}")
        return None

ap_df = load_csv(AP_FILE)
ts_df = load_csv(TS_FILE)
jee_df = load_csv(JEE_FILE)

print('AP loaded:', ap_df is not None, 'TS loaded:', ts_df is not None, 'JEE loaded:', jee_df is not None)

# Quick shape
if ap_df is not None:
    print('AP shape:', ap_df.shape)
if ts_df is not None:
    print('TS shape:', ts_df.shape)
if jee_df is not None:
    print('JEE shape:', jee_df.shape)

# -----------------------------------------------------------------------------
# Cell 3: Build unified cutoff table (one row per college-branch-category)
# -----------------------------------------------------------------------------
records = []

if ap_df is not None:
    cat_cols = [c for c in ap_df.columns if any(k in c.upper() for k in ['BOYS','GIRLS','OC','BC','SC','ST','EWS'])]
    for _, row in ap_df.iterrows():
        for c in cat_cols:
            try:
                cutoff = float(row[c])
            except Exception:
                continue
            records.append({'exam':'AP','college':row.get('NAME_OF_THE_INSTITUTION',row.get('NAME',None)),'branch':row.get('branch_code',row.get('branch',None)),'category':c,'cutoff':cutoff})

if ts_df is not None:
    cat_cols = [c for c in ts_df.columns if any(k in c.lower() for k in ['boys','girls','oc','bc','sc','st','ews'])]
    for _, row in ts_df.iterrows():
        for c in cat_cols:
            try:
                cutoff = float(row[c])
            except Exception:
                continue
            records.append({'exam':'TS','college':row.get('institute_name',row.get('NAME',None)),'branch':row.get('branch',None),'category':c,'cutoff':cutoff})

if jee_df is not None:
    if 'closing_rank' in jee_df.columns:
        for _, row in jee_df.iterrows():
            try:
                cutoff = float(row['closing_rank'])
            except Exception:
                continue
            records.append({'exam':'JEE','college':row.get('institute_short',row.get('institute',None)),'branch':row.get('program_name',row.get('branch',None)),'category':row.get('category',None),'cutoff':cutoff})

cutoffs_df = pd.DataFrame(records)
print('Cutoffs table shape:', cutoffs_df.shape)
cutoffs_df.head()

cutoffs_df.to_csv(os.path.join(OUTPUT_DIR,'cutoffs_unified.csv'), index=False)

# -----------------------------------------------------------------------------
# Cell 4: Dataset analysis summary
# -----------------------------------------------------------------------------
print('\n--- Dataset analysis summary ---')
print('Total cutoff entries:', len(cutoffs_df))
print('Unique exams:', cutoffs_df['exam'].nunique(), '->', cutoffs_df['exam'].unique())
print('Unique colleges:', cutoffs_df['college'].nunique())
print('Unique branches:', cutoffs_df['branch'].nunique())
print('Categories sample:', cutoffs_df['category'].dropna().unique()[:10])
print('\nCutoff rank descriptive stats:')
print(cutoffs_df['cutoff'].describe())

# -----------------------------------------------------------------------------
# Cell 5: Create dataset for ML (synthetic student sampling)
# -----------------------------------------------------------------------------
np.random.seed(42)
student_rows = []
for _, row in cutoffs_df.iterrows():
    cutoff = row['cutoff']
    if cutoff is None or cutoff <= 0 or np.isnan(cutoff):
        continue  # skip invalid cutoffs
    thegeethika,queengeethika,not ur geethika,geethz,lil geet
    sample_ranks = [
        int(max(1, cutoff*0.5 + np.random.randint(-50,50))),
        int(max(1, cutoff*0.8 + np.random.randint(-30,30))),
        int(max(1, cutoff + np.random.randint(-20,20))),
        int(max(1, cutoff*1.1 + np.random.randint(-30,60))),
        int(max(1, cutoff*1.4 + np.random.randint(-50,100)))
    ]
    
    for r in sample_ranks:
        admit = 1 if r <= cutoff else 0
        if np.random.rand() < 0.03:
            admit = 1 - admit
        prob = max(0.0, min(1.0, 1 - (r / (cutoff+1e-6))))  # add epsilon to avoid div/0
        student_rows.append({
            'exam': row['exam'],
            'college': row['college'],
            'branch': row['branch'],
            'category': str(row['category']),
            'rank': r,
            'admit': admit,
            'admit_prob': prob
        })

students = pd.DataFrame(student_rows)
print('Synthetic students shape:', students.shape)
students.head()

students.sample(5).to_csv(os.path.join(OUTPUT_DIR,'students_sample.csv'), index=False)

# -----------------------------------------------------------------------------
# Remaining cells (encoding, model training, evaluation, etc.)
# -----------------------------------------------------------------------------
# [The rest of the notebook remains unchanged from the previous update]


# -----------------------------------------------------------------------------
# Cell 6: Feature encoding and train/test split
# -----------------------------------------------------------------------------
le_exam = LabelEncoder(); students['exam_enc']=le_exam.fit_transform(students['exam'])
le_cat = LabelEncoder(); students['cat_enc']=le_cat.fit_transform(students['category'])
le_branch = LabelEncoder(); students['branch_enc']=le_branch.fit_transform(students['branch'].astype(str))

X = students[['rank','exam_enc','cat_enc','branch_enc']]
y_clf = students['admit']
y_reg = students['admit_prob']

X_train, X_test, y_train_clf, y_test_clf = train_test_split(X, y_clf, test_size=0.2, random_state=42, stratify=y_clf)
X_train_r, X_test_r, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)

scaler = StandardScaler(); X_train_scaled = scaler.fit_transform(X_train); X_test_scaled = scaler.transform(X_test)

# -----------------------------------------------------------------------------
# Cell 7: Define models to test
# -----------------------------------------------------------------------------
models_clf = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'SVM': SVC(probability=True, gamma='scale'),
    'MLP': MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42)
}
if 'XGBClassifier' in globals() and XGBClassifier is not None:
    models_clf['XGBoost'] = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Regression models for probability estimation
models_reg = {
    'Huber': HuberRegressor(),
}

# -----------------------------------------------------------------------------
# Cell 8: Safe Cross-validation & Training for Large Dataset
# -----------------------------------------------------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

# Reduce dataset size for cross-validation to manage memory
cv_sample_size = min(100000, len(X))  # max 100k rows
X_cv_sample, _, y_cv_sample, _ = train_test_split(X, y_clf, train_size=cv_sample_size, random_state=42, stratify=y_clf)

# Scale features for all models if needed
scaler = StandardScaler()
X_cv_scaled = scaler.fit_transform(X_cv_sample)

# Models
models_clf = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=2),
    'MLP': MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42)
}

if 'XGBClassifier' in globals() and XGBClassifier is not None:
    models_clf['XGBoost'] = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                           n_estimators=200, max_depth=6, tree_method='hist',
                                           random_state=42, n_jobs=2)

# Train and evaluate classification models
for name, model in models_clf.items():
    print(f'\nTraining and cross-validating: {name}')
    
    # Cross-validation on sample
    try:
        acc_scores = cross_val_score(model, X_cv_sample, y_cv_sample, cv=cv, scoring='accuracy', n_jobs=2)
        try:
            roc_scores = cross_val_score(model, X_cv_sample, y_cv_sample, cv=cv, scoring='roc_auc', n_jobs=2)
        except Exception:
            roc_scores = np.array([np.nan]*len(acc_scores))
        print(f'CV Accuracy mean: {acc_scores.mean():.4f} ± {acc_scores.std():.4f}')
        print(f'CV ROC-AUC mean: {np.nanmean(roc_scores):.4f} ± {np.nanstd(roc_scores):.4f}')
    except Exception as e:
        print('CV failed:', e)
        acc_scores, roc_scores = [np.nan],[np.nan]
    
    # Fit on full training set carefully
    try:
        model.fit(X_train, y_train_clf)
        preds = model.predict(X_test)
        probs = model.predict_proba(X_test)[:,1] if hasattr(model,'predict_proba') else model.decision_function(X_test)
        acc = accuracy_score(y_test_clf, preds)
        prec = precision_score(y_test_clf, preds)
        rec = recall_score(y_test_clf, preds)
        f1 = f1_score(y_test_clf, preds)
        roc = roc_auc_score(y_test_clf, probs)
        results[name] = {'cv_acc_mean':acc_scores.mean(),'cv_acc_std':acc_scores.std(),
                         'cv_roc_mean':np.nanmean(roc_scores),'test_acc':acc,'test_prec':prec,
                         'test_rec':rec,'test_f1':f1,'test_roc':roc}
        print(f'Test Acc: {acc:.4f}, Prec: {prec:.4f}, Rec: {rec:.4f}, F1: {f1:.4f}, ROC: {roc:.4f}')
    except Exception as e:
        print(f"Error fitting {name} on full dataset:", e)
        results[name] = {'cv_acc_mean':acc_scores.mean(),'cv_acc_std':acc_scores.std(),
                         'cv_roc_mean':np.nanmean(roc_scores),'test_acc':np.nan,'test_prec':np.nan,
                         'test_rec':np.nan,'test_f1':np.nan,'test_roc':np.nan}

# -----------------------------------------------------------------------------
# Regression model (Huber) training safely
# -----------------------------------------------------------------------------
print('\nTraining regression models (probability estimation):')
X_cv_sample_r, _, y_cv_sample_reg, _ = train_test_split(X, y_reg, train_size=cv_sample_size, random_state=42)
scaler_cv = StandardScaler(); X_cv_sample_scaled = scaler_cv.fit_transform(X_cv_sample_r)

models_reg = {'Huber': HuberRegressor()}

for name, model in models_reg.items():
    print('Model:', name)
    try:
        cv_mse = cross_val_score(model, X_cv_sample_scaled, y_cv_sample_reg, cv=5,
                                 scoring='neg_mean_squared_error', n_jobs=2)
        print('CV MSE mean (sample):', -cv_mse.mean())
    except Exception as e:
        print('CV failed:', e)
    try:
        model.fit(X_train_scaled, y_train_reg)
        preds = model.predict(X_test_scaled)
        mse = mean_squared_error(y_test_reg, preds)
        r2 = r2_score(y_test_reg, preds)
        results[name] = {'test_mse':mse,'test_r2':r2}
        print(f'Test MSE: {mse:.6f}, R2: {r2:.4f}')
    except Exception as e:
        print(f"Error fitting {name} on full dataset:", e)
        results[name] = {'test_mse':np.nan,'test_r2':np.nan}

# Save results
res_df = pd.DataFrame(results).T
res_df.to_csv(os.path.join(OUTPUT_DIR,'model_comparison_results.csv'))
print('Model comparison saved to outputs/model_comparison_results.csv')

# -----------------------------------------------------------------------------
# Cell 9: Best model selection & save (safe version)
# -----------------------------------------------------------------------------
best_model_name = None
best_roc = -1

# Classification models first
for n, r in results.items():
    if n in models_clf:
        roc = r.get('test_roc', -1)
        if roc is None: continue
        try:
            if roc > best_roc:
                best_roc = roc
                best_model_name = n
        except Exception: continue

# Regression models if no classifier performed well
if best_model_name is None or best_roc <= 0:
    best_r2 = -1
    for n, r in results.items():
        if n in models_reg:
            r2 = r.get('test_r2', -1)
            if r2 is None: continue
            try:
                if r2 > best_r2:
                    best_r2 = r2
                    best_model_name = n
                    best_roc = r2
            except Exception: continue

print('\nBest model by test ROC/R2:', best_model_name, 'score:', best_roc)

# Persist best model
import joblib
if best_model_name in models_clf and not np.isnan(results.get(best_model_name, {}).get('test_acc', np.nan)):
    joblib.dump(models_clf[best_model_name], os.path.join(OUTPUT_DIR, f'model_best_{best_model_name}.joblib'))
elif best_model_name in models_reg and not np.isnan(results.get(best_model_name, {}).get('test_mse', np.nan)):
    joblib.dump(models_reg[best_model_name], os.path.join(OUTPUT_DIR, f'model_best_{best_model_name}.joblib'))
else:
    print(f"Could not save best model '{best_model_name}' as it either didn't perform well or fitting failed.")

# -----------------------------------------------------------------------------
# Cell 9: Model usage helper
# -----------------------------------------------------------------------------
# Function to load best model and predict admission + probability

def load_best_and_predict(rank, exam, category, branch):
    # load encoders
    ex_enc = le_exam.transform([exam])[0] if exam in le_exam.classes_ else 0
    cat_enc = le_cat.transform([category])[0] if category in le_cat.classes_ else 0
    br_enc = le_branch.transform([branch])[0] if branch in le_branch.classes_ else 0
    sample = pd.DataFrame([[rank, ex_enc, cat_enc, br_enc]], columns=['rank','exam_enc','cat_enc','branch_enc'])
    # load best model file
    if best_model_name is None:
        print('No best model determined.')
        return None
    path = os.path.join(OUTPUT_DIR, f'model_best_{best_model_name}.joblib')
    if not os.path.exists(path):
        print('Best model file not found:', path)
        return None
    model = joblib.load(path)
    # if classifier
    if hasattr(model, 'predict_proba'):
        pred = model.predict(sample)[0]
        prob = float(model.predict_proba(sample)[:,1][0])
        return {'admit':bool(pred),'probability':prob,'model':best_model_name}
    else:
        # assume regression
        sample_scaled = scaler.transform(sample)
        prob = float(model.predict(sample_scaled)[0])
        return {'admit': bool(prob>=0.5),'probability':prob,'model':best_model_name}

# -----------------------------------------------------------------------------
# Cell 10: Next steps & recommendations
# -----------------------------------------------------------------------------
print('\nNext steps:')
print('- Inspect outputs/model_comparison_results.csv to see model metrics side-by-side.')
print('- If XGBoost was installed, it is included in the comparison and often performs best on tabular data.')
print('- IMPORTANT: current training data is synthetic (sampled around cutoffs) → get real student admission outcomes to validate models properly.')
print('- After selecting best model, integrate into your web app (Streamlit/Flask) with encoders saved (joblib).')
print('- Optionally run feature importance (for tree models) to discover which features matter most.')
    

AP loaded: True TS loaded: True JEE loaded: True
AP shape: (3078, 31)
TS shape: (2986, 30)
JEE shape: (64958, 14)
Cutoffs table shape: (180174, 5)

--- Dataset analysis summary ---
Total cutoff entries: 180174
Unique exams: 3 -> ['AP' 'TS' 'JEE']
Unique colleges: 946
Unique branches: 218
Categories sample: ['ESTD' 'OC_BOYS' 'OC_GIRLS' 'SC_BOYS' 'SC_GIRLS' 'ST_BOYS' 'ST_GIRLS'
 'BCA_BOYS' 'BCA_GIRLS' 'BCB_BOYS']

Cutoff rank descriptive stats:
count    1.801740e+05
mean     6.646587e+04
std      6.204989e+04
min      0.000000e+00
25%      4.081000e+03
50%      5.600900e+04
75%      1.198945e+05
max      1.144790e+06
Name: cutoff, dtype: float64
Synthetic students shape: (900865, 7)

Training and cross-validating: LogisticRegression
CV Accuracy mean: 0.5895 ± 0.0046
CV ROC-AUC mean: 0.6156 ± 0.0040
Test Acc: 0.5924, Prec: 0.5824, Rec: 0.6816, F1: 0.6281, ROC: 0.6183

Training and cross-validating: RandomForest




CV Accuracy mean: 0.6766 ± 0.0035
CV ROC-AUC mean: 0.7515 ± 0.0021
Test Acc: 0.6974, Prec: 0.6997, Rec: 0.7019, F1: 0.7008, ROC: 0.7847

Training and cross-validating: MLP
CV Accuracy mean: 0.6077 ± 0.0209
CV ROC-AUC mean: 0.6533 ± 0.0152
Test Acc: 0.6464, Prec: 0.6327, Rec: 0.7147, F1: 0.6712, ROC: 0.7072

Training and cross-validating: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV Accuracy mean: 0.6979 ± 0.0023
CV ROC-AUC mean: 0.7761 ± 0.0009


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Acc: 0.7093, Prec: 0.6897, Rec: 0.7715, F1: 0.7283, ROC: 0.7888

Training regression models (probability estimation):
Model: Huber
CV MSE mean (sample): 0.040114879809406125
Test MSE: 0.043959, R2: -0.0813
Model comparison saved to outputs/model_comparison_results.csv

Best model by test ROC/R2: XGBoost score: 0.788782283338432

Next steps:
- Inspect outputs/model_comparison_results.csv to see model metrics side-by-side.
- If XGBoost was installed, it is included in the comparison and often performs best on tabular data.
- IMPORTANT: current training data is synthetic (sampled around cutoffs) → get real student admission outcomes to validate models properly.
- After selecting best model, integrate into your web app (Streamlit/Flask) with encoders saved (joblib).
- Optionally run feature importance (for tree models) to discover which features matter most.


In [2]:
# =============================================================================
# Admission Prediction Notebook: XGBoost + Ensemble + Rank_ratio
# =============================================================================

# --- Cell 1: Imports ---
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, mean_squared_error, r2_score
)
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import joblib

# --- Cell 2: File paths (update with your real dataset paths) ---
AP_FILE = "/home/me3-lab/Project/data/AP_EAMCET_Cleaned_Merged.csv"
TS_FILE = "/home/me3-lab/Project/data/TSEAMCET_2021_2022_2023_merged_clean.csv"
JEE_FILE = "/home/me3-lab/Project/data/JEE_data.csv"
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Cell 3: Load Data ---
def load_csv(path):
    try:
        return pd.read_csv(path)
    except:
        return None

ap_df = load_csv(AP_FILE)
ts_df = load_csv(TS_FILE)
jee_df = load_csv(JEE_FILE)

# --- Cell 4: Build cutoff table ---
records = []

if ap_df is not None:
    cat_cols = [c for c in ap_df.columns if any(k in c.upper() for k in ['BOYS','GIRLS','OC','BC','SC','ST','EWS'])]
    for _, row in ap_df.iterrows():
        for c in cat_cols:
            try: cutoff = float(row[c])
            except: continue
            records.append({
                'exam':'AP','college':row.get('NAME_OF_THE_INSTITUTION',None),
                'branch':row.get('branch_code',row.get('branch',None)),
                'category':c,'cutoff':cutoff
            })

if ts_df is not None:
    cat_cols = [c for c in ts_df.columns if any(k in c.lower() for k in ['boys','girls','oc','bc','sc','st','ews'])]
    for _, row in ts_df.iterrows():
        for c in cat_cols:
            try: cutoff = float(row[c])
            except: continue
            records.append({
                'exam':'TS','college':row.get('institute_name',None),
                'branch':row.get('branch',None),'category':c,'cutoff':cutoff
            })

if jee_df is not None and 'closing_rank' in jee_df.columns:
    for _, row in jee_df.iterrows():
        try: cutoff = float(row['closing_rank'])
        except: continue
        records.append({
            'exam':'JEE','college':row.get('institute_short',None),
            'branch':row.get('program_name',None),
            'category':row.get('category',None),'cutoff':cutoff
        })

cutoffs_df = pd.DataFrame(records)

# --- Cell 5: Synthetic student dataset ---
np.random.seed(42)
student_rows = []
for _, row in cutoffs_df.iterrows():
    cutoff = row['cutoff']
    if cutoff <= 0 or np.isnan(cutoff): continue
    sample_ranks = [
        int(max(1, cutoff*0.5 + np.random.randint(-50,50))),
        int(max(1, cutoff*0.8 + np.random.randint(-30,30))),
        int(max(1, cutoff + np.random.randint(-20,20))),
        int(max(1, cutoff*1.1 + np.random.randint(-30,60))),
        int(max(1, cutoff*1.4 + np.random.randint(-50,100)))
    ]
    for r in sample_ranks:
        admit = 1 if r <= cutoff else 0
        if np.random.rand() < 0.03: admit = 1 - admit
        student_rows.append({
            'exam': row['exam'],'college': row['college'],'branch': row['branch'],
            'category': str(row['category']),'rank': r,'admit': admit,
            'rank_ratio': r/cutoff
        })

students = pd.DataFrame(student_rows)

# --- Cell 6: Encode features ---
le_exam = LabelEncoder(); students['exam_enc']=le_exam.fit_transform(students['exam'])
le_cat = LabelEncoder(); students['cat_enc']=le_cat.fit_transform(students['category'])
le_branch = LabelEncoder(); students['branch_enc']=le_branch.fit_transform(students['branch'].astype(str))

X = students[['rank_ratio','exam_enc','cat_enc','branch_enc']]
y = students['admit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Cell 7: Scale numeric features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Cell 8: Define models ---
xgb_model = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss', random_state=42
)

rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
log_model = LogisticRegression(max_iter=1000)

stack_model = StackingClassifier(
    estimators=[('xgb', xgb_model), ('rf', rf_model), ('log', log_model)],
    final_estimator=LogisticRegression(),
    cv=5, n_jobs=2, passthrough=True
)

# --- Cell 9: Train models ---
print("Training XGBoost...")
xgb_model.fit(X_train, y_train)

print("Training Stacking Ensemble...")
stack_model.fit(X_train, y_train)

# --- Cell 10: Evaluation ---
def evaluate_model(model, X_test, y_test, name="Model"):
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:,1]
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    roc = roc_auc_score(y_test, probs)
    mse = mean_squared_error(y_test, probs)
    r2 = r2_score(y_test, probs)
    print(f"\n{name} Metrics:")
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, "
          f"F1: {f1:.4f}, ROC-AUC: {roc:.4f}, MSE: {mse:.4f}, R²: {r2:.4f}")
    return {'accuracy': acc, 'precision': prec, 'recall': rec,
            'f1': f1, 'roc_auc': roc, 'mse': mse, 'r2': r2}

xgb_metrics = evaluate_model(xgb_model, X_test, y_test, "XGBoost")
stack_metrics = evaluate_model(stack_model, X_test, y_test, "Stacking Ensemble")

# --- Cell 11: Save best model ---
best_model = xgb_model if xgb_metrics['roc_auc'] >= stack_metrics['roc_auc'] else stack_model
joblib.dump(best_model, os.path.join(OUTPUT_DIR,'best_admission_model.joblib'))
print("\nBest model saved:", best_model)

# --- Cell 12: Prediction function ---
def predict_admission(rank, exam, category, branch):
    try: exam_enc = le_exam.transform([exam])[0]
    except: exam_enc = 0
    try: cat_enc = le_cat.transform([category])[0]
    except: cat_enc = 0
    try: branch_enc = le_branch.transform([branch])[0]
    except: branch_enc = 0

    subset = cutoffs_df[(cutoffs_df.exam==exam)&(cutoffs_df.branch==branch)&(cutoffs_df.category==category)]
    cutoff = subset['cutoff'].values[0] if len(subset)>0 else rank
    rank_ratio = rank / cutoff

    sample = pd.DataFrame([[rank_ratio, exam_enc, cat_enc, branch_enc]],
                          columns=['rank_ratio','exam_enc','cat_enc','branch_enc'])
    prob = best_model.predict_proba(sample)[:,1][0]
    admit = 1 if prob >= 0.5 else 0
    return {'admit': bool(admit), 'probability': prob}


Training XGBoost...
Training Stacking Ensemble...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



XGBoost Metrics:
Accuracy: 0.9672, Precision: 0.9662, Recall: 0.9689, F1: 0.9675, ROC-AUC: 0.9698, MSE: 0.0307, R²: 0.8774

Stacking Ensemble Metrics:
Accuracy: 0.9698, Precision: 0.9700, Recall: 0.9703, F1: 0.9701, ROC-AUC: 0.9698, MSE: 0.0293, R²: 0.8827

Best model saved: StackingClassifier(cv=5,
                   estimators=[('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.8, device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric='logloss',
                                              feature_types=None,
                   

In [3]:
import joblib, os

# Save encoders & scaler
joblib.dump(le_exam, os.path.join(OUTPUT_DIR, "le_exam.joblib"))
joblib.dump(le_cat, os.path.join(OUTPUT_DIR, "le_cat.joblib"))
joblib.dump(le_branch, os.path.join(OUTPUT_DIR, "le_branch.joblib"))
joblib.dump(scaler, os.path.join(OUTPUT_DIR, "scaler.joblib"))

print("Encoders and scaler saved to outputs/")


Encoders and scaler saved to outputs/


In [8]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Create LabelEncoder for college
le_college = LabelEncoder()
students['college_enc'] = le_college.fit_transform(students['college'].astype(str))

# Update X to include college_enc
X = students[['rank_ratio','exam_enc','cat_enc','branch_enc','college_enc']]

# Save the encoder
joblib.dump(le_college, 'outputs/le_college.joblib')


['outputs/le_college.joblib']

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

y = students['admit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Optionally scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train stacking ensemble
xgb_model = XGBClassifier(n_estimators=300, max_depth=6, eval_metric='logloss', random_state=42)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
log_model = LogisticRegression(max_iter=1000)

stack_model = StackingClassifier(
    estimators=[('xgb', xgb_model), ('rf', rf_model), ('log', log_model)],
    final_estimator=LogisticRegression(),
    cv=5, n_jobs=2, passthrough=True
)

stack_model.fit(X_train, y_train)

# Save best model
joblib.dump(stack_model, 'outputs/best_admission_model.joblib')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['outputs/best_admission_model.joblib']

In [10]:
import os

os.makedirs('outputs', exist_ok=True)  # make sure folder exists
cutoffs_df.to_csv('outputs/cutoffs.csv', index=False)


In [None]:
# =============================================================================
# Admission Prediction Notebook: Corrected Version
# =============================================================================

import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import resample
import joblib

# --- Directories ---
DATA_DIR = "/home/me3-lab/Project/data"
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Load datasets ---
ap_df = pd.read_csv(os.path.join(DATA_DIR, "/home/me3-lab/Project/data/AP_EAMCET_Cleaned_Merged.csv"))
ts_df = pd.read_csv(os.path.join(DATA_DIR, "/home/me3-lab/Project/data/TSEAMCET_2021_2022_2023_merged_clean.csv"))
jee_df = pd.read_csv(os.path.join(DATA_DIR, "/home/me3-lab/Project/data/JEE_data.csv"))

# --- Build cutoff table ---
records = []

def add_cutoffs(df, exam_col_name, college_col_name, branch_col_name, cat_cols, exam_name):
    for _, row in df.iterrows():
        for c in cat_cols:
            try:
                cutoff = float(row[c])
            except:
                continue
            records.append({
                'exam': exam_name,
                'college': row.get(college_col_name),
                'branch': row.get(branch_col_name),
                'category': c,
                'cutoff': cutoff
            })

# AP
cat_cols = [c for c in ap_df.columns if any(k in c.upper() for k in ['BOYS','GIRLS','OC','BC','SC','ST','EWS'])]
add_cutoffs(ap_df, 'exam','NAME_OF_THE_INSTITUTION','branch',cat_cols,'AP')

# TS
cat_cols = [c for c in ts_df.columns if any(k in c.lower() for k in ['boys','girls','oc','bc','sc','st','ews'])]
add_cutoffs(ts_df, 'exam','institute_name','branch',cat_cols,'TS')

# JEE
if 'closing_rank' in jee_df.columns:
    for _, row in jee_df.iterrows():
        try: cutoff = float(row['closing_rank'])
        except: continue
        records.append({
            'exam':'JEE',
            'college':row.get('institute_short', None),
            'branch':row.get('program_name', None),
            'category':row.get('category', None),
            'cutoff': cutoff
        })

cutoffs_df = pd.DataFrame(records)
cutoffs_df.to_csv(os.path.join(OUTPUT_DIR,'cutoffs.csv'), index=False)

# --- Generate synthetic student data for training ---
student_rows = []
np.random.seed(42)
for _, row in cutoffs_df.iterrows():
    cutoff = row['cutoff']
    if np.isnan(cutoff) or cutoff <= 0: continue
    # generate multiple ranks around cutoff
    sample_ranks = [
        int(max(1, cutoff*0.5 + np.random.randint(-50,50))),
        int(max(1, cutoff*0.8 + np.random.randint(-30,30))),
        int(max(1, cutoff + np.random.randint(-20,20))),
        int(max(1, cutoff*1.1 + np.random.randint(-30,60))),
        int(max(1, cutoff*1.4 + np.random.randint(-50,100)))
    ]
    for r in sample_ranks:
        admit = 1 if r <= cutoff else 0
        if np.random.rand() < 0.03: admit = 1 - admit
        student_rows.append({
            'exam': row['exam'],
            'college': row['college'],
            'branch': row['branch'],
            'category': str(row['category']),
            'rank': r,
            'admit': admit,
            'rank_ratio': r/cutoff
        })

students = pd.DataFrame(student_rows)

# --- Encode categorical features ---
le_exam = LabelEncoder(); students['exam_enc'] = le_exam.fit_transform(students['exam'].astype(str))
le_cat = LabelEncoder(); students['cat_enc'] = le_cat.fit_transform(students['category'].astype(str))
le_branch = LabelEncoder(); students['branch_enc'] = le_branch.fit_transform(students['branch'].astype(str))
le_college = LabelEncoder(); students['college_enc'] = le_college.fit_transform(students['college'].astype(str))

X = students[['rank_ratio','exam_enc','cat_enc','branch_enc','college_enc']]
y = students['admit']

# --- Balance classes ---
df1 = students[students.admit==1]
df0 = students[students.admit==0]
df0_up = resample(df0, replace=True, n_samples=len(df1), random_state=42)
students_balanced = pd.concat([df1, df0_up])
X = students_balanced[['rank_ratio','exam_enc','cat_enc','branch_enc','college_enc']]
y = students_balanced['admit']

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- Scale rank_ratio only (optional) ---
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled['rank_ratio'] = scaler.fit_transform(X_train[['rank_ratio']])
X_test_scaled['rank_ratio'] = scaler.transform(X_test[['rank_ratio']])

# --- Define models ---
xgb_model = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='logloss', random_state=42
)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
log_model = LogisticRegression(max_iter=1000)

stack_model = StackingClassifier(
    estimators=[('xgb', xgb_model), ('rf', rf_model), ('log', log_model)],
    final_estimator=LogisticRegression(),
    cv=5, passthrough=True
)

# --- Calibrate probabilities ---
stack_model = CalibratedClassifierCV(stack_model, cv=5)
stack_model.fit(X_train_scaled, y_train)

# --- Evaluate ---
def evaluate_model(model, X_test, y_test):
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:,1]
    acc = accuracy_score(y_test, preds)
    roc = roc_auc_score(y_test, probs)
    print(f"Accuracy: {acc:.4f}, ROC-AUC: {roc:.4f}")

evaluate_model(stack_model, X_test_scaled, y_test)

# --- Save model and encoders ---
joblib.dump(stack_model, os.path.join(OUTPUT_DIR,'best_admission_model.joblib'))
joblib.dump(le_exam, os.path.join(OUTPUT_DIR,'le_exam.joblib'))
joblib.dump(le_cat, os.path.join(OUTPUT_DIR,'le_cat.joblib'))
joblib.dump(le_branch, os.path.join(OUTPUT_DIR,'le_branch.joblib'))
joblib.dump(le_college, os.path.join(OUTPUT_DIR,'le_college.joblib'))

print("Training complete. All outputs saved in 'outputs/' folder.")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca