In [None]:
import pandas as pd
import numpy as np
import sklearn as sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import joblib
import logging
import multiprocessing as mp
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Send INFO-and-above records to prediction.log, each prefixed with a timestamp.
log_settings = {
    'filename': 'prediction.log',
    'level': logging.INFO,
    'format': '%(asctime)s - %(message)s',
}
logging.basicConfig(**log_settings)

## Clean column names (trim surrounding whitespace, remove newlines)

In [None]:
# Tidy every header: trim surrounding whitespace, then delete any newline
# characters remaining inside the name (nothing is inserted in their place).
cleaned_names = []
for raw_name in df.columns:
    cleaned_names.append(raw_name.strip().replace('\n', ''))
df.columns = cleaned_names
print("Columns:", df.columns.tolist())


## Dynamic university detection (Enhancement 2)

In [None]:
# Derive the university names from the raw score columns instead of hard-coding them.
# Only headers that END in ' Score' count: a plain substring test would also match the
# derived '<name> Score_pct' columns, so re-running this cell after the normalization
# cell would list every university twice. The academic '... Marks' columns never carry
# the suffix, so they need no explicit exclusion.
score_suffix = ' Score'
universities = list(dict.fromkeys(  # dict.fromkeys drops repeats, keeps first-seen order
    col[:-len(score_suffix)]
    for col in df.columns
    if isinstance(col, str) and col.endswith(score_suffix)  # skip non-string headers
))
print("Detected universities:", universities)

## Define max_marks (aligned with cleaned names)

In [None]:
# Denominator for each raw column when it is converted to a percentage,
# keyed by the cleaned column name. Academic records come first, followed by
# the per-university score columns; insertion order is preserved.
max_marks = dict([
    ('Matriculation Marks', 1100),
    ('Intermediate Marks', 550),
    ('UET Score', 400),
    ('NUST Score', 200),
    ('GIKI Score', 100),
    ('PIEAS Score', 100),
    ('PUCIT Score', 100),
    ('FAST Score', 100),
    ('COMSATS Score', 100),
    ('ITU Score', 50),
])

# Names of the percentage columns derived from the two academic-record columns.
feature_cols = [
    f'{subject}_pct'
    for subject in ('Matriculation Marks', 'Intermediate Marks')
]

## Normalize features

In [None]:
# Add a '<column>_pct' column for every entry of max_marks that exists in df:
# the raw value as a percentage of its maximum, clamped to the range [0, 100].
# Columns absent from df are skipped with a logged warning.
for raw_col, full_marks in max_marks.items():
    if raw_col not in df.columns:
        logging.warning(f"Column {raw_col} not found in dataset")
        continue
    pct_values = df[raw_col] / full_marks * 100
    df[f'{raw_col}_pct'] = np.clip(pct_values, 0, 100)
    logging.info(f"Normalized {raw_col} to {raw_col}_pct")

## Verify all expected columns exist

In [None]:
# Columns that must exist at this point: the academic percentage columns plus,
# for every detected university, its '<name> Score_pct' and '<name> Aggregate' column.
# NOTE(review): the normalization loop only writes '*_pct' columns, so the
# 'Aggregate' columns presumably come from the input data — confirm.
score_pct_cols = [f'{uni} Score_pct' for uni in universities]
aggregate_cols = [f'{uni} Aggregate' for uni in universities]
expected_cols = feature_cols + score_pct_cols + aggregate_cols

# Collect every expected name that df lacks, keeping the order of expected_cols.
missing = []
for name in expected_cols:
    if name not in df.columns:
        missing.append(name)

# Stop the notebook here rather than let later cells fail on a missing column.
if len(missing) > 0:
    logging.error(f"Missing columns after normalization: {missing}")
    raise ValueError(f"Normalization failed: {missing}")