In [None]:
import pandas as pd
import numpy as np
import sklearn as sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import joblib
import logging
import multiprocessing as mp
import warnings
# Suppress all Python warnings raised from here on.
warnings.filterwarnings('ignore')

# Setup logging: records at INFO level and above are appended to
# prediction.log, each prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    filename='prediction.log',
)

## Clean column names (remove spaces, newlines)

In [None]:
# Trim surrounding whitespace from every column label and drop embedded
# newline characters so later lookups can use the plain names.
df.columns = list(map(lambda name: name.strip().replace('\n', ''), df.columns))
print("Columns:", df.columns.tolist())


## Dynamic university detection (Enhancement 2)

In [None]:
# A university is any column named exactly "<name> Score".  Matching on the
# suffix (instead of a substring test) keeps derived columns such as
# "<name> Score_pct" from being picked up, so re-running this cell after the
# normalization step does not list every university twice.
universities = [col[:-len(' Score')] for col in df.columns if col.endswith(' Score')]
print("Detected universities:", universities)

## Define max_marks (aligned with cleaned names)

In [None]:
# Maximum attainable value for each raw marks/score column; used as the
# divisor when converting raw values to percentages.
max_marks = {
    # Board examinations
    'Matriculation Marks': 1100,
    'Intermediate Marks': 550,
    # University entry tests
    'UET Score': 400,
    'NUST Score': 200,
    'GIKI Score': 100,
    'PIEAS Score': 100,
    'PUCIT Score': 100,
    'FAST Score': 100,
    'COMSATS Score': 100,
    'ITU Score': 50,
}

# Percentage columns shared by every university's feature set.
feature_cols = [
    'Matriculation Marks_pct',
    'Intermediate Marks_pct',
]

## Normalize marks and scores to percentages

In [None]:
# Add a "<column>_pct" column for every configured raw column present in the
# frame: raw value as a percentage of its maximum, clipped to [0, 100].
for column_name, full_marks in max_marks.items():
    if column_name not in df.columns:
        logging.warning(f"Column {column_name} not found in dataset")
        continue
    percentage = df[column_name] / full_marks * 100
    df[f'{column_name}_pct'] = np.clip(percentage, 0, 100)
    logging.info(f"Normalized {column_name} to {column_name}_pct")

## Verify all expected columns exist

In [None]:
# Every university needs its percentage score and its aggregate column, on
# top of the shared feature columns; stop early if any of them is absent.
score_pct_cols = [f'{uni} Score_pct' for uni in universities]
aggregate_cols = [f'{uni} Aggregate' for uni in universities]
expected_cols = feature_cols + score_pct_cols + aggregate_cols
missing = list(filter(lambda name: name not in df.columns, expected_cols))
if len(missing) > 0:
    logging.error(f"Missing columns after normalization: {missing}")
    raise ValueError(f"Normalization failed: {missing}")

## Verify all expected columns exist (repeated check)

In [None]:

# NOTE(review): this cell performs the same check as the cell directly above
# it; one of the two can probably be removed.
# Collect the shared features plus each university's percentage score and
# aggregate column, then fail if any of them is not in the frame.
expected_cols = list(feature_cols)
expected_cols.extend(f'{uni} Score_pct' for uni in universities)
expected_cols.extend(f'{uni} Aggregate' for uni in universities)
missing = [name for name in expected_cols if name not in df.columns]
if missing != []:
    logging.error(f"Missing columns after normalization: {missing}")
    raise ValueError(f"Normalization failed: {missing}")


## Data quality checks (Enhancement 9)

In [None]:

# Per-university sanity checks:
#   1. cap raw scores that exceed the configured maximum (and pin the
#      matching percentage at 100);
#   2. count rows whose aggregate is below 60 although a competitive program
#      is recorded, and log them as inconsistent.
for uni in universities:
    score_col = f'{uni} Score'
    program_col = f'Program Selected at {uni}'

    max_score = max_marks.get(score_col)
    if max_score is None:
        # Without a configured maximum there is nothing to cap against.
        logging.warning(f"No maximum marks configured for {score_col}; skipping score cap")
    else:
        invalid_scores = df.index[df[score_col] > max_score]
        if not invalid_scores.empty:
            df.loc[invalid_scores, score_col] = max_score
            df.loc[invalid_scores, f'{uni} Score_pct'] = 100
            logging.info(f"Capped {len(invalid_scores)} {uni} Scores at {max_score}")

    if program_col not in df.columns:
        # Log and move on instead of raising a bare KeyError, matching how the
        # per-university preparation cell treats missing columns.
        logging.warning(f"Column {program_col} not found; skipping consistency check for {uni}")
        continue

    # 'Not Selected in Any Field' is folded into 'Low Competitive Fields' by
    # the following cell, so a low aggregate with either label is consistent
    # and must not be counted here.
    consistent_labels = ['Low Competitive Fields', 'Not Selected in Any Field']
    low_agg_high_field = df.index[
        (df[f'{uni} Aggregate'] < 60) & ~df[program_col].isin(consistent_labels)
    ]
    if not low_agg_high_field.empty:
        logging.warning(f"Inconsistent data for {uni}: {len(low_agg_high_field)} rows")

## Replace 'Not Selected in Any Field' with 'Low Competitive Fields'

In [None]:

# Relabel 'Not Selected in Any Field' as 'Low Competitive Fields' in each
# university's program column.
for uni in universities:
    program_col = f'Program Selected at {uni}'
    not_selected = df[program_col] == 'Not Selected in Any Field'
    df.loc[not_selected, program_col] = 'Low Competitive Fields'

## Prepare per-university datasets and count unique programs

In [3]:
# Build, for every detected university, the feature matrix and the two target
# series (admission flag and selected program).  Universities with missing
# columns are logged and skipped; an empty result aborts the run.
uni_data = {}
for uni in universities:
    admission_col = f'Selected at {uni}?'
    program_col = f'Program Selected at {uni}'
    features = feature_cols + [f'{uni} Score_pct', f'{uni} Aggregate']
    targets = [admission_col, program_col]
    required_cols = features + targets

    missing_cols = [name for name in required_cols if name not in df.columns]
    if len(missing_cols) > 0:
        logging.error(f"Error: {missing_cols} not found for {uni}")
        continue

    # Work on a copy so the per-university slices are detached from df.
    uni_df = df[required_cols].copy()
    uni_data[uni] = dict(
        X=uni_df[features],
        y_admission=uni_df[admission_col],
        y_program=uni_df[program_col],
    )
    print(f"{uni} unique programs:", uni_data[uni]['y_program'].nunique())

if len(uni_data) == 0:
    logging.error("No university data prepared—check dataset columns")
    raise ValueError("uni_data is empty")

NameError: name 'universities' is not defined

## Generating Synthetic Data for Zero Admission Cases

In [None]:
def generate_synthetic_zeros(df, uni, n_samples=200, random_state=None):
    """Create synthetic "not admitted" rows for one university.

    Each numeric column is drawn from two adjacent uniform bands: the first
    half of the rows from the lower band, the remaining rows from the upper
    band.  The admission label of every row is 0.

    Args:
        df: Unused; kept so existing callers keep working.
        uni: University name used to build the score, aggregate and label
            column names.
        n_samples: Number of rows to generate.  Odd values are honoured: the
            extra row goes to the upper band.
        random_state: Optional seed for reproducible draws.  When ``None``
            the global ``np.random`` state is used, so an earlier
            ``np.random.seed(...)`` call still controls the output.

    Returns:
        DataFrame with columns ``Matriculation Marks_pct``,
        ``Intermediate Marks_pct``, ``<uni> Score_pct``, ``<uni> Aggregate``
        and ``Selected at <uni>?``.
    """
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Split so that the two bands always add up to exactly n_samples rows.
    n_low = n_samples // 2
    n_high = n_samples - n_low

    def _two_band(low_band, high_band):
        # Lower-band draws first, then upper-band draws, as one array.
        return np.concatenate([
            rng.uniform(low_band[0], low_band[1], n_low),
            rng.uniform(high_band[0], high_band[1], n_high),
        ])

    synth_df = pd.DataFrame({
        'Matriculation Marks_pct': _two_band((10, 30), (30, 50)),
        'Intermediate Marks_pct': _two_band((10, 30), (30, 50)),
        f'{uni} Score_pct': _two_band((10, 30), (30, 50)),
        f'{uni} Aggregate': _two_band((20, 40), (40, 60)),
    })
    synth_df[f'Selected at {uni}?'] = 0
    return synth_df

## Training a Machine Learning Model for Admission Predictions

In [None]:
def train_admission_model(uni):
    """Train and evaluate a logistic-regression admission model for one university.

    The real rows stored in ``uni_data[uni]`` are combined with synthetic
    non-admitted rows from ``generate_synthetic_zeros``.  A model is fitted on
    an 80/20 stratified split; the 5-fold cross-validated F1 is logged and the
    hold-out accuracy and F1 are printed.

    Args:
        uni: University name; must be a key of the module-level ``uni_data``.

    Returns:
        The fitted ``LogisticRegression``, or ``None`` when ``uni`` has no
        prepared data or the labels cannot support a stratified split.
    """
    if uni not in uni_data:
        logging.error(f"No data for {uni}")
        return None

    target_col = f'Selected at {uni}?'
    real_data = uni_data[uni]['X'].copy()
    real_data[target_col] = uni_data[uni]['y_admission']

    # A missing label cannot be imputed: a median of 0/1 labels may be 0.5,
    # which is not a valid class.  Drop those rows instead.
    n_before = len(real_data)
    real_data = real_data.dropna(subset=[target_col])
    if len(real_data) < n_before:
        logging.warning(f"Dropped {n_before - len(real_data)} rows without admission label for {uni}")

    # Impute missing feature values only; the target column is left untouched.
    feature_names = [name for name in real_data.columns if name != target_col]
    if real_data[feature_names].isnull().values.any():
        real_data = real_data.fillna(real_data[feature_names].median())
        logging.info(f"Imputed NaNs for {uni}")

    synth_data = generate_synthetic_zeros(real_data, uni)
    model_data = pd.concat([real_data, synth_data], ignore_index=True)

    X = model_data.drop(columns=[target_col])
    y = model_data[target_col]

    # The stratified split needs at least two classes with two members each;
    # report the problem instead of failing inside scikit-learn.
    class_counts = y.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        logging.error(f"Not enough samples per class to train admission model for {uni}")
        return None

    # NOTE(review): synthetic rows end up in the test split and the CV folds
    # as well, so the reported scores describe the augmented data set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='f1')
    logging.info(f"{uni} Admission CV F1: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    y_pred = model.predict(X_test)
    print(f"{uni} Admission - Accuracy: {accuracy_score(y_test, y_pred):.2f}, F1: {f1_score(y_test, y_pred):.2f}")
    return model

# Train one admission model per university concurrently.  Threads are used
# instead of multiprocessing.Pool: worker processes started with the "spawn"
# method cannot import a function defined in a notebook cell and do not see
# the notebook's uni_data, while threads share the kernel's namespace.
admission_models_list = joblib.Parallel(n_jobs=-1, prefer="threads")(
    joblib.delayed(train_admission_model)(uni) for uni in universities
)

# Keep only the universities for which training produced a model.
admission_models = {uni: model for uni, model in zip(universities, admission_models_list) if model is not None}

# Persist each fitted model next to the notebook.
for uni, model in admission_models.items():
    joblib.dump(model, f'admission_model_{uni}.pkl')