In [None]:
import pandas as pd
import numpy as np
import sklearn as sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import joblib
import logging
import multiprocessing as mp
import warnings
warnings.filterwarnings('ignore')

# Setup logging: records at INFO level and above are written to
# 'prediction.log' (path relative to the working directory), each line
# prefixed with the time the record was created.
logging.basicConfig(filename='prediction.log', level=logging.INFO,
                    format='%(asctime)s - %(message)s')

## Clean column names (remove spaces, newlines)

In [None]:
# Tidy the header row: trim surrounding whitespace first, then remove any
# newline characters left inside a column name.
df.columns = list(map(lambda raw_name: raw_name.strip().replace('\n', ''), df.columns))
print("Columns:", df.columns.tolist())


## Dynamic university detection (Enhancement 2)

In [None]:
# Detect universities from the entry-test columns, i.e. columns named
# "<University> Score". Matching on the suffix (instead of the substring
# ' Score') keeps derived columns such as "<University> Score_pct" out of the
# list, so re-running this cell after the normalization cell does not list
# every university twice.
universities = [col[:-len(' Score')] for col in df.columns if col.endswith(' Score')]
print("Detected universities:", universities)

## Define max_marks (aligned with cleaned names)

In [None]:
# Maximum obtainable value for every raw marks/score column.
max_marks = {
    # Board examinations
    'Matriculation Marks': 1100,
    'Intermediate Marks': 550,
    # University entry tests
    'UET Score': 400,
    'NUST Score': 200,
    'GIKI Score': 100,
    'PIEAS Score': 100,
    'PUCIT Score': 100,
    'FAST Score': 100,
    'COMSATS Score': 100,
    'ITU Score': 50,
}

# Percentage features shared by every university's model.
feature_cols = [f'{exam}_pct' for exam in ('Matriculation Marks', 'Intermediate Marks')]

# Normalize features

In [None]:
# Convert every known raw column into a 0-100 percentage column named
# "<column>_pct"; columns absent from the dataset are only logged.
for col in max_marks:
    if col not in df.columns:
        logging.warning(f"Column {col} not found in dataset")
        continue
    percentage = df[col] / max_marks[col] * 100
    df[f'{col}_pct'] = np.clip(percentage, 0, 100)
    logging.info(f"Normalized {col} to {col}_pct")

## Verify all expected columns exist

In [None]:
# Fail fast if any model input is missing: the two board-mark percentages plus,
# for every detected university, its normalized test score and its aggregate.
# (This check used to be pasted into two consecutive cells; one run is enough
# because nothing changes the frame in between.)
expected_cols = (
    feature_cols
    + [f'{uni} Score_pct' for uni in universities]
    + [f'{uni} Aggregate' for uni in universities]
)
missing = [col for col in expected_cols if col not in df.columns]
if missing:
    logging.error(f"Missing columns after normalization: {missing}")
    raise ValueError(f"Normalization failed: {missing}")


# Data quality checks (Enhancement 9)

In [None]:

# Program labels for which a low aggregate is expected rather than suspicious.
# 'Not Selected in Any Field' is only relabelled in the next cell, so it has to
# be listed here explicitly; otherwise every rejected applicant with a low
# aggregate is reported as inconsistent.
LOW_AGGREGATE_LABELS = ['Low Competitive Fields', 'Not Selected in Any Field']

for uni in universities:
    score_col = f'{uni} Score'
    program_col = f'Program Selected at {uni}'
    score_cap = max_marks[score_col]

    # Cap raw scores above the test's maximum; the matching percentage is 100.
    invalid_scores = df[df[score_col] > score_cap].index
    if not invalid_scores.empty:
        df.loc[invalid_scores, score_col] = score_cap
        df.loc[invalid_scores, f'{uni} Score_pct'] = 100
        logging.info(f"Capped {len(invalid_scores)} {uni} Scores at {score_cap}")

    # The program column is not guaranteed to exist (the data-preparation cell
    # further down also tolerates its absence), so skip instead of raising.
    if program_col not in df.columns:
        logging.warning(f"Column {program_col} not found; skipping consistency check for {uni}")
        continue

    # An aggregate below 60 combined with a competitive program label is suspicious.
    low_agg_high_field = df[
        (df[f'{uni} Aggregate'] < 60) & ~df[program_col].isin(LOW_AGGREGATE_LABELS)
    ].index
    if not low_agg_high_field.empty:
        logging.warning(f"Inconsistent data for {uni}: {len(low_agg_high_field)} rows")

# Replace 'Not Selected in Any Field'

In [None]:

# Fold the "not selected" label into the 'Low Competitive Fields' class,
# one university at a time.
for uni in universities:
    program_col = f'Program Selected at {uni}'
    not_selected = df[program_col] == 'Not Selected in Any Field'
    df.loc[not_selected, program_col] = 'Low Competitive Fields'

## Unique Programs in Each University

In [3]:
# Build, for every university, the feature matrix and the two targets
# (admission flag and selected program). A university whose columns are
# incomplete is logged and left out of uni_data.
uni_data = {}
for uni in universities:
    admission_col = f'Selected at {uni}?'
    program_col = f'Program Selected at {uni}'
    features = feature_cols + [f'{uni} Score_pct', f'{uni} Aggregate']
    targets = [admission_col, program_col]

    missing_cols = [col for col in features + targets if col not in df.columns]
    if missing_cols:
        logging.error(f"Error: {missing_cols} not found for {uni}")
        continue

    # Work on a copy so later changes to df do not leak into the model inputs.
    uni_df = df[features + targets].copy()
    uni_data[uni] = dict(
        X=uni_df[features],
        y_admission=uni_df[admission_col],
        y_program=uni_df[program_col],
    )
    print(f"{uni} unique programs:", uni_data[uni]['y_program'].nunique())

if len(uni_data) == 0:
    logging.error("No university data prepared—check dataset columns")
    raise ValueError("uni_data is empty")

NameError: name 'universities' is not defined

## Generating Synthetic Data for Zero Admission Cases

In [None]:
def generate_synthetic_zeros(df, uni, n_samples=200, random_state=None):
    """Create synthetic "not admitted" rows for one university.

    Each feature is drawn from two uniform bands: the first half of the rows
    from the lower band, the remaining rows from the upper band. The three
    percentage features use the bands 10-30 and 30-50, the aggregate uses
    20-40 and 40-60. Every row is labelled 0 in ``'Selected at {uni}?'``.

    Parameters
    ----------
    df : any
        Unused; kept so existing call sites keep working.
    uni : str
        University name used to build the score, aggregate and target column
        names.
    n_samples : int, default 200
        Number of rows to generate. Odd values are honoured: the extra row is
        drawn from the upper band (previously it was silently dropped).
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy random state is used, exactly as before, so a preceding
        ``np.random.seed(...)`` still controls the output.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows with the columns 'Matriculation Marks_pct',
        'Intermediate Marks_pct', '{uni} Score_pct', '{uni} Aggregate' and
        'Selected at {uni}?'.
    """
    # np.random (module) and RandomState both expose .uniform(low, high, size).
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_low = n_samples // 2
    n_high = n_samples - n_low

    def two_bands(low_band, high_band):
        # Draw order (lower band first) matches the original implementation so
        # that results under a fixed global seed are unchanged for even sizes.
        return np.concatenate([
            rng.uniform(low_band[0], low_band[1], n_low),
            rng.uniform(high_band[0], high_band[1], n_high),
        ])

    synth_df = pd.DataFrame({
        'Matriculation Marks_pct': two_bands((10, 30), (30, 50)),
        'Intermediate Marks_pct': two_bands((10, 30), (30, 50)),
        f'{uni} Score_pct': two_bands((10, 30), (30, 50)),
        f'{uni} Aggregate': two_bands((20, 40), (40, 60)),
    })
    synth_df[f'Selected at {uni}?'] = 0
    return synth_df

## Training a Machine Learning Model for Admission Predictions

In [None]:
def train_admission_model(uni):
    """Fit and evaluate the admission (yes/no) classifier for one university.

    The rows prepared in ``uni_data[uni]`` are median-imputed if they contain
    NaNs, extended with synthetic rejected applicants from
    ``generate_synthetic_zeros`` and split 80/20 (stratified on the target).
    A logistic regression is fitted on the training part; the 5-fold
    cross-validated F1 is logged and the hold-out accuracy and F1 are printed.

    Parameters
    ----------
    uni : str
        University name; must be a key of ``uni_data``.

    Returns
    -------
    LogisticRegression or None
        The fitted model, or ``None`` (after logging an error) when
        ``uni_data`` has no entry for ``uni``.
    """
    if uni not in uni_data:
        logging.error(f"No data for {uni}")
        return None

    target_col = f'Selected at {uni}?'
    prepared = uni_data[uni]

    real_data = prepared['X'].copy()
    real_data[target_col] = prepared['y_admission']
    if real_data.isnull().values.any():
        # The median fill covers every column of the frame, target included.
        real_data = real_data.fillna(real_data.median())
        logging.info(f"Imputed NaNs for {uni}")

    model_data = pd.concat(
        [real_data, generate_synthetic_zeros(real_data, uni)],
        ignore_index=True,
    )
    y = model_data[target_col]
    X = model_data.drop(columns=[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

    # Cross-validation is run on the full real + synthetic set (X, y),
    # not only on the training split.
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='f1')
    logging.info(f"{uni} Admission CV F1: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    y_pred = model.predict(X_test)
    print(f"{uni} Admission - Accuracy: {accuracy_score(y_test, y_pred):.2f}, F1: {f1_score(y_test, y_pred):.2f}")
    return model

# Train in-process instead of through multiprocessing.Pool. A pool needs its
# worker processes to import the target function from __main__, which is not
# possible for a function defined in a notebook cell when workers are spawned
# (the default start method on Windows and macOS). Anything printed inside a
# worker is also not guaranteed to appear under the cell.
admission_models_list = [train_admission_model(uni) for uni in universities]
admission_models = {uni: model for uni, model in zip(universities, admission_models_list) if model is not None}

# Persist only the models that were actually trained.
for uni, model in admission_models.items():
    joblib.dump(model, f'admission_model_{uni}.pkl')

## Training a Machine Learning Model for Field Predictions

In [None]:
def train_program_model(uni):
    """Fit and evaluate the program (field) classifier for one university.

    A random forest is trained on an 80/20 split of the rows prepared in
    ``uni_data[uni]``. The 5-fold cross-validated accuracy is logged and the
    hold-out accuracy is printed.

    Parameters
    ----------
    uni : str
        University name; must be a key of ``uni_data``.

    Returns
    -------
    RandomForestClassifier or None
        The fitted model, or ``None`` (after logging an error) when
        ``uni_data`` has no entry for ``uni``.
    """
    if uni not in uni_data:
        logging.error(f"No data for {uni}")
        return None
    X = uni_data[uni]['X']
    y = uni_data[uni]['y_program']

    # Same median imputation as train_admission_model, so both models accept
    # the same data. fillna returns a new frame; uni_data is left untouched.
    if X.isnull().values.any():
        X = X.fillna(X.median())
        logging.info(f"Imputed NaNs for {uni}")

    # A stratified split needs at least two rows per class; fall back to a
    # plain random split when some program appears only once.
    stratify = y if y.value_counts().min() >= 2 else None
    if stratify is None:
        logging.warning(f"{uni}: a program class has fewer than 2 rows; splitting without stratification")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=stratify, random_state=42
    )
    model = RandomForestClassifier(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)

    # Cross-validation is run on the full set (X, y), not only the training split.
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    logging.info(f"{uni} Program CV Accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    y_pred = model.predict(X_test)
    print(f"{uni} Program - Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    return model

# Train in-process instead of through multiprocessing.Pool. A pool needs its
# worker processes to import the target function from __main__, which is not
# possible for a function defined in a notebook cell when workers are spawned
# (the default start method on Windows and macOS). Anything printed inside a
# worker is also not guaranteed to appear under the cell.
program_models_list = [train_program_model(uni) for uni in universities]
program_models = {uni: model for uni, model in zip(universities, program_models_list) if model is not None}

# Persist only the models that were actually trained.
for uni, model in program_models.items():
    joblib.dump(model, f'program_model_{uni}.pkl')

## Loading Models and Aggregate Formulas

In [None]:
# Load the persisted models. The training cells only save a model when one was
# built, so a missing file is logged and skipped rather than aborting the load
# for every other university.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files that were produced by this notebook.
admission_models_loaded = {}
program_models_loaded = {}
for uni in universities:
    for prefix, store in (('admission_model', admission_models_loaded),
                          ('program_model', program_models_loaded)):
        model_path = f'{prefix}_{uni}.pkl'
        try:
            store[uni] = joblib.load(model_path)
        except FileNotFoundError:
            logging.warning(f"Model file {model_path} not found; skipping it for {uni}")

# Per-university aggregate formulas. Each lambda receives the matriculation,
# intermediate and entry-test results as percentages (the prediction function
# passes values computed as marks / max * 100) and returns the aggregate that
# is fed to the models as the '<University> Aggregate' feature.
# NOTE(review): the weights are hard-coded; confirm them against each
# university's published admission policy before relying on the output.
aggregate_formulas = {
    'UET': lambda matric_pct, inter_pct, test_pct: (matric_pct * 0.17) + (inter_pct * 0.50) + (test_pct * 0.33),  # Updated UET formula
    'FAST': lambda matric_pct, inter_pct, test_pct: (matric_pct * 0.1) + (inter_pct * 0.4) + (test_pct * 0.5),
    'NUST': lambda matric_pct, inter_pct, test_pct: (matric_pct * 0.1) + (inter_pct * 0.15) + (test_pct * 0.75),
    # GIKI does not use matric_pct and adds a flat 5 points.
    'GIKI': lambda matric_pct, inter_pct, test_pct: (test_pct * 0.85) + (inter_pct * 0.10) + (5),
    'PIEAS': lambda matric_pct, inter_pct, test_pct: (matric_pct * 0.15) + (inter_pct * 0.25) + (test_pct * 0.6),
    # PUCIT: matric_pct * 11 and inter_pct * 5.5 turn the percentages back into
    # raw marks (out of 1100 and 550); the academic part is scaled to 75 points.
    # NOTE(review): the denominator uses 570 although max_marks lists
    # 'Intermediate Marks' as 550 — confirm that 570 is intentional.
    'PUCIT': lambda matric_pct, inter_pct, test_pct: ((0.25 * (matric_pct * 11 + inter_pct * 5.5)) / (0.25 * (1100 + 570)) * 75) + (test_pct * 0.25),
    'COMSATS': lambda matric_pct, inter_pct, test_pct: (matric_pct * 0.1) + (inter_pct * 0.4) + (test_pct * 0.5),
    'ITU': lambda matric_pct, inter_pct, test_pct: (test_pct * 0.5) + (inter_pct * 0.35) + (matric_pct * 0.15)
}


## Defining the Main Function and Validating Inputs

In [None]:
def predict_admission_programs(input_dict):
    """Predict admission and the likely program for every tested university.

    The whole function lives in one cell: a function body cannot continue in
    a following notebook cell, because that cell would start with indented
    code and fail to run.

    Parameters
    ----------
    input_dict : dict
        Must contain 'Matriculation Marks' and 'Intermediate Marks'. For each
        university ``U`` it may contain 'Have You Given U Test' (1 means the
        test was taken) and 'U Score'.

    Returns
    -------
    dict
        Maps university name to a result string. Only universities flagged as
        tested appear. If validation fails, every university in
        ``universities`` maps to the same error message instead.
    """
    required_keys = ('Matriculation Marks', 'Intermediate Marks')

    # --- Validating inputs ---
    for key in required_keys:
        if key not in input_dict:
            logging.error(f"Invalid input: {key} is missing")
            return {uni: f"Error: {key} is required" for uni in universities}

    for key, val in input_dict.items():
        if 'Marks' in key or 'Score' in key:
            max_val = max_marks.get(key, 100)
            try:
                in_range = 0 <= val <= max_val
            except TypeError:
                # Non-numeric values (e.g. strings, None) are invalid input.
                in_range = False
            if not in_range:
                logging.error(f"Invalid input: {key} = {val} is not between 0 and {max_val}")
                return {uni: f"Error: {key} must be between 0 and {max_val}" for uni in universities}

    # Same normalization as the training data: marks / maximum * 100.
    input_normalized = {
        f'{key}_pct': input_dict[key] / max_marks[key] * 100 for key in required_keys
    }
    aggregates = {}
    results = {}  # Only include tested universities

    # --- Processing each university and predicting admissions ---
    for uni in universities:
        if input_dict.get(f'Have You Given {uni} Test', 0) != 1:
            continue  # Only process tested universities

        score_key = f'{uni} Score'
        score_pct_key = f'{uni} Score_pct'
        if score_key not in input_dict:
            results[uni] = f"Error: {uni} Score missing despite test taken"
            aggregates[uni] = None
            continue

        admission_model = admission_models_loaded.get(uni)
        if admission_model is None:
            results[uni] = f"Error: no trained admission model available for {uni}"
            continue

        input_normalized[score_pct_key] = input_dict[score_key] / max_marks[score_key] * 100
        # Universities without their own entry fall back to the UET formula.
        formula = aggregate_formulas.get(uni, aggregate_formulas['UET'])
        aggregates[uni] = formula(input_normalized['Matriculation Marks_pct'],
                                  input_normalized['Intermediate Marks_pct'],
                                  input_normalized[score_pct_key])

        # Column names and order must match the features used for training.
        uni_input = pd.DataFrame([[
            input_normalized['Matriculation Marks_pct'],
            input_normalized['Intermediate Marks_pct'],
            input_normalized[score_pct_key],
            aggregates[uni]
        ]], columns=feature_cols + [score_pct_key, f'{uni} Aggregate'])

        admission_pred = admission_model.predict(uni_input)[0]
        if admission_pred == 0:
            results[uni] = "Selected: No, Programs: None"
        else:
            program_model = program_models_loaded.get(uni)
            if program_model is None:
                results[uni] = "Selected: Yes, Program: unavailable (no trained program model)"
            else:
                program_pred = program_model.predict(uni_input)[0]
                results[uni] = f"Selected: Yes, Program: {program_pred}"
        results[uni] += f"\n     Aggregate: {aggregates[uni]:.2f}"

    logging.info(f"Input: {input_dict}, Results: {results}")
    return results


## Collecting and Validating Basic Student Details

In [None]:
def _prompt_number(prompt, upper, label):
    """Ask until the user enters a number in [0, upper]; return it as float."""
    while True:
        try:
            value = float(input(prompt))
        except ValueError:
            print("Error: Please enter a valid number")
            continue
        if 0 <= value <= upper:
            return value
        print(f"Error: {label} must be between 0 and {upper}")


def _prompt_yes_no(prompt):
    """Ask until the user enters 1 (yes) or 0 (no); return the int."""
    while True:
        try:
            answer = int(input(prompt))
        except ValueError as e:
            print(f"Please enter 1 (Yes) or 0 (No) only - invalid input: {e}")
            continue
        if answer in (0, 1):
            return answer
        print(f"Please enter 1 (Yes) or 0 (No) only - invalid input: {answer}")


def main():
    """Collect a student's marks interactively and print the predictions.

    The whole function lives in one cell: a function body cannot continue in
    a following notebook cell, because that cell would start with indented
    code and fail to run.

    Any exception raised while prompting or predicting is logged with its
    traceback and reported on stdout instead of propagating.
    """
    print("Enter student details:")
    input_dict = {}
    try:
        # --- Collecting and validating basic student details ---
        for label in ('Matriculation Marks', 'Intermediate Marks'):
            upper = max_marks[label]
            input_dict[label] = _prompt_number(f"{label} (out of {upper}): ", upper, label)

        # --- Collecting and validating university test information ---
        for uni in universities:
            flag_key = f'Have You Given {uni} Test'
            input_dict[flag_key] = _prompt_yes_no(
                f"Have you given {uni} test? (Enter 1 for Yes, 0 for No): "
            )
            if input_dict[flag_key] == 1:
                score_key = f'{uni} Score'
                upper = max_marks[score_key]
                input_dict[score_key] = _prompt_number(
                    f"{uni} Score (out of {upper}): ", upper, score_key
                )

        # --- Predicting and displaying admission results ---
        results = predict_admission_programs(input_dict)
        print("\nResults:")
        for uni, result in results.items():
            print(f"{uni}: {result}")
    except Exception as e:
        # Keep the traceback in the log; the printed message alone hides it.
        logging.exception("Prediction run failed")
        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main()


# User Testing

In [None]:
Enter student details:
Matriculation Marks (out of 1100): 990
Intermediate Marks (out of 550): 449
Have you given UET test? (Enter 1 for Yes, 0 for No): 1
UET Score (out of 400): 130
Have you given NUST test? (Enter 1 for Yes, 0 for No): 1
NUST Score (out of 200): 140
Have you given GIKI test? (Enter 1 for Yes, 0 for No): 1
GIKI Score (out of 100): 67
Have you given PIEAS test? (Enter 1 for Yes, 0 for No): 1
PIEAS Score (out of 100): 43
Have you given PUCIT test? (Enter 1 for Yes, 0 for No): 1
PUCIT Score (out of 100): 81
Have you given FAST test? (Enter 1 for Yes, 0 for No): 1
FAST Score (out of 100): 50
Have you given COMSATS test? (Enter 1 for Yes, 0 for No): 1
COMSATS Score (out of 100): 67
Have you given ITU test? (Enter 1 for Yes, 0 for No): 1
ITU Score (out of 50): 32

Results:
UET: Selected: Yes, Program: City and Regional Planning, Environmental Engineering, Geological Engineering, Transportation Engineering
     Aggregate: 66.84
NUST: Selected: Yes, Program: Electrical Engineering, Mechanical Engineering, Aerospace Engineering, Bioinformatics
     Aggregate: 73.75
GIKI: Selected: Yes, Program: Low Competitive Fields
     Aggregate: 70.11
PIEAS: Selected: Yes, Program: Low Competitive Fields
     Aggregate: 59.71
PUCIT: Selected: No, Programs: None
     Aggregate: 84.88
FAST: Selected: Yes, Program: Electrical Engineering, Civil Engineering, BBA, Accounting and Finance, Fintech
     Aggregate: 66.65
COMSATS: Selected: Yes, Program: Electrical Engineering, BBA, Psychology, Accounting and Finance
     Aggregate: 75.15
ITU: Selected: Yes, Program: Electrical Engineering, Economics with Data Science, Civil Engineering, Management Sciences
     Aggregate: 74.07