In [9]:
import pandas as pd
import os
import re

In [10]:
# 1. Process degree_streams.csv to create a degree-to-stream map
degree_map_df = pd.read_csv('streams/degree_streams.csv')
degree_map_df.rename(columns={'Stream': 'stream', 'Degree': 'degree_name', 'Universities': 'university'}, inplace=True)

# Clean the stream names and convert degree names to uppercase
degree_map_df['stream'] = degree_map_df['stream'].str.replace(' Stream', '').str.lower()
degree_map_df['degree_name'] = degree_map_df['degree_name'].str.upper().str.strip()

print("Processed Degree-to-Stream Map :")
degree_map_df.head()

Processed Degree-to-Stream Map :


Unnamed: 0,stream,degree_name
0,arts,ARTS
1,arts,ARTS (SP)
2,arts,ARTS (SAB)
3,arts,COMMUNICATION STUDIES
4,arts,PEACE AND CONFLICT RESOLUTION


In [11]:
# 2. Process the performance data from the 'perfs' directory
def process_perfs_data(file_path):
    df = pd.read_csv(file_path)
    file_name = os.path.basename(file_path)
    parts = file_name.replace('.csv', '').split('_')
    df['stream'] = ' '.join(parts[1:-1])
    df['year'] = int(parts[-1])
    return df

perfs_files = [os.path.join('perfs', f) for f in os.listdir('perfs') if f.endswith('.csv')]
all_perfs_data = pd.concat([process_perfs_data(f) for f in perfs_files], ignore_index=True)
all_perfs_data.rename(columns={'No. Sat': 'students_sat', 'Eligible for University Entrance': 'students_passed'}, inplace=True)
all_perfs_data = all_perfs_data[['District', 'students_sat', 'students_passed', 'stream', 'year']]
all_perfs_data = all_perfs_data[all_perfs_data['District'].str.lower() != 'all island'].copy()

print("Processed Performance Data Head:")
all_perfs_data.head()

Processed Performance Data Head:


Unnamed: 0,District,students_sat,students_passed,stream,year
0,Batticaloa,898,654,commerce,2021
1,Jaffna,949,685,commerce,2021
2,Puttalam,1654,1178,commerce,2021
3,Monaragala,1210,852,commerce,2021
4,Kurunegala,3688,2583,commerce,2021


In [12]:
# 3. Process the z-score cutoff data from the 'cops' directory
def process_cops_data(file_path):
    df = pd.read_csv(file_path)
    df.rename(columns={df.columns[0]: 'District'}, inplace=True)
    year_match = re.search(r'(\d{4})', file_path)
    if year_match:
        df['year'] = int(year_match.group(1))
    else:
        raise ValueError(f"Could not extract year from {file_path}")
    return df.melt(id_vars=['District', 'year'], var_name='degree', value_name='z-score_cutoff')

cops_files = [os.path.join('cops', f) for f in os.listdir('cops') if f.startswith('cop_') and f.endswith('.csv')]
all_cops_data = pd.concat([process_cops_data(f) for f in cops_files], ignore_index=True)

print("Processed Z-Score Data Head:")
all_cops_data.head()

Processed Z-Score Data Head:


Unnamed: 0,District,year,degree,z-score_cutoff
0,COLOMBO,2019,MEDICINE University of Colombo,2.4546
1,GAMPAHA,2019,MEDICINE University of Colombo,2.501
2,KALUTARA,2019,MEDICINE University of Colombo,2.5199
3,MATALE,2019,MEDICINE University of Colombo,2.5115
4,KANDY,2019,MEDICINE University of Colombo,2.5805


In [13]:
# 4. Add the 'stream' to the z-score data with robust name extraction

def extract_degree_name_from_string(degree_string):
    words = str(degree_string).split()
    degree_name_parts = []
    for word in words:
        # A word is part of the degree name if it is all uppercase.
        # Also allow ampersands and hyphens.
        if word.isupper() or word in ['&', '-']:
            degree_name_parts.append(word)
        else:
            # The first word not in all caps marks the start of the university name.
            break
    return ' '.join(degree_name_parts).strip()

all_cops_data['degree_name'] = all_cops_data['degree'].apply(extract_degree_name_from_string)

# The degree_map_df already has the 'degree_name' in uppercase
stream_map = degree_map_df[['degree_name', 'stream']].drop_duplicates()

all_cops_data_with_stream = pd.merge(all_cops_data, stream_map, on='degree_name', how='left')

print(f"Rows in cops data: {len(all_cops_data)}")
print(f"Rows with a matched stream: {all_cops_data_with_stream['stream'].notna().sum()}")
print(f"Rows MISSING a stream: {all_cops_data_with_stream['stream'].isna().sum()}")

if all_cops_data_with_stream['stream'].isna().any():
    unmatched = all_cops_data_with_stream[all_cops_data_with_stream['stream'].isna()]['degree'].value_counts()
    print("Top 5 Unmatched Degrees:")
    print(unmatched.head())

Rows in cops data: 25450
Rows with a matched stream: 22225
Rows MISSING a stream: 3225
Top 5 Unmatched Degrees:
degree
PEACE & CONFLICT RESOLUTION  University of Kelaniya                     125
TOWN & COUNTRY PLANNING University of Moratuwa                          125
AGRI BUSINESS MANAGEMENT University of Ruhuna                           125
MARINE AND FRESHWATER SCIENCES University of Ruhuna                     125
COMPUTING & INFORMATION SYSTEMS Sabaragamuwa University of Sri Lanka    125
Name: count, dtype: int64


In [14]:
# 5. Perform the final merge and save the dataset

final_cops_data = all_cops_data_with_stream.dropna(subset=['stream']).copy()

# Standardize the 'District' column in both dataframes
final_cops_data['District'] = final_cops_data['District'].str.strip().str.lower()
all_perfs_data['District'] = all_perfs_data['District'].str.strip().str.lower()

# Merge on the correct composite key
merged_df = pd.merge(final_cops_data, all_perfs_data, on=['District', 'year', 'stream'])

# Clean the z-score data
final_df = merged_df[merged_df['z-score_cutoff'] != 'NQC'].copy()
final_df['z-score_cutoff'] = pd.to_numeric(final_df['z-score_cutoff'])

# Select and reorder the final columns
final_df = final_df[['District', 'stream', 'degree', 'year', 'students_sat', 'students_passed', 'z-score_cutoff']]

final_df.to_csv('processed_datasets/combined_dataset.csv', index=False)

print("Final Merged and Cleaned Data Head:")
final_df.head()

Final Merged and Cleaned Data Head:


Unnamed: 0,District,stream,degree,year,students_sat,students_passed,z-score_cutoff
0,colombo,biological science,MEDICINE University of Colombo,2019,2688,1838,2.4546
1,gampaha,biological science,MEDICINE University of Colombo,2019,1648,970,2.501
2,kalutara,biological science,MEDICINE University of Colombo,2019,1018,609,2.5199
3,matale,biological science,MEDICINE University of Colombo,2019,443,179,2.5115
4,kandy,biological science,MEDICINE University of Colombo,2019,1615,860,2.5805


In [15]:
# 6. Feature Engineering and Encoding for Machine Learning

df = pd.read_csv('processed_datasets/combined_dataset.csv')

# --- Feature Engineering ---
# 1. Create pass_rate
# Replace 0s in students_sat to avoid division by zero, then calculate rate
df['pass_rate'] = df['students_passed'] / df['students_sat'].replace(0, 1) 

# 2. Create a time index from the year
df['time_index'] = df['year'] - df['year'].min()

# Define features and target. We will use the new engineered features.
features = ['District', 'stream', 'degree', 'time_index', 'pass_rate']
target = 'z-score_cutoff'

X = df[features]
y = df[target]

# --- One-Hot Encoding ---
categorical_cols = ['District', 'stream', 'degree']
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_encoded.columns = X_encoded.columns.str.replace(r'[\"\[\]\{\},:]', '', regex=True)

# Save the final feature-engineered and encoded dataset
final_encoded_df = pd.concat([X_encoded, y], axis=1)
final_encoded_df.to_csv('processed_datasets/engineered_encoded_dataset.csv', index=False)

print("Feature-Engineered and Encoded Data Head (Ready for ML):")
final_encoded_df.head()

Feature-Engineered and Encoded Data Head (Ready for ML):


Unnamed: 0,time_index,pass_rate,District_anuradhapura,District_badulla,District_batticaloa,District_colombo,District_galle,District_gampaha,District_hambantota,District_jaffna,...,degree_SOCIAL WORK University of Jayewardenepura,degree_SOCIAL WORK University of Peradeniya,degree_SPEECH AND HEARING SCIENCES University of Kelaniya,degree_STATISTICS & OPERATIONS RESEARCH University of Peradeniya,degree_URBAN BIORESOURCES .1 University of Sri Jayewardenepura,degree_URBAN BIORESOURCES University of Sri Jayewardenepura,degree_VETERINARY SCIENCE University of Peradeniya,degree_VISUAL & TECHNOLOGICAL ARTS Swami Vipulananda Institute of Aesthetic Studies,degree_VISUAL ARTS University of the Visual & Performing Arts,z-score_cutoff
0,0,0.68378,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2.4546
1,0,0.588592,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,2.501
2,0,0.598232,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2.5199
3,0,0.404063,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2.5115
4,0,0.532508,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2.5805


In [16]:
# 7. Train and Evaluate a Time-Series Model

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Load the feature-engineered and encoded dataset
df = pd.read_csv('processed_datasets/engineered_encoded_dataset.csv')

# --- Time-Based Splitting ---
# Identify the latest year for the test set
latest_year_index = df['time_index'].max()

# Split the data based on the time_index
train_df = df[df['time_index'] < latest_year_index]
test_df = df[df['time_index'] == latest_year_index]

# Separate features (X) and target (y)
X_train = train_df.drop(columns=['z-score_cutoff'])
y_train = train_df['z-score_cutoff']
X_test = test_df.drop(columns=['z-score_cutoff'])
y_test = test_df['z-score_cutoff']

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# --- Model Training ---
# Initialize and train the LightGBM Regressor
lgbm = lgb.LGBMRegressor(random_state=42)
lgbm.fit(X_train, y_train)

# --- Model Evaluation ---
# Make predictions on the test set
y_pred = lgbm.predict(X_test)

# Calculate and print evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (MAE) on the test set: {mae:.4f}')
print(f'R-squared (R2) on the test set: {r2:.4f}')

Training data shape: (7801, 189)
Testing data shape: (2805, 189)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 596
[LightGBM] [Info] Number of data points in the train set: 7801, number of used features: 177
[LightGBM] [Info] Start training from score 1.284264
Mean Absolute Error (MAE) on the test set: 0.1317
R-squared (R2) on the test set: 0.7852


In [None]:
import joblib

# Save the model
joblib.dump(lgbm, 'model/model.joblib')

['model/model.joblib']

In [28]:
import pandas as pd
import joblib

def predict(degree, stream, district):
    """
    Predict z-score cutoff for a given degree, stream, and district.
    Now includes validation to ensure inputs exist in training data.
    
    Args:
        degree (str): The degree name (e.g., "MEDICINE University of Colombo")
        stream (str): The stream name (e.g., "biological science") 
        district (str): The district name (e.g., "Colombo")
    
    Returns:
        float: Predicted z-score cutoff, or None if inputs are invalid
    """
    try:
        # Load the datasets first to validate inputs
        training_data = pd.read_csv('processed_datasets/final_dataset.csv')
        final_dataset = pd.read_csv('processed_datasets/final_dataset_engineered_encoded.csv')
        
        # Normalize inputs to match the data format
        stream = stream.lower().strip()
        district = district.lower().strip()
        degree = degree.upper().strip()
        
        # VALIDATION: Check if inputs exist in training data
        available_degrees = set(training_data['degree'].str.upper().str.strip())
        available_streams = set(training_data['stream'].str.lower().str.strip())
        available_districts = set(training_data['District'].str.lower().str.strip())
        
        # Validate degree
        if degree not in available_degrees:
            print(f"ERROR: Degree '{degree}' not found in training data.")
            print(f"Available degrees containing your search term:")
            matching_degrees = [d for d in available_degrees if any(word in d for word in degree.split())]
            for d in sorted(matching_degrees)[:5]:
                print(f"   - {d}")
            return None
            
        # Validate stream
        if stream not in available_streams:
            print(f"ERROR: Stream '{stream}' not found in training data.")
            print(f"Available streams: {sorted(available_streams)}")
            return None
            
        # Validate district (with partial matching)
        if district not in available_districts:
            print(f"!WARNING: District '{district}' not found in training data.")
            # Try partial matching
            matching_districts = [d for d in available_districts if district in d or d in district]
            if matching_districts:
                district = matching_districts[0]
                print(f"Using closest match: '{district}'")
            else:
                print(f"Available districts: {sorted(available_districts)}")
                return None
        
        # Load the model after validation passes
        lgbm = joblib.load('model/model.joblib')
        
        # Filter performance data for the given stream and district
        stream_district_perf = training_data[
            (training_data['stream'] == stream) & 
            (training_data['District'] == district)
        ]
        
        if stream_district_perf.empty:
            # If no exact match, try to find data for the stream across all districts
            stream_perf = training_data[training_data['stream'] == stream]
            if stream_perf.empty:
                raise ValueError(f"No performance data found for stream: {stream}")
            
            # Use the most recent data across all districts for this stream
            latest_year = stream_perf['year'].max()
            latest_data = stream_perf[stream_perf['year'] == latest_year]
            
            # Use average performance across districts
            latest_year_perf = {
                'students_sat': latest_data['students_sat'].mean(),
                'students_passed': latest_data['students_passed'].mean()
            }
            print(f"! Using average performance across all districts for {stream}")
        else:
            # Get the most recent data for this specific stream and district
            latest_year = stream_district_perf['year'].max()
            latest_data = stream_district_perf[stream_district_perf['year'] == latest_year]
            latest_year_perf = {
                'students_sat': latest_data['students_sat'].iloc[0],
                'students_passed': latest_data['students_passed'].iloc[0]
            }
        
        # Get the minimum year from the original dataset for time_index calculation
        min_year = training_data['year'].min()
        
        # Create prediction dataframe
        new_data = pd.DataFrame({
            'District': [district],
            'stream': [stream], 
            'degree': [degree],
            'time_index': [2024 - min_year],  # Using 2024 as prediction year
            'pass_rate': [latest_year_perf['students_passed'] / max(latest_year_perf['students_sat'], 1)]
        })
        
        # One-Hot Encoding to match training data format
        categorical_cols = ['District', 'stream', 'degree']
        new_data_encoded = pd.get_dummies(new_data, columns=categorical_cols, drop_first=True)
        
        # Clean column names to match training data
        new_data_encoded.columns = new_data_encoded.columns.str.replace(r'[\"\[\]\{\},:]', '', regex=True)
        
        # Get the feature columns from the training data
        X_train_columns = final_dataset.drop(columns=['z-score_cutoff']).columns.tolist()
        
        # Reindex to match training data columns (fill missing columns with 0)
        new_data_encoded = new_data_encoded.reindex(columns=X_train_columns, fill_value=0)
        
        # Check if this combination actually existed in training
        original_combo = training_data[
            (training_data['degree'].str.upper().str.strip() == degree) &
            (training_data['stream'].str.lower().str.strip() == stream) &
            (training_data['District'].str.lower().str.strip() == district)
        ]
        
        # Make prediction
        prediction = lgbm.predict(new_data_encoded)
        
        return prediction[0]
        
    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return None


def get_top_5_accessible_degrees(user_z_score, stream, district):
    """
    Find top 5 degrees that a candidate can get admitted to based on their z-score.
    
    Args:
        user_z_score (float): The candidate's z-score
        stream (str): The stream name (e.g., "biological science")
        district (str): The district name (e.g., "Colombo")
    
    Returns:
        list: List of dictionaries containing degree info and predicted cutoffs
    """
    try:
        # Load the dataset
        training_data = pd.read_csv('processed_datasets/final_dataset.csv')
        
        # Normalize inputs
        stream = stream.lower().strip()
        district = district.lower().strip()
        
        # Get all unique degrees for the given stream and district
        stream_district_data = training_data[
            (training_data['stream'].str.lower().str.strip() == stream) & 
            (training_data['District'].str.lower().str.strip() == district)
        ]
        
        if stream_district_data.empty:
            print(f"No data found for stream '{stream}' in district '{district}'")
            return []
        
        # Get unique degrees
        unique_degrees = stream_district_data['degree'].str.upper().str.strip().unique()
        
        print(f"!!Found {len(unique_degrees)} degrees for {stream} stream in {district} district")
        print("Getting predictions for all of them now...")
        
        accessible_degrees = []
        
        # Predict z-score cutoff for each degree
        for degree in unique_degrees:
            predicted_cutoff = predict(degree, stream, district)
            
            if predicted_cutoff is not None:
                # Check if candidate's z-score is higher than the predicted cutoff
                if user_z_score >= predicted_cutoff:
                    accessible_degrees.append({
                        'degree': degree,
                        'predicted_cutoff': predicted_cutoff,
                        'margin': user_z_score - predicted_cutoff  # How much above the cutoff
                    })
            else:
                print(f"{degree}: Could not predict cutoff")
        
        # Sort by predicted cutoff in descending order (highest cutoff first = most competitive/prestigious)
        accessible_degrees.sort(key=lambda x: x['predicted_cutoff'], reverse=True)
        
        # Return top 5
        top_5 = accessible_degrees[:5]
        
        print("\n" + "=" * 80)
        print(f"TOP 5 DEGREES YOU CAN GET INTO (Your Z-Score: {user_z_score})")
        print("=" * 80)
        
        if not top_5:
            print(":( Unfortunately, no degrees found that you can get into with your current z-score.")
            print("Consider retaking the exam or exploring other streams/districts.")
        else:
            for i, degree_info in enumerate(top_5, 1):
                print(f"{i}. {degree_info['degree']}")
                print(f"Predicted Cutoff: {degree_info['predicted_cutoff']:.4f}")
                print(f"Your Margin: +{degree_info['margin']:.4f}")
                print()
        
        return top_5
        
    except Exception as e:
        print(f"Error in getting accessible degrees: {str(e)}")
        return []

if __name__ == '__main__':
    print("COMPREHENSIVE DEGREE RECOMMENDATION SYSTEM")
    print("=" * 80)
    
    # Test parameters
    user_z_score = 2.3  # Example z-score
    stream = "biological science"
    district = "colombo"
    
    print(f"User Z-Score: {user_z_score}")
    print(f"Stream: {stream}")
    print(f"District: {district}")
    print()
    
    # Get top 5 accessible degrees
    top_5 = get_top_5_accessible_degrees(user_z_score, stream, district)

COMPREHENSIVE DEGREE RECOMMENDATION SYSTEM
User Z-Score: 2.3
Stream: biological science
District: colombo

!!Found 70 degrees for biological science stream in colombo district
Getting predictions for all of them now...

TOP 5 DEGREES YOU CAN GET INTO (Your Z-Score: 2.3)
1. MEDICINE UNIVERSITY OF COLOMBO
Predicted Cutoff: 1.2207
Your Margin: +1.0793

2. MEDICINE UNIVERSITY OF PERADENIYA
Predicted Cutoff: 1.2207
Your Margin: +1.0793

3. MEDICINE UNIVERSITY OF SRI JAYEWARDENEPURA
Predicted Cutoff: 1.2207
Your Margin: +1.0793

4. MEDICINE UNIVERSITY OF KELANIYA
Predicted Cutoff: 1.2207
Your Margin: +1.0793

5. MEDICINE UNIVERSITY OF JAFFNA
Predicted Cutoff: 1.2207
Your Margin: +1.0793

