## Start of the ML model training script

### 01. Load the dataset

In [4]:
print("Hello world from the jupyter notebook!")

import pandas as pd

# Load the dataset
df = pd.read_csv("final_z_scores_with_sat_data.csv")

# View the first 5 rows
df.head()

Hello world from the jupyter notebook!


Unnamed: 0,Year,District,Programme,Stream,Number,Passed,Z-Score
0,2022,COLOMBO,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4516
1,2022,GAMPAHA,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4412
2,2022,KALUTARA,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4446
3,2022,MATALE,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4981
4,2022,KANDY,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.5501


### 02. Handling the missing values (`NaN`)

In [None]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Print the number of missing values for each columns
print(missing_values[missing_values > 0])

Stream     5725
Number     5725
Passed     5725
Z-Score    4542
dtype: int64


In [11]:
# Remove the rows where z-score is NaN
df.dropna(subset=['Z-Score'], inplace=True)

# For other numerical columns, fill the NaN with mean
number_mean = df['Number'].mean()
passed_mean = df['Passed'].mean()

df.loc[df['Number'].isnull(), 'Number'] = number_mean
df.loc[df['Passed'].isnull(), 'Passed'] = passed_mean 

# Verify that the removal is successful
print("Missing values in Number:", df['Number'].isnull().sum())
print("Missing values in Passed:", df['Passed'].isnull().sum())

Missing values in Number: 0
Missing values in Passed: 0


In [17]:
# Fill the missing values in the stream column with 'Unknown'
if 'Stream' in df.columns and df['Stream'].isnull().any():
    df['Stream'].fillna('Unkown', inplace=True)

print("Missing values in Steam:", df['Stream'].isnull().sum())

Missing values in Steam: 0


### 03. Feature engineering

In [18]:
# Create the passed ratio feature
# A small number has added to the denominator to avoid division by zero
df['Passed_Ratio'] = df['Passed'] / (df['Number'] + 1e-6)

# Display the first few rows with the new column
df[['Number', 'Passed', 'Passed_Ratio']].head()

Unnamed: 0,Number,Passed,Passed_Ratio
0,40329.0,25851.0,0.641003
1,40329.0,25851.0,0.641003
2,40329.0,25851.0,0.641003
3,40329.0,25851.0,0.641003
4,40329.0,25851.0,0.641003


### 04. Encode Categorical Features

In [21]:
# Select the categorical columns to encode
categorical_cols = ['District', 'Programme', 'Stream']

# One-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Display the first five rows
print("Original shape:", df.shape)
print("Shape after one-hot encoding:", df_encoded.shape)
df_encoded.head()

Original shape: (20383, 8)
Shape after one-hot encoding: (20383, 356)


Unnamed: 0,Year,Number,Passed,Z-Score,Passed_Ratio,District_ANURADHAPURA,District_BADULLA,District_BATTICALOA,District_COLOMBO,District_GALLE,...,Programme_URBAN BIORESOURCES (University of Sri Jayewardenepura).1,Programme_VETERINARY SCIENCE (University of Peradeniya),Programme_VISUAL & TECHNOLOGICAL ARTS # (Swami Vipulananda Institute of Aesthetic Studies),Programme_VISUAL ARTS # (University of the Visual & Performing Arts),"Programme_YOGA AND PARAPSYCHOLOGY (The Gampaha Wickramarachchi University of Indigenous Medicine, Sri Lanka)",Stream_Bio Science,Stream_Commerce,Stream_Physical Science,Stream_Technology,Stream_Unkown
0,2022,40329.0,25851.0,2.4516,0.641003,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
1,2022,40329.0,25851.0,2.4412,0.641003,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,2022,40329.0,25851.0,2.4446,0.641003,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,2022,40329.0,25851.0,2.4981,0.641003,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
4,2022,40329.0,25851.0,2.5501,0.641003,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
