## Start of the ML model training script

### 01. Load the dataset

In [1]:
print("Hello world from the jupyter notebook!")

import pandas as pd

# Load the dataset
df = pd.read_csv("final_z_scores_with_sat_data.csv")

# View the first 5 rows
df.head()

Hello world from the jupyter notebook!


Unnamed: 0,Year,District,Programme,Stream,Number,Passed,Z-Score
0,2022,COLOMBO,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4516
1,2022,GAMPAHA,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4412
2,2022,KALUTARA,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4446
3,2022,MATALE,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.4981
4,2022,KANDY,MEDICINE (University of Colombo),Bio Science,40329.0,25851.0,2.5501


### 02. Handling the missing values (`NaN`)

In [2]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Print the number of missing values for each columns
print(missing_values[missing_values > 0])

Stream     5725
Number     5725
Passed     5725
Z-Score    4542
dtype: int64


In [3]:
# Remove the rows where z-score is NaN
df.dropna(subset=['Z-Score'], inplace=True)

# For other numerical columns, fill the NaN with mean
number_mean = df['Number'].mean()
passed_mean = df['Passed'].mean()

df.loc[df['Number'].isnull(), 'Number'] = number_mean
df.loc[df['Passed'].isnull(), 'Passed'] = passed_mean 

# Verify that the removal is successful
print("Missing values in Number:", df['Number'].isnull().sum())
print("Missing values in Passed:", df['Passed'].isnull().sum())

Missing values in Number: 0
Missing values in Passed: 0


In [4]:
# Fill the missing values in the stream column with 'Unknown'
if 'Stream' in df.columns and df['Stream'].isnull().any():
    df['Stream'].fillna('Unkown', inplace=True)

print("Missing values in Steam:", df['Stream'].isnull().sum())

Missing values in Steam: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Stream'].fillna('Unkown', inplace=True)


### 03. Feature engineering

In [5]:
# Create the passed ratio feature
# A small number has added to the denominator to avoid division by zero
df['Passed_Ratio'] = df['Passed'] / (df['Number'] + 1e-6)

# Display the first few rows with the new column
df[['Number', 'Passed', 'Passed_Ratio']].head()

Unnamed: 0,Number,Passed,Passed_Ratio
0,40329.0,25851.0,0.641003
1,40329.0,25851.0,0.641003
2,40329.0,25851.0,0.641003
3,40329.0,25851.0,0.641003
4,40329.0,25851.0,0.641003


### 04. Encode Categorical Features

In [6]:
# Select the categorical columns to encode
categorical_cols = ['District', 'Programme', 'Stream']

# One-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Display the first five rows
print("Original shape:", df.shape)
print("Shape after one-hot encoding:", df_encoded.shape)
df_encoded.head()

# Assign the result back to the main DataFrame
df = df_encoded

Original shape: (20383, 8)
Shape after one-hot encoding: (20383, 356)


### 05. Define features and targets

In [7]:
# Define features by droping 'Z-Score' column
X = df.drop('Z-Score', axis=1)

# Define the target (y)
y = df['Z-Score']

# Verify
print("Shape of the features (X):", X.shape)
print("Shape of target (y)", y.shape)

Shape of the features (X): (20383, 355)
Shape of target (y) (20383,)


### 06. Spliting the train and test sets

In [9]:
# X contains the 'Year' column
# Data split chronologically (older years for train, most recent year for test)

# Find the most recent year for test
latest_year = X['Year'].max()

# Get all the rest years for training
X_train = X[X['Year'] < latest_year]
y_train = y[X['Year'] < latest_year]

# Get the latest year data for the test sets
X_test = X[X['Year'] == latest_year]
y_test = y[X['Year'] == latest_year]

# Drop the year column from features set as it's no longer needed
X_train = X_train.drop('Year', axis=1)
X_test = X_test.drop('Year', axis=1)

# Verify splits
print(f"Training data shape (X_train): {X_train.shape}")
print(f"Testing data shape (X_test): {X_test.shape}")
print(f"Training target shape (y_train): {y_train.shape}")
print(f"Testing target shape (y_test): {y_test.shape}")

Training data shape (X_train): (15402, 354)
Testing data shape (X_test): (4981, 354)
Training target shape (y_train): (15402,)
Testing target shape (y_test): (4981,)


### 07. Train the model

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Initiate the Random Forest Classifier
# No of trees in the classifier = 100
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)

# Train the model
rf_model.fit(X_train, y_train)

# Print the Out Of Bag score to verify
print(f"Model training complete.")
print(f"Out-Of-Bag R^2 score: {rf_model.oob_score_: .4f}")

Model training complete.
Out-Of-Bag R^2 score:  0.8226


### 08. Evaluate the model

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the test data
y_pred = rf_model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Model Performance on the Test Set:")
print(f"R-Squred (R2): {r2:.4f}")
print(f"Mean Abosulte Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Model Performance on the Test Set:
R-Squred (R2): 0.7243
Mean Abosulte Error (MAE): 0.1566
Root Mean Squared Error (RMSE): 0.2528
