In [13]:
import pandas as pd

# Reading the CSV file using pandas
data = pd.read_csv("data.csv")
data.columns.tolist()

['Student ID',
 'Age',
 'Gender',
 'Special Needs',
 'Attendance (%)',
 'Punctuality (%)',
 'Study Hour/Week',
 'Parental Occupation',
 'Parental Education Level',
 'Household Income',
 'Parental Digital Literacy',
 'Teacher Meetings (per year)',
 'Homework Help (per week)',
 'Library Hours',
 'Computer Access',
 'Teacher-Student Ratio',
 'Access to learning Materials',
 'Activity',
 'Hours/Week',
 'Leadership Role',
 'Class Level',
 'Midterm Score',
 'Exam Score',
 'Improvements Over Time',
 'JAMB Mock Score',
 'Weak Subject']

In [14]:
#add new column to sum uo the exam score
data["End Score"] = data["Midterm Score"] + data["Exam Score"]
# Display the first few rows of the dataframe
data.head()

Unnamed: 0,Student ID,Age,Gender,Special Needs,Attendance (%),Punctuality (%),Study Hour/Week,Parental Occupation,Parental Education Level,Household Income,...,Activity,Hours/Week,Leadership Role,Class Level,Midterm Score,Exam Score,Improvements Over Time,JAMB Mock Score,Weak Subject,End Score
0,STU001,14,Female,No,89.21,93.86,4,Lawyer,Doctoral,Low,...,JETS Club,3,Yes,JSS1,22.11,35.62,Declined,0,Basic Science,57.73
1,STU002,11,Male,Yes,80.34,64.69,6,Farmer,Bachelor,High,...,Maths Club,3,Yes,JSS2,35.66,0.24,No Change,0,Basic Science,35.9
2,STU003,17,Female,No,63.28,84.26,20,Trader,Masters,High,...,,5,Yes,JSS3,25.95,48.61,No Change,0,Basic Tech,74.56
3,STU004,13,Female,Yes,72.11,59.5,2,Banker,Masters,High,...,Debate & Press Club,2,Yes,SS1,6.2,4.39,Declined,0,English,10.59
4,STU005,18,Female,No,81.37,91.5,20,Lawyer,Doctoral,Low,...,Maths Club,5,No,SS2,15.84,58.1,Declined,0,Biology,73.94


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Student ID                    600 non-null    object 
 1   Age                           600 non-null    int64  
 2   Gender                        600 non-null    object 
 3   Special Needs                 600 non-null    object 
 4   Attendance (%)                600 non-null    float64
 5   Punctuality (%)               600 non-null    float64
 6   Study Hour/Week               600 non-null    int64  
 7   Parental Occupation           600 non-null    object 
 8   Parental Education Level      600 non-null    object 
 9   Household Income              600 non-null    object 
 10  Parental Digital Literacy     600 non-null    object 
 11  Teacher Meetings (per year)   600 non-null    int64  
 12  Homework Help (per week)      600 non-null    int64  
 13  Libra

## Feature Engineering

In [20]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

df = data.copy()

#Binary Encoding
df["Special Needs"] = df["Special Needs"].apply(lambda x: 1 if x == "Yes" else 0)
df["Parental Digital Literacy"] = df["Parental Digital Literacy"].apply(lambda x: 1 if x == "Yes" else 0)
df["Leadership Role"] = df["Leadership Role"].apply(lambda x: 1 if x == "Yes" else 0)
df["Gender"] = df["Gender"].apply(lambda x: 1 if x == "Male" else 0)

# One-Hot encoding
categorical_columns = ['Parental Occupation', 'Parental Education Level', 'Household Income', 'Activity', 'Computer Access', 'Access to learning Materials', 'Weak Subject']
df = pd.get_dummies(df, columns = categorical_columns, drop_first= True, dtype=int)

#fixing the "improvement" over time to numerical value
improvement_mappng = {"Improved": 1, "No Change": 0, "Declined": -1}
df["Improvements Over Time"] = df["Improvements Over Time"].map(improvement_mappng)

#Coverting class level to numerical value
class_mapping = {'JSS1': 1, 'JSS2': 2, 'JSS3': 3, 'SS1': 4, 'SS2': 5, 'SS3': 6}
df['Class Level'] = df['Class Level'].map(class_mapping)

df.drop(["Student ID"], axis=1, inplace = True)

target = "End Score"

df.head()

Unnamed: 0,Age,Gender,Special Needs,Attendance (%),Punctuality (%),Study Hour/Week,Parental Digital Literacy,Teacher Meetings (per year),Homework Help (per week),Library Hours,...,Computer Access_Often,Access to learning Materials_Both,Access to learning Materials_Digital Platform,Weak Subject_Basic Tech,Weak Subject_Biology,Weak Subject_Chemistry,Weak Subject_Civic Education,Weak Subject_English,Weak Subject_Mathematics,Weak Subject_Physics
0,14,0,0,89.21,93.86,4,1,8,4,3,...,0,1,0,0,0,0,0,0,0,0
1,11,1,1,80.34,64.69,6,0,11,5,1,...,0,0,0,0,0,0,0,0,0,0
2,17,0,0,63.28,84.26,20,1,6,5,1,...,0,0,0,1,0,0,0,0,0,0
3,13,0,1,72.11,59.5,2,0,12,0,2,...,1,0,1,0,0,0,0,1,0,0
4,18,0,0,81.37,91.5,20,0,0,3,6,...,1,0,0,0,1,0,0,0,0,0


In [21]:
print(df.dtypes)

Age                                                int64
Gender                                             int64
Special Needs                                      int64
Attendance (%)                                   float64
Punctuality (%)                                  float64
Study Hour/Week                                    int64
Parental Digital Literacy                          int64
Teacher Meetings (per year)                        int64
Homework Help (per week)                           int64
Library Hours                                      int64
Teacher-Student Ratio                              int64
Hours/Week                                         int64
Leadership Role                                    int64
Class Level                                        int64
Midterm Score                                    float64
Exam Score                                       float64
Improvements Over Time                             int64
JAMB Mock Score                

## Modelling

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
#import ace_tools as tools
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#split the features
X = df.drop(columns = ["End Score"])
y = df["End Score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

#dictonary to sstore data
model_results = {}

def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    """ evaluation metricies"""
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return {'MSE': mse, "MAE": mae, "R2": r2}

linear_reg = LinearRegression()
model_results["Linear Regression"] = evaluate_model(linear_reg, X_train, X_test, y_train, y_test)

ridge_reg = Ridge()
model_results["Ridge Regression"] = evaluate_model(ridge_reg, X_train, X_test, y_train, y_test)

lasso_reg = Lasso()
model_results["Lasso Regression"] = evaluate_model(lasso_reg, X_train, X_test, y_train, y_test)

random_forest_reg = RandomForestRegressor()
model_results["Random Forest Regressor"] = evaluate_model(random_forest_reg, X_train, X_test, y_train, y_test)

xgb_reg = XGBRegressor(n_estimators = 100, learning_rate= 0.1, random_state = 42)
model_results["XGBoost Regressor"] = evaluate_model(xgb_reg, X_train, X_test, y_train, y_test)

model_results_df = pd.DataFrame(model_results)

model_results_df

Unnamed: 0,Linear Regression,Ridge Regression,Lasso Regression,Random Forest Regressor,XGBoost Regressor
MSE,5.655934e-25,1.048269e-07,0.010716,4.605724,3.456792
MAE,5.259028e-13,0.000266406,0.088223,1.624615,1.421886
R2,1.0,1.0,0.999976,0.989681,0.992255
