In [None]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [13]:
student_data = pd.read_csv(rf"C:\Users\Wahaj Sajid\Desktop\Datasets\StudentPerformanceFactors.csv")

In [17]:
student_data.shape

(6607, 19)

In [14]:
#showing the data
student_data.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [37]:
#check for the null values
student_data.isnull().sum()

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Gender                        0
Exam_Score                    0
dtype: int64

In [16]:
#droping the "Distance_from_Home" column as it is not that important
student_data = student_data.drop(columns=["Distance_from_Home"], axis=1)

In [36]:
#imputing null values
simple_imputer = SimpleImputer(strategy='most_frequent', fill_value='missing')
for col in ["Teacher_Quality", "Parental_Education_Level"]:
    student_data[col] = simple_imputer.fit_transform(student_data[[col]]).ravel()

In [38]:
#checking for the unique categories in column
object_data = student_data.select_dtypes(exclude=['int', 'bool'])
for col in object_data.columns:
    unique_categories = object_data[col].unique()
    print(f"{col}: {unique_categories}")

 

Parental_Involvement: ['Low' 'Medium' 'High']
Access_to_Resources: ['High' 'Medium' 'Low']
Extracurricular_Activities: ['No' 'Yes']
Motivation_Level: ['Low' 'Medium' 'High']
Internet_Access: ['Yes' 'No']
Family_Income: ['Low' 'Medium' 'High']
Teacher_Quality: ['Medium' 'High' 'Low']
School_Type: ['Public' 'Private']
Peer_Influence: ['Positive' 'Negative' 'Neutral']
Learning_Disabilities: ['No' 'Yes']
Parental_Education_Level: ['High School' 'College' 'Postgraduate']
Gender: ['Male' 'Female']


In [40]:
#make the dummies of columns with three categorical variables
student_data = pd.get_dummies(student_data, columns=['Parental_Involvement',"Access_to_Resources", "Motivation_Level", "Family_Income", "Teacher_Quality","Peer_Influence", "Parental_Education_Level"])

In [41]:
#data after making dumies
student_data.head()

Unnamed: 0,Hours_Studied,Attendance,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Internet_Access,Tutoring_Sessions,School_Type,Physical_Activity,Learning_Disabilities,Gender,Exam_Score,Parental_Involvement_High,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_High,Access_to_Resources_Low,Access_to_Resources_Medium,Motivation_Level_High,Motivation_Level_Low,Motivation_Level_Medium,Family_Income_High,Family_Income_Low,Family_Income_Medium,Teacher_Quality_High,Teacher_Quality_Low,Teacher_Quality_Medium,Peer_Influence_Negative,Peer_Influence_Neutral,Peer_Influence_Positive,Parental_Education_Level_College,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate
0,23,84,No,7,73,Yes,0,Public,3,No,Male,67,False,True,False,True,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,True,False
1,19,64,No,8,59,Yes,2,Public,4,No,Female,61,False,True,False,False,False,True,False,True,False,False,False,True,False,False,True,True,False,False,True,False,False
2,24,98,Yes,7,91,Yes,2,Public,4,No,Male,74,False,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,True,False,False,False,True
3,29,89,Yes,8,98,Yes,1,Public,4,No,Male,71,False,True,False,False,False,True,False,False,True,False,False,True,False,False,True,True,False,False,False,True,False
4,19,92,Yes,6,65,Yes,3,Public,4,No,Female,70,False,False,True,False,False,True,False,False,True,False,False,True,True,False,False,False,True,False,True,False,False


In [43]:
#encode the categorical data
encoder = LabelEncoder()
data_with_categories = student_data.select_dtypes(exclude=['int'])
for col in data_with_categories.columns:
    student_data[col] = encoder.fit_transform(student_data[col])

In [44]:
#data after encoding the data

student_data.head()

Unnamed: 0,Hours_Studied,Attendance,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Internet_Access,Tutoring_Sessions,School_Type,Physical_Activity,Learning_Disabilities,Gender,Exam_Score,Parental_Involvement_High,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_High,Access_to_Resources_Low,Access_to_Resources_Medium,Motivation_Level_High,Motivation_Level_Low,Motivation_Level_Medium,Family_Income_High,Family_Income_Low,Family_Income_Medium,Teacher_Quality_High,Teacher_Quality_Low,Teacher_Quality_Medium,Peer_Influence_Negative,Peer_Influence_Neutral,Peer_Influence_Positive,Parental_Education_Level_College,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate
0,23,84,0,7,73,1,0,1,3,0,1,67,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0
1,19,64,0,8,59,1,2,1,4,0,0,61,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0
2,24,98,1,7,91,1,2,1,4,0,1,74,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1
3,29,89,1,8,98,1,1,1,4,0,1,71,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0
4,19,92,1,6,65,1,3,1,4,0,0,70,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,1,0,0


In [57]:
#defining x and y of the data
x = student_data.drop(columns=['Exam_Score'], axis=1)
y = student_data['Exam_Score']

In [58]:
#train, test split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [59]:
x_train_df = pd.DataFrame(x_train)
y_train_df = pd.DataFrame(y_train)

x_test_df = pd.DataFrame(x_test)
y_test_df = pd.DataFrame(y_test)

train_data = pd.concat([x_train_df, y_train_df], axis=1)
test_data = pd.concat([x_test_df, y_test_df], axis=1)


In [60]:
#train data display
train_data.head()

Unnamed: 0,Hours_Studied,Attendance,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Internet_Access,Tutoring_Sessions,School_Type,Physical_Activity,Learning_Disabilities,Gender,Parental_Involvement_High,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_High,Access_to_Resources_Low,Access_to_Resources_Medium,Motivation_Level_High,Motivation_Level_Low,Motivation_Level_Medium,Family_Income_High,Family_Income_Low,Family_Income_Medium,Teacher_Quality_High,Teacher_Quality_Low,Teacher_Quality_Medium,Peer_Influence_Negative,Peer_Influence_Neutral,Peer_Influence_Positive,Parental_Education_Level_College,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Exam_Score
5810,27,79,1,8,63,1,2,1,5,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,69
1268,16,86,1,7,94,1,2,1,3,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,69
414,22,87,0,8,83,1,1,1,1,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,66
4745,18,100,1,10,86,1,1,1,3,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,72
654,35,78,1,10,99,1,1,0,2,0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,72


In [61]:
#test data display
test_data.head()

Unnamed: 0,Hours_Studied,Attendance,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Internet_Access,Tutoring_Sessions,School_Type,Physical_Activity,Learning_Disabilities,Gender,Parental_Involvement_High,Parental_Involvement_Low,Parental_Involvement_Medium,Access_to_Resources_High,Access_to_Resources_Low,Access_to_Resources_Medium,Motivation_Level_High,Motivation_Level_Low,Motivation_Level_Medium,Family_Income_High,Family_Income_Low,Family_Income_Medium,Teacher_Quality_High,Teacher_Quality_Low,Teacher_Quality_Medium,Peer_Influence_Negative,Peer_Influence_Neutral,Peer_Influence_Positive,Parental_Education_Level_College,Parental_Education_Level_High School,Parental_Education_Level_Postgraduate,Exam_Score
743,20,71,0,7,87,1,1,1,5,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,65
5551,22,71,1,7,98,1,2,1,2,0,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1,0,65
3442,21,91,1,6,53,1,1,1,3,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,71
6571,12,91,1,8,81,1,0,1,4,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,64
4204,21,63,1,8,95,1,2,1,5,0,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,66


In [62]:
#Scaling the data
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

#scale the x
x_train = x_scaler.fit_transform(x_train)
x_test = x_scaler.transform(x_test)


#scale the y
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = y_scaler.transform(y_test.values.reshape(-1,1))


In [63]:
#train the model
model = GradientBoostingRegressor(learning_rate=0.1, n_estimators=2000)
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


In [68]:
#evalute the model using training data
train_predictions = model.predict(x_train)
mse_train = mean_squared_error(y_train, train_predictions)
mae_train = mean_absolute_error(y_train, train_predictions)
print(f"mean squared error: ",mse_train)
print(f"mean absolute error ", mae_train)

mean squared error:  0.0009885898504227537
mean absolute error  0.013295611028597646


In [70]:
#evaluate the model using testing data
test_predictions = model.predict(x_test)
mse_test = mean_squared_error(y_test, test_predictions)
mae_test = mean_absolute_error(y_test, test_predictions)
print(f"mean squared error: ",mse_test)
print(f"mean absolute error ", mae_test)

mean squared error:  0.0022153632067879693
mean absolute error  0.020245871176081842


In [78]:
#test the model using the actual train values

values = [[20, 71, 0, 7, 87, 1, 1, 1, 5, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0]]
scales_values = x_scaler.transform(values)

predict = model.predict(scales_values)
print(y_scaler.inverse_transform([predict]))


[[65.35644958]]


