In [34]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the student performance dataset; the path is relative to the
# notebook's working directory.
csv_path = "StudentPerformanceFactors.csv"
df = pd.read_csv(csv_path)

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [35]:
def calc_cgpa(score):
    """Convert a numeric score into a grade point from 0 to 4.

    Bands (lower bound inclusive):
        >= 70 -> 4, >= 60 -> 3, >= 50 -> 2, >= 40 -> 1, otherwise 0.

    A value that fails every comparison (anything below 40, or NaN, for
    which every ``>=`` test is False) yields 0.
    """
    # (inclusive lower bound, grade point), checked from the highest band down
    bands = ((70, 4), (60, 3), (50, 2), (40, 1))
    for lower_bound, grade_point in bands:
        if score >= lower_bound:
            return grade_point
    return 0



In [36]:

# Derive 0-4 grade-point bands from the raw score columns.
# Series.map + astype("uint8") is used instead of np.array(..., dtype=np.uint8)
# because numpy is only imported in a later cell; on a fresh kernel
# (Restart & Run All) the previous form raised NameError here.
# The resulting columns are uint8, the same dtype as before.
df['Previous_cgpa'] = df['Previous_Scores'].map(calc_cgpa).astype('uint8')
df['Exam_Score_cgpa'] = df['Exam_Score'].map(calc_cgpa).astype('uint8')


In [37]:
# 1. Label encoding: every text-valued column below gets an integer-coded
#    companion column named "<column>_Encoded". The same LabelEncoder instance
#    is re-fitted for each column, so after the loop it only holds the mapping
#    of the last column processed.
label_encoder = LabelEncoder()

categorical_columns = [
    'Access_to_Resources',
    'Motivation_Level',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
    'Peer_Influence',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
    'Parental_Involvement',
    'Extracurricular_Activities',
    'Internet_Access',
    'Learning_Disabilities',
]

for column_name in categorical_columns:
    df[f'{column_name}_Encoded'] = label_encoder.fit_transform(df[column_name])



In [38]:
# Remove the original text-valued columns listed below and rebind df to the
# reduced frame.
columns_to_drop = [
    'Access_to_Resources',
    'Motivation_Level',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
    'Peer_Influence',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
    'Parental_Involvement',
    'Extracurricular_Activities',
    'Internet_Access',
    'Learning_Disabilities',
]
df = df.drop(columns_to_drop, axis='columns')

# Preview the remaining columns.
df.head()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score,Previous_cgpa,Exam_Score_cgpa,Access_to_Resources_Encoded,...,Teacher_Quality_Encoded,School_Type_Encoded,Peer_Influence_Encoded,Parental_Education_Level_Encoded,Distance_from_Home_Encoded,Gender_Encoded,Parental_Involvement_Encoded,Extracurricular_Activities_Encoded,Internet_Access_Encoded,Learning_Disabilities_Encoded
0,23,84,7,73,0,3,67,4,3,0,...,2,1,2,1,2,1,1,0,1,0
1,19,64,8,59,2,4,61,2,3,2,...,2,1,0,0,1,0,1,0,1,0
2,24,98,7,91,2,4,74,4,4,2,...,2,1,1,2,2,1,2,1,1,0
3,29,89,8,98,1,4,71,4,4,2,...,2,1,0,1,1,1,1,1,1,0
4,19,92,6,65,3,4,70,3,4,2,...,0,1,1,0,2,0,2,1,1,0


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, classification_report
import numpy as np
def _report_errors(actual, predicted):
    """Print MSE and RMSE of `predicted` against `actual`; return (mse, rmse)."""
    mse_value = mean_squared_error(actual, predicted)
    print(f"Mean Squared Error: {mse_value}")
    rmse_value = np.sqrt(mse_value)
    print(f"Root Mean Squared Error: {rmse_value}")
    return mse_value, rmse_value


# Discard rows that still contain missing values.
# NOTE(review): the text columns were label-encoded and dropped in earlier
# cells, so missing values in those columns may no longer be visible to
# dropna() at this point - confirm this is intended.
df = df.dropna()

# Target is the banded exam score; the raw exam score, the target itself and
# the raw previous score are excluded from the features.
target_column = 'Exam_Score_cgpa'
excluded_columns = ['Exam_Score', 'Exam_Score_cgpa', 'Previous_Scores']
y = df[target_column]
X = df.drop(columns=excluded_columns)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest regressor on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Errors of the raw (continuous) predictions on the held-out portion.
y_pred = model.predict(X_test)
mse, rmse = _report_errors(y_test, y_pred)

# Round each prediction to an integer with Python's built-in round(), then
# report the errors again.
y_pred = list(map(round, y_pred))

print('After adjustment:\n')
mse, rmse = _report_errors(y_test, y_pred)

# Side-by-side view of actual vs. rounded predicted values.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison_df.head()

Mean Squared Error: 0.07170219364599092
Root Mean Squared Error: 0.2677726529091254
After adjustment:

Mean Squared Error: 0.08698940998487141
Root Mean Squared Error: 0.29493967177182423


Unnamed: 0,Actual,Predicted
743,3,3
5551,3,3
3442,4,4
6571,3,3
4204,3,3


In [42]:
# Treat the values in y_pred as class labels and compute classification
# metrics against y_test (both names come from an earlier cell).
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)  # precision / recall / F1

# Print the three results in order: accuracy, confusion matrix, report.
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

# Side-by-side view of actual vs. predicted values.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison_df.head()

Accuracy: 91.30%

Confusion Matrix:
[[  2   9   0]
 [  1 921  41]
 [  0  64 284]]

Classification Report:
              precision    recall  f1-score   support

           2       0.67      0.18      0.29        11
           3       0.93      0.96      0.94       963
           4       0.87      0.82      0.84       348

    accuracy                           0.91      1322
   macro avg       0.82      0.65      0.69      1322
weighted avg       0.91      0.91      0.91      1322



Unnamed: 0,Actual,Predicted
743,3,3
5551,3,3
3442,4,4
6571,3,3
4204,3,3
