In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm


In [1]:
from google.colab import files
upload = files.upload()

Saving StudentPerformanceFactors.csv to StudentPerformanceFactors.csv


In [3]:
df=pd.read_csv('StudentPerformanceFactors.csv')
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [4]:
# Define categorical columns to encode
categorical_cols = [
    'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
    'Internet_Access', 'School_Type', 'Peer_Influence', 'Gender',
    'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home',
    'Motivation_Level', 'Family_Income', 'Teacher_Quality'
]

# Encode categorical variables
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# Define features and target
X = df.drop(columns=['Exam_Score'])
y = df['Exam_Score']

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Optional: show some predictions vs true scores
results = pd.DataFrame({'True': y_test, 'Predicted': y_pred})
print(results.head())


Mean Squared Error (MSE): 4.90
R^2 Score: 0.65
      True  Predicted
743     65      64.66
5551    65      66.41
3442    71      70.32
6571    64      66.30
4204    66      66.88


In [5]:


categorical_cols = [
    'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities',
    'Internet_Access', 'School_Type', 'Peer_Influence', 'Gender',
    'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home',
    'Motivation_Level', 'Family_Income', 'Teacher_Quality'
]

# Data for 3 new students
new_students = [
    #rokaya
    {
        'Hours_Studied': 5,
        'Attendance': 50,
        'Parental_Involvement': 'Low',
        'Access_to_Resources': 'High',
        'Extracurricular_Activities': 'No',
        'Sleep_Hours': 7,
        'Previous_Scores': 75,
        'Motivation_Level': 'Low',
        'Internet_Access': 'Yes',
        'Tutoring_Sessions': 1,
        'Family_Income': 'Medium',
        'Teacher_Quality': 'High',
        'School_Type': 'Public',
        'Peer_Influence': 'Negative',
        'Physical_Activity': 3,
        'Learning_Disabilities': 'No',
        'Parental_Education_Level': 'College',
        'Distance_from_Home': 'Near',
        'Gender': 'Female'
    },
    {
        #mariam
        'Hours_Studied': 30,
        'Attendance': 30,
        'Parental_Involvement': 'High',
        'Access_to_Resources': 'Medium',
        'Extracurricular_Activities': 'No',
        'Sleep_Hours': 6,
        'Previous_Scores': 60,
        'Motivation_Level': 'High',
        'Internet_Access': 'Yes',
        'Tutoring_Sessions': 0,
        'Family_Income': 'High',
        'Teacher_Quality': 'Medium',
        'School_Type': 'Private',
        'Peer_Influence': 'Neutral',
        'Physical_Activity': 2,
        'Learning_Disabilities': 'No',
        'Parental_Education_Level': 'High School',
        'Distance_from_Home': 'Moderate',
        'Gender': 'Female'
    },
    {
        #nour
        'Hours_Studied': 20,
        'Attendance': 90,
        'Parental_Involvement': 'High',
        'Access_to_Resources': 'High',
        'Extracurricular_Activities': 'Yes',
        'Sleep_Hours': 8,
        'Previous_Scores': 85,
        'Motivation_Level': 'Medium',
        'Internet_Access': 'Yes',
        'Tutoring_Sessions': 2,
        'Family_Income': 'High',
        'Teacher_Quality': 'High',
        'School_Type': 'Public',
        'Peer_Influence': 'Negative',
        'Physical_Activity': 4,
        'Learning_Disabilities': 'No',
        'Parental_Education_Level': 'Postgraduate',
        'Distance_from_Home': 'Far',
        'Gender': 'Female'
    }
]

# Convert to DataFrame
input_df = pd.DataFrame(new_students)

# Encode categorical features (using LabelEncoder fit on training data)
# IMPORTANT: You should save encoders from training for production use.
# Here we refit just for demonstration combining train + new input:
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(list(df[col]) + list(input_df[col]))
    input_df[col] = le.transform(input_df[col])

# Predict exam scores for all 3
predictions = model.predict(input_df)

# Show results
for i, pred in enumerate(predictions, start=1):
    print(f"Predicted Exam Score for Person {i}: {pred:.2f}")


Predicted Exam Score for Person 1: 58.97
Predicted Exam Score for Person 2: 66.81
Predicted Exam Score for Person 3: 70.62


# Conclusion
The model achieved a Mean Squared Error (MSE) of 4.70, which means that, on average, the squared
difference between the predicted and actual exam scores is 4.7 points — a relatively low error given
the nature of the data. Additionally, the model's R² score was 0.67, indicating that the model explains
67% of the variance in students' exam scores. This suggests the model captures the main patterns in the
data and performs reasonably well, although there is still room for improvement. We then used this
trained model to predict exam scores for three new students based on their individual profiles.


In [6]:
df.describe()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
count,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0
mean,19.975329,79.977448,1.22007,1.203421,0.596035,7.02906,75.070531,1.307553,0.924474,1.493719,1.211442,1.322991,0.695929,1.190858,2.96761,0.105191,0.923717,1.508552,0.577267,67.235659
std,5.990594,11.547475,0.865634,0.871783,0.490728,1.46812,14.399784,0.782515,0.264258,1.23057,0.742264,0.91202,0.460048,0.755876,1.031231,0.306823,0.739806,0.685173,0.494031,3.890456
min,1.0,60.0,0.0,0.0,0.0,4.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0
25%,16.0,70.0,0.0,0.0,0.0,6.0,63.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,65.0
50%,20.0,80.0,2.0,2.0,1.0,7.0,75.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,0.0,1.0,2.0,1.0,67.0
75%,24.0,90.0,2.0,2.0,1.0,8.0,88.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,4.0,0.0,1.0,2.0,1.0,69.0
max,44.0,100.0,2.0,2.0,1.0,10.0,100.0,2.0,1.0,8.0,2.0,3.0,1.0,2.0,6.0,1.0,3.0,3.0,1.0,101.0


In [7]:
df_clean = df.dropna()

In [8]:

# Replace all NaN values with the mode of each column
for column in df.columns:
    if df[column].isnull().any():
        df[column].fillna(df[column].mode()[0], inplace=True)

# Optional: Check if any missing values remain
print(df.isnull().sum())

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
Exam_Score                    0
dtype: int64
