In [21]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

# Location of the raw dataset. This is an absolute path (presumably the Colab
# working directory — adjust when running elsewhere).
DATA_PATH = "/content/StudentPerformanceFactors.csv"
complete_data = pd.read_csv(DATA_PATH)

# Project Check-in 2

**Non-Discrete Numeric Response Variable**
  - Exam_Score

**Predictor variables** (Chosen from EDA in check-in 1)
  - Hours_Studied
  - Attendance




In [16]:
# Rebuild the cleaned data frame using the cleaning steps from the previous check-in.
# The regression models below only use a subset of these columns, but all of them
# are kept so that the training, validation, and test sets share the same schema.
main_features = [
    'Hours_Studied',
    'Attendance',
    'Previous_Scores',
    'Parental_Involvement',
    'Learning_Disabilities',
    'Access_to_Resources',
    'Distance_from_Home',
    'Tutoring_Sessions',
    'Exam_Score',
]
data = complete_data[main_features].dropna(subset=['Distance_from_Home'])

# Divide the cleaned frame into three sets using a 60:20:20 split.
# We chose 60:20:20 over 80:10:10 (or somewhere in between) to decrease the
# likelihood of overfitting, since the metrics used are potentially susceptible to it.
# Step 1 holds out 20% of the rows as the test set.
train_and_validation_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Step 2 takes 25% of the remaining 80% (i.e. 20% of all rows) as the validation set,
# leaving 60% for training.
train_df, validation_df = train_test_split(train_and_validation_df, test_size=0.25, random_state=42)

# Write each split to its own CSV, without the index column.
split_outputs = {
    'Student_Performance_train.csv': train_df,
    'Student_Performance_validation.csv': validation_df,
    'Student_Performance_test.csv': test_df,
}
for csv_name, split_df in split_outputs.items():
    split_df.to_csv(csv_name, index=False)


# Regression Modeling & Evaluation

In [37]:
# Split the training and validation sets into predictor (X) and response (y) frames.
x_train = train_df[["Hours_Studied", "Attendance"]]
y_train = train_df["Exam_Score"]

x_validation = validation_df[["Hours_Studied", "Attendance"]]
y_validation = validation_df["Exam_Score"]

# Fit one simple least-squares regression per predictor.
# The double brackets select a one-column DataFrame, so X stays two-dimensional.
ls_fit_hours = LinearRegression()
ls_fit_hours.fit(x_train[["Hours_Studied"]], y_train)

ls_fit_attendance = LinearRegression()
ls_fit_attendance.fit(x_train[["Attendance"]], y_train)

# Build a scatter of Hours_Studied vs Exam_Score with the least-squares line on top.
# The scatter uses every row of `data`; the line is evaluated at the training x-values.
# NOTE: display is disabled (fig.show() is commented out) and `fig` is reassigned
# below, so this figure is discarded unless fig.show() is re-enabled.
fig = px.scatter(data, x='Hours_Studied', y='Exam_Score', title='Hours Studied vs Exam Score')
fig.add_trace(go.Scatter(x=x_train['Hours_Studied'], y=ls_fit_hours.intercept_ + train_df['Hours_Studied'] * ls_fit_hours.coef_[0], mode='lines', name='Least Squares Fit'))
# fig.show()

# Same overlay for Attendance vs Exam_Score (also not displayed unless fig.show() is re-enabled).
fig = px.scatter(data, x='Attendance', y='Exam_Score', title='Attendance vs Exam Score')
fig.add_trace(go.Scatter(x=x_train['Attendance'], y=ls_fit_attendance.intercept_ + train_df['Attendance'] * ls_fit_attendance.coef_[0], mode='lines', name='Least Squares Fit'))
# fig.show()

# Multiple linear regression (least squares) on both predictors together.
mls_fit = LinearRegression()
mls_fit.fit(x_train, y_train)

# Predict on the training set once per model; every metric below reuses these
# arrays instead of calling predict() again for each metric.
hours_train_pred = ls_fit_hours.predict(x_train[["Hours_Studied"]])
attendance_train_pred = ls_fit_attendance.predict(x_train[["Attendance"]])
mls_train_pred = mls_fit.predict(x_train)

# Training-set metrics for all 3 models: mean squared error ...
hours_train_mse = mean_squared_error(y_train, hours_train_pred)
attendance_train_mse = mean_squared_error(y_train, attendance_train_pred)
mls_train_mse = mean_squared_error(y_train, mls_train_pred)

# ... coefficient of determination ...
hours_train_r2 = r2_score(y_train, hours_train_pred)
attendance_train_r2 = r2_score(y_train, attendance_train_pred)
mls_train_r2 = r2_score(y_train, mls_train_pred)

# ... and the correlation between observed and predicted scores
# (the off-diagonal entry of the 2x2 correlation matrix).
hours_train_correlation = np.corrcoef(y_train, hours_train_pred)[0, 1]
attendance_train_correlation = np.corrcoef(y_train, attendance_train_pred)[0, 1]
mls_train_correlation = np.corrcoef(y_train, mls_train_pred)[0, 1]

print(f"Hours MSE: {hours_train_mse}")
print(f"Attendance MSE: {attendance_train_mse}")
print(f"Multiple Linear MSE: {mls_train_mse}\n")
print(f"Hours R2: {hours_train_r2}")
print(f"Attendance R2: {attendance_train_r2}")
print(f"Multiple Linear R2: {mls_train_r2}\n")
print(f"Hours Correlation: {hours_train_correlation}")
print(f"Attendance Correlation: {attendance_train_correlation}")
print(f"Multiple Linear Correlation: {mls_train_correlation}")



Hours MSE: 12.489959637339545
Attendance MSE: 10.1863752702403
Multiple Linear MSE: 7.130350492920021

Hours R2: 0.1892357052200484
Attendance R2: 0.3387689310339925
Multiple Linear R2: 0.5371455347506965

Hours Correlation: 0.4350123046766012
Attendance Correlation: 0.5820385992646815
Multiple Linear Correlation: 0.7329021317684211


In [38]:
# Evaluate the models fitted on the training set against the validation set.
# Each model predicts on the validation predictors once; all three metrics reuse
# that prediction instead of calling predict() again for each metric.
ls_fit_hours_val_pred = ls_fit_hours.predict(x_validation[["Hours_Studied"]])
ls_fit_attendance_val_pred = ls_fit_attendance.predict(x_validation[["Attendance"]])
mls_fit_val_pred = mls_fit.predict(x_validation)

# Mean squared error on the validation set.
ls_fit_hours_val_mse = mean_squared_error(y_validation, ls_fit_hours_val_pred)
ls_fit_attendance_val_mse = mean_squared_error(y_validation, ls_fit_attendance_val_pred)
mls_fit_val_mse = mean_squared_error(y_validation, mls_fit_val_pred)

# Coefficient of determination on the validation set.
ls_fit_hours_val_r2 = r2_score(y_validation, ls_fit_hours_val_pred)
ls_fit_attendance_val_r2 = r2_score(y_validation, ls_fit_attendance_val_pred)
mls_fit_val_r2 = r2_score(y_validation, mls_fit_val_pred)

# Correlation between observed and predicted scores
# (the off-diagonal entry of the 2x2 correlation matrix).
ls_fit_hours_val_correlation = np.corrcoef(y_validation, ls_fit_hours_val_pred)[0, 1]
ls_fit_attendance_val_correlation = np.corrcoef(y_validation, ls_fit_attendance_val_pred)[0, 1]
mls_fit_val_correlation = np.corrcoef(y_validation, mls_fit_val_pred)[0, 1]

print(f"Hours MSE: {ls_fit_hours_val_mse}")
print(f"Attendance MSE: {ls_fit_attendance_val_mse}")
print(f"Multiple Linear MSE: {mls_fit_val_mse}\n")
print(f"Hours R2: {ls_fit_hours_val_r2}")
print(f"Attendance R2: {ls_fit_attendance_val_r2}")
print(f"Multiple Linear R2: {mls_fit_val_r2}\n")
print(f"Hours Correlation: {ls_fit_hours_val_correlation}")
print(f"Attendance Correlation: {ls_fit_attendance_val_correlation}")
print(f"Multiple Linear Correlation: {mls_fit_val_correlation}")

Hours MSE: 11.259793379444265
Attendance MSE: 9.481154435783322
Multiple Linear MSE: 6.587955501190167

Hours R2: 0.21112573206309848
Attendance R2: 0.33573925269537863
Multiple Linear R2: 0.5384401473396503

Hours Correlation: 0.4596558521221392
Attendance Correlation: 0.580086543878091
Multiple Linear Correlation: 0.7354373741282756
