In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve,roc_auc_score

from sklearn.linear_model import LogisticRegression

# Load the full dataset from a CSV located relative to the notebook's working
# directory; later cells select their modelling columns from `complete_data`.
complete_data = pd.read_csv("StudentPerformanceFactors.csv")

In [27]:
!pip install --upgrade numpy
!pip install --upgrade pandas
!pip install --upgrade scikit-learn
!pip install --upgrade matplotlib
!pip install --upgrade nbformat

Collecting nbformat
  Using cached nbformat-5.10.4-py3-none-any.whl.metadata (3.6 kB)
Collecting fastjsonschema>=2.15 (from nbformat)
  Using cached fastjsonschema-2.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting jsonschema>=2.6 (from nbformat)
  Using cached jsonschema-4.23.0-py3-none-any.whl.metadata (7.9 kB)
Collecting attrs>=22.2.0 (from jsonschema>=2.6->nbformat)
  Using cached attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting jsonschema-specifications>=2023.03.6 (from jsonschema>=2.6->nbformat)
  Using cached jsonschema_specifications-2024.10.1-py3-none-any.whl.metadata (3.0 kB)
Collecting referencing>=0.28.4 (from jsonschema>=2.6->nbformat)
  Using cached referencing-0.35.1-py3-none-any.whl.metadata (2.8 kB)
Collecting rpds-py>=0.7.1 (from jsonschema>=2.6->nbformat)
  Using cached rpds_py-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl.metadata (4.2 kB)
Using cached nbformat-5.10.4-py3-none-any.whl (78 kB)
Using cached fastjsonschema-2.20.0-py3-none-any.whl (23 kB)
Using 

In [8]:
# Logistic-regression check-in: predict Learning_Disabilities from Exam_Score.
#
# First, reuse the approach from the previous check-in to build the working
# data frame. Only a subset of the columns is used for this regression model;
# that subset is what gets split into training, validation, and test sets.
#
# 1) "Learning_Disabilities" is a binary feature, so it is the response variable.
# 2) "Exam_Score" is the single predictor for this classification model.

# Column names are kept in variables so the choice is easy to change later.
predictor_variable = "Exam_Score"
response_variable = 'Learning_Disabilities'
main_features = [response_variable, predictor_variable]
data = complete_data[main_features]

# Split into train / validation / test using a 60:20:20 ratio.
# We chose 60:20:20 as opposed to 80:10:10 (or somewhere in between) to decrease
# the likelihood of overfitting, since the metrics used are potentially
# susceptible to overfitting.
train_and_validation_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# 0.25 of the remaining 80% is 20% of the full data set.
train_df, validation_df = train_test_split(train_and_validation_df, test_size=0.25, random_state=42)

# Persist the splits (written before encoding, so labels stay as 'No' / 'Yes').
train_df.to_csv('Student_Performance_train.csv', index=False)
validation_df.to_csv('Student_Performance_validation.csv', index=False)
test_df.to_csv('Student_Performance_test.csv', index=False)

# Give the response variable a binary encoding.
# The splits are slices of `data`, so assigning a column on them in place can
# raise SettingWithCopyWarning; `.assign` builds new frames instead, which also
# keeps this step independent of the frames returned by the split above.
label_encoding = {'No': 0, 'Yes': 1}
train_df, validation_df, test_df = (
    split.assign(**{response_variable: split[response_variable].map(label_encoding)})
    for split in (train_df, validation_df, test_df)
)

x_train = train_df[[predictor_variable]]
y_train = train_df[response_variable]

# Train the model on the training split only.
model = LogisticRegression()
lr_model = model.fit(x_train, y_train)  # fit() returns the estimator itself, so lr_model is model

# Predictions on the training data (the metrics below are training-set metrics).
y_pred = lr_model.predict(x_train)

# Confusion matrix and derived metrics.
c_m = confusion_matrix(y_train, y_pred)
print(c_m)

accuracy = accuracy_score(y_train, y_pred)
prediction_error = 1 - accuracy
true_positive_rate = recall_score(y_train, y_pred)  # TPR: recall of class 1
true_negative_rate = recall_score(y_train, y_pred, pos_label=0)  # TNR: recall of class 0

print(f"Accuracy: {accuracy}")
print(f"Prediction Error: {prediction_error}")
print(f"True Positive Rate (TPR): {true_positive_rate}")
print(f"True Negative Rate (TNR): {true_negative_rate}")

# 5-fold stratified cross-validation on the validation set.
X_val = validation_df[[predictor_variable]]
y_val = validation_df[response_variable]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# AUC and accuracy across the folds.
auc_scores = cross_val_score(model, X_val, y_val, cv=cv, scoring='roc_auc')
accuracy_scores = cross_val_score(model, X_val, y_val, cv=cv, scoring='accuracy')

print("AUC Scores for each fold:", auc_scores)
print("Average AUC:", np.mean(auc_scores))
print("Accuracy Scores for each fold:", accuracy_scores)
print("Average Accuracy:", np.mean(accuracy_scores))

# ROC curve on the validation set.
# Score the held-out validation data with the model fitted on the training set.
# Refitting `model` on the validation data and then scoring that same data would
# give an in-sample (optimistic) ROC/AUC and would also overwrite the trained
# `lr_model`, because both names refer to the same estimator object.
y_val_proba = lr_model.predict_proba(X_val)[:, 1]  # Probability of the positive class

# Calculate ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_val, y_val_proba)
auc = roc_auc_score(y_val, y_val_proba)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC = {auc:.2f})'))
# Diagonal reference line for a random classifier.
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', line=dict(dash='dash'), name='Random'))

fig.update_layout(
    title="ROC Curve on Validation Set",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    showlegend=True
)

fig.show()


[0 1]
[[3526    0]
 [ 437    0]]
Accuracy: 0.8897300025233409
Prediction Error: 0.11026999747665911
True Positive Rate (TPR): 0.0
True Negative Rate (TNR): 1.0
AUC Scores for each fold: [0.52060199 0.68205728 0.64921308 0.58327219 0.65495231]
Average AUC: 0.6180193686835939
Accuracy Scores for each fold: [0.89056604 0.89056604 0.89393939 0.89015152 0.89015152]
Average Accuracy: 0.8910748999428245
