In [23]:
# Imports 
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import hvplot.pandas

In [24]:
# Read updated csv
df2 = pd.read_csv('Resources/Student_DataType_Conversion.csv')
df2.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,3,4,1,0,59,70,78
1,1,3,0,1,1,96,93,87
2,0,3,4,0,1,57,76,77
3,1,1,4,0,1,70,70,63
4,0,3,0,1,1,83,85,86


## Math Scores

In [25]:
# Split data into features and target arrays 
y = df2["math score"].values
X = df2.drop(columns="math score").values

# Instantiate the random undersampler model and fit the data 
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y) 

# Split data into training and testing dataset 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)

# Instantiate the DecisionTreeRegressor and fit the data 
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

# Make a prediction using X_test
y_pred = dtr.predict(X_test)

# Find the R-Squared score
r2 = r2_score(y_test, y_pred)
print("MATH R-Squared score:", round(r2, 2))

MATH R-Squared score: 0.82


In [26]:
# Convert the y_test and y_pred arrays to a DataFrame
df = pd.DataFrame({"Actual Values": y_test, "Predicted Values": y_pred})

# Fit a polynomial regression model to the data
p = np.polyfit(df["Actual Values"], df["Predicted Values"], 1)
reg_line = np.polyval(p, df["Actual Values"])

# Create an hvplot scatter plot with a regression line
scatter_plot = df.hvplot.scatter(x="Actual Values", y="Predicted Values", title=f"MATH r^2 score: {r2:.2f}")
reg_line_plot = pd.DataFrame({"Actual Values": df["Actual Values"], "Predicted Values": reg_line}).hvplot.line(x="Actual Values", y="Predicted Values", color="red")
scatter_plot * reg_line_plot

In [27]:
# Create a dictionary with new values 
new_data = {
    "gender": 1,
    "race/ethnicity": 3,
    "parental level of education": 5,
    "lunch": 1,
    "test preparation course": 0,
    "reading score": [91],
    "writing score": [88]
}

# Convert the dictionary into a DataFrame
new_df = pd.DataFrame(new_data)

# Make a predicition using new_df
math_pred = dtr.predict(new_df)

# Find the predicted math scorre
print("Predicted math score:", math_pred[0])
# Actual score: 82 (less by 1)

Predicted math score: 81.0




**Math Scores Analysis**
<hr>
An r2 score of 0.82 (82%) shows the model is a good fit for the data and can be used to make accurate predictions. However, it is import to note that the remaining 18% of the variance is unexplained and may be due to factors that are not included in the model. When we provided a new set of features to the model, it predicted a math score of 81, but the actual score was 82. 

## Reading Scores

In [28]:
# Split data into features and target arrays 
y = df2["reading score"].values
X = df2.drop(columns="reading score").values

# Instantiate the random undersampler model and fit the data 
# rus = RandomUnderSampler(random_state=42)
# X_resampled, y_resampled = rus.fit_resample(X, y) 

# ROS
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split data into training and testing dataset 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)

# Instantiate the DecisionTreeRegressor and fit the data 
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

# Make a prediction using X_test
y_pred = dtr.predict(X_test)

# Find the R-Squared score
r2 = r2_score(y_test, y_pred)
print("READING R-Squared score:", round(r2, 2))

READING R-Squared score: 0.98


In [29]:
# Convert the y_test and y_pred arrays to a DataFrame
df = pd.DataFrame({"Actual Values": y_test, "Predicted Values": y_pred})

# Fit a polynomial regression model to the data
p = np.polyfit(df["Actual Values"], df["Predicted Values"], 1)
reg_line = np.polyval(p, df["Actual Values"])

# Create an hvplot scatter plot with a regression line
scatter_plot = df.hvplot.scatter(x="Actual Values", y="Predicted Values", title=f"READING r^2 score: {r2:.2f}")
reg_line_plot = pd.DataFrame({"Actual Values": df["Actual Values"], "Predicted Values": reg_line}).hvplot.line(x="Actual Values", y="Predicted Values", color="red")
scatter_plot * reg_line_plot

In [30]:
# Create a dictionary with the input values
new_data = {
    "gender": 1,
    "race/ethnicity": 3,
    "parental level of education": 5,
    "lunch": 1,
    "test preparation course": 0,
    "math score": [82],
    "writing score": [88]
}

# Convert the dictionary into a DataFrame
new_df = pd.DataFrame(new_data)

# Make a prediction using new_df
reading_pred = dtr.predict(new_df)

# Print the predicted writing score
print("Predicted reading score:", reading_pred[0])
# Actual score: 91 (less by 5) - RUS (lestt by 1) - ROS

Predicted reading score: 90.0




**Reading Scores Analysis - ROS**
<hr>
An r2 score of 0.98 (98%) shows the model is a good fit for the data and can be used to make accurate predictions. However, it is import to note that the remaining 2% of the variance is unexplained and may be due to factors that are not included in the model. When we provided a new set of features to the model, it predicted a reading score of 90, but the actual score was 91. 

## Writing Scores

In [31]:
# Split data into features and target arrays 
y = df2["writing score"].values
X = df2.drop(columns="writing score").values

# Instantiate the random undersampler model and fit the data 
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y) 

# Split data into training and testing dataset 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)

# Instantiate the DecisionTreeRegressor and fit the data 
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

# Make a prediction using X_test
y_pred = dtr.predict(X_test)

# Find the R-Squared score
r2 = r2_score(y_test, y_pred)
print("WRITING R-Squared score:", round(r2, 2))

WRITING R-Squared score: 0.94


In [32]:
# Convert the y_test and y_pred arrays to a DataFrame
df = pd.DataFrame({"Actual Values": y_test, "Predicted Values": y_pred})

# Fit a polynomial regression model to the data
p = np.polyfit(df["Actual Values"], df["Predicted Values"], 1)
reg_line = np.polyval(p, df["Actual Values"])

# Create an hvplot scatter plot with a regression line
scatter_plot = df.hvplot.scatter(x="Actual Values", y="Predicted Values", title=f"WRITING r^2 score: {r2:.2f}")
reg_line_plot = pd.DataFrame({"Actual Values": df["Actual Values"], "Predicted Values": reg_line}).hvplot.line(x="Actual Values", y="Predicted Values", color="red")
scatter_plot * reg_line_plot

In [33]:

# Create a dictionary with the input values
new_data = {
    "gender": 1,
    "race/ethnicity": 3,
    "parental level of education": 5,
    "lunch": 0,
    "test preparation course": 0,
    "math score": [82],
    "reading score": [91]
}

# Convert the dictionary into a DataFrame
new_df = pd.DataFrame(new_data)

# Use the trained model to make a prediction for the input data
writing_pred = dtr.predict(new_df)

# Print the predicted writing score
print("Predicted writing score:", writing_pred[0])
# Actual score: 88 (more by 8)

Predicted writing score: 96.0




**Writing Scores Analysis**
<hr>
An r2 score of 0.94 (94%) shows the model is a good fit for the data and can be used to make accurate predictions. However, it is import to note that the remaining 6% of the variance is unexplained and may be due to factors that are not included in the model. When we provided a new set of features to the model, it predicted a math score of 96, but the actual score was 88. 