In [45]:
# Imports 
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import hvplot.pandas

In [46]:
# Read updated csv
df2 = pd.read_csv('Resources/Student_DataType_Conversion.csv')
df2.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,3,4,1,0,59,70,78
1,1,3,0,1,1,96,93,87
2,0,3,4,0,1,57,76,77
3,1,1,4,0,1,70,70,63
4,0,3,0,1,1,83,85,86


## Math Scores

In [47]:
# Split data into features and target arrays 
y = df2["math score"].values
X = df2.drop(columns="math score").values

# Instantiate random oversampler and fit the data 
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y) 

# Split data into training and testing dataset 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)

# Instantiate DecisionTreeRegressor and fit the data 
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

# Make a prediction using X_test
y_pred = dtr.predict(X_test)

# Find the regression metrics
score = dtr.score(X_test, y_test, sample_weight=None)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
std = np.std(y_test)

# Print the regression metrics
print("MATH METRICS")
print(f"The score is {score:.2f}.")
print(f"R-Squared score:", round(r2, 2))
print(f"The mean squared error is {mse:.2f}.")
print(f"The root mean squared error is {rmse:.2f}.")
print(f"The standard deviation is {std:.2f}.")

MATH METRICS
The score is 0.97.
R-Squared score: 0.97
The mean squared error is 17.36.
The root mean squared error is 4.17.
The standard deviation is 23.38.


In [48]:
# Convert the y_test and y_pred arrays to a DataFrame
df = pd.DataFrame({"Actual Values": y_test, "Predicted Values": y_pred})

# Fit a polynomial regression model to the data
p = np.polyfit(df["Actual Values"], df["Predicted Values"], 1)
reg_line = np.polyval(p, df["Actual Values"])

# Create an hvplot scatter plot with a regression line
scatter_plot = df.hvplot.scatter(x="Actual Values", y="Predicted Values", title=f"MATH r^2 score: {r2:.2f}")
reg_line_plot = pd.DataFrame({"Actual Values": df["Actual Values"], "Predicted Values": reg_line}).hvplot.line(x="Actual Values", y="Predicted Values", color="red")
scatter_plot * reg_line_plot

In [49]:
# Create a dictionary with new values 
new_data = {
    "gender": 1,
    "race/ethnicity": 3,
    "parental level of education": 5,
    "lunch": 1,
    "test preparation course": 0,
    "reading score": [91],
    "writing score": [88]
}

# Convert the dictionary into a DataFrame
new_df = pd.DataFrame(new_data)

# Make a predicition using new_df
math_pred = dtr.predict(new_df)

# Find the predicted math scorre
print("Predicted math score:", math_pred[0])
# Actual score: 82 (more by 14)

Predicted math score: 96.0




**Math Scores Analysis**
<hr>

* The **score** of **0.97** is a good indication of the model's performance. 
* The **R-Squared** of **0.97** indicates the model is able to explain 97% of the variance in the target variable and is a very good fit.  
* The **MSE** of **17.36** indicates that the average squared difference between the predicted and actual values is relatively low. 
* The **RMSE** of **4.17** indicates the average difference between the predicted and actual values is relatively low. 
* The **STD** of **23.38** indicates the spread of the data. 
* Overall, the regression metrics shows the model is performing well in predicting math scores based on the features in the dataset. 

## Reading Scores

In [50]:
# Split data into features and target arrays 
y = df2["reading score"].values
X = df2.drop(columns="reading score").values

# Instantiate random oversampler and fit the data 
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split data into training and testing dataset 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)

# Instantiate DecisionTreeRegressor and fit the data 
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

# Make a prediction using X_test
y_pred = dtr.predict(X_test)

# Find the regression metrics
score = dtr.score(X_test, y_test, sample_weight=None)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
std = np.std(y_test)

# Print the regression metrics
print("READING METRICS")
print(f"The score is {score:.2f}.")
print(f"R-Squared score:", round(r2, 2))
print(f"The mean squared error is {mse:.2f}.")
print(f"The root mean squared error is {rmse:.2f}.")
print(f"The standard deviation is {std:.2f}.")

READING METRICS
The score is 0.98.
R-Squared score: 0.98
The mean squared error is 11.04.
The root mean squared error is 3.32.
The standard deviation is 21.17.


In [51]:
# Convert the y_test and y_pred arrays to a DataFrame
df = pd.DataFrame({"Actual Values": y_test, "Predicted Values": y_pred})

# Fit a polynomial regression model to the data
p = np.polyfit(df["Actual Values"], df["Predicted Values"], 1)
reg_line = np.polyval(p, df["Actual Values"])

# Create an hvplot scatter plot with a regression line
scatter_plot = df.hvplot.scatter(x="Actual Values", y="Predicted Values", title=f"READING r^2 score: {r2:.2f}")
reg_line_plot = pd.DataFrame({"Actual Values": df["Actual Values"], "Predicted Values": reg_line}).hvplot.line(x="Actual Values", y="Predicted Values", color="red")
scatter_plot * reg_line_plot

In [52]:
# Create a dictionary with the input values
new_data = {
    "gender": 1,
    "race/ethnicity": 3,
    "parental level of education": 5,
    "lunch": 1,
    "test preparation course": 0,
    "math score": [82],
    "writing score": [88]
}

# Convert the dictionary into a DataFrame
new_df = pd.DataFrame(new_data)

# Make a prediction using new_df
reading_pred = dtr.predict(new_df)

# Print the predicted writing score
print("Predicted reading score:", reading_pred[0])
# Actual score: 91 (less by 1))

Predicted reading score: 90.0




**Reading Scores Analysis**
<hr>

* The **score** of **0.98** is a good indication of the model's performance. 
* The **R-Squared** of **0.98** indicates the model is able to explain 98% of the variance in the target variable and is a very good fit.  
* The **MSE** of **11.04** indicates that the average squared difference between the predicted and actual values is relatively low. 
* The **RMSE** of **3.32** indicates the average difference between the predicted and actual values is relatively low. 
* The **STD** of **21.17** indicates the spread of the data. 
* Overall, the regression metrics shows the model is performing well in predicting reading scores based on the features in the dataset. 

## Writing Scores

In [53]:
# Split data into features and target arrays 
y = df2["writing score"].values
X = df2.drop(columns="writing score").values

# Instantiate random oversampler and fit the data 
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split data into training and testing dataset 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)

# Instantiate DecisionTreeRegressor and fit the data 
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

# Make a prediction using X_test
y_pred = dtr.predict(X_test)

# Find the regression metrics
score = dtr.score(X_test, y_test, sample_weight=None)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
std = np.std(y_test)

# Print the regression metrics
print("WRITING METRICS")
print(f"The score is {score:.2f}.")
print(f"R-Squared score:", round(r2, 2))
print(f"The mean squared error is {mse:.2f}.")
print(f"The root mean squared error is {rmse:.2f}.")
print(f"The standard deviation is {std:.2f}.")

WRITING METRICS
The score is 0.98.
R-Squared score: 0.98
The mean squared error is 7.98.
The root mean squared error is 2.82.
The standard deviation is 21.98.


In [54]:
# Convert the y_test and y_pred arrays to a DataFrame
df = pd.DataFrame({"Actual Values": y_test, "Predicted Values": y_pred})

# Fit a polynomial regression model to the data
p = np.polyfit(df["Actual Values"], df["Predicted Values"], 1)
reg_line = np.polyval(p, df["Actual Values"])

# Create an hvplot scatter plot with a regression line
scatter_plot = df.hvplot.scatter(x="Actual Values", y="Predicted Values", title=f"WRITING r^2 score: {r2:.2f}")
reg_line_plot = pd.DataFrame({"Actual Values": df["Actual Values"], "Predicted Values": reg_line}).hvplot.line(x="Actual Values", y="Predicted Values", color="red")
scatter_plot * reg_line_plot

In [55]:
# Create a dictionary with the input values
new_data = {
    "gender": 1,
    "race/ethnicity": 3,
    "parental level of education": 5,
    "lunch": 0,
    "test preparation course": 0,
    "math score": [82],
    "reading score": [91]
}

# Convert the dictionary into a DataFrame
new_df = pd.DataFrame(new_data)

# Use the trained model to make a prediction for the input data
writing_pred = dtr.predict(new_df)

# Print the predicted writing score
print("Predicted writing score:", writing_pred[0])
# Actual score: 88 (more by 10)

Predicted writing score: 98.0




**Writing Scores Analysis**
<hr>

* The **score** of **0.98** is a good indication of the model's performance. 
* The **R-Squared** of **0.98** indicates the model is able to explain 98% of the variance in the target variable and is a very good fit.  
* The **MSE** of **7.98** indicates that the average squared difference between the predicted and actual values is relatively low. 
* The **RMSE** of **2.82** indicates the average difference between the predicted and actual values is relatively low. 
* The **STD** of **21.98** indicates the spread of the data. 
* Overall, the regression metrics shows the model is performing well in predicting writing scores based on the features in the dataset. 