In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import root_mean_squared_error

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
# Path of the dataset CSV; it is handed to pd.read_csv when the data is loaded.
DATA = "StudentPerformanceFactors.csv"

# ~~~~~~~~~~~~~~~~
# DATA EXPLORATION |
# ~~~~~~~~~~~~~~~~

In [4]:
# Load the raw dataset from the CSV path stored in DATA.
df = pd.read_csv(DATA)

In [5]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

In [6]:
# Summary statistics of the frame (describe() reports numeric columns by default).
df.describe()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Dealing with Entries with Missing Values |
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [7]:
# Number of null entries found in each column.
missing_values_count = df.isna().sum()
print(missing_values_count)

In [8]:
# Null entries per column, expressed as a percentage of the total row count.
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
print(missing_percentage)

In [9]:
# Every column is missing in fewer than 5% of rows, and the gaps are assumed to be
# MCAR (missing completely at random), so rows with any null value are discarded
# to obtain a complete dataset.
df = df.dropna(axis="index", how="any")

In [10]:
# Re-count the null entries per column after the rows with missing values were dropped.
missing_values_count = df.isna().sum()
print(missing_values_count)

# ~~~~~~~~~~~~~~~~
# DATA VISUALIZATION|
# ~~~~~~~~~~~~~~~~

In [11]:
# Pull out the two columns whose relationship is plotted: hours studied and exam score.
studied_hours, test_scores = df['Hours_Studied'], df['Exam_Score']


In [12]:
# Scatter plot of exam score (x-axis) against hours studied (y-axis),
# with a fitted regression line drawn as a dashed red trend.
point_style = {"color": "darkblue", "alpha": 0.7}
trend_style = {"color": "red", "linestyle": "--"}

sns.regplot(data=df, x=test_scores, y=studied_hours,
            scatter_kws=point_style, line_kws=trend_style)

plt.title("Score to Study Hours Plot")
plt.xlabel("Test Scores")
plt.ylabel("Hours Studied")
plt.show()


# ~~~~~~~~~~~~~~~~~~~~
# Prepping Data for Modeling | 
# ~~~~~~~~~~~~~~~~~~~~

In [13]:
# Feature matrix (kept two-dimensional by selecting with a list) and target series.
X = df.loc[:, ['Hours_Studied']]
y = df.loc[:, 'Exam_Score']


In [14]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of every partition.
for label, part in (("X_train", X_train), ("X_test", X_test),
                    ("y_train", y_train), ("y_test", y_test)):
    print(f"Shape of {label}:", part.shape)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Training the linear Model    | via Linear regression
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Study Hours + Exam Scores

In [15]:
# Fit an ordinary least-squares regression on the training split.
linear_model = LinearRegression()
linear_model.fit(X=X_train, y=y_train)

# ~~~~~~~~~~~~~~~~~~~~
# Evaluating the linear Model    |
# ~~~~~~~~~~~~~~~~~~~~

In [16]:
# Predict exam scores for the held-out test features with the fitted linear model.
predictions = linear_model.predict(X_test)

In [17]:
# Mean squared error between the true test targets and the model's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error on Test Data: {}".format(mse))

In [18]:
# RMSE is the square root of the MSE; it is reported rounded to two decimals.
rmse = np.sqrt(mse)
print("RMSE: {:.2f}".format(rmse))

In [19]:
# Test-set predictions from the single-feature linear model.
y_pred = linear_model.predict(X_test)

# RMSE computed directly with scikit-learn's helper.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error: {}".format(rmse))

# Coefficient of determination (R^2) on the test split.
r_squared = linear_model.score(X_test, y_test)
print("R-squared Score: {}".format(r_squared))

In [20]:
# Predicted vs. actual exam scores for the single-feature linear model.
# regplot draws `predictions` on the x-axis and the true `y_test` values on the
# y-axis, so the axis labels below follow that same order (they were swapped before).
sns.regplot(x=predictions, y=y_test, data=df,
            scatter_kws={"color": "darkblue", "alpha": 0.7},
            line_kws={"color": "red", "linestyle": "--"})

plt.title("Results Visualization")
plt.xlabel("Predicted Test Scores")
plt.ylabel("Actual Test Scores")

plt.grid(True)
plt.show()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Training the linear Model    | via Polynomial regression
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Study Hours + Exam Scores

In [21]:
# Look at the first rows of the cleaned frame again (head() defaults to 5 rows).
df.head()

Preparing the dataset

In [22]:
# X, y and the train/test split from the linear experiment are reused unchanged.
# Expand the features to degree 2 (adds the squared term); the bias column is left
# out because LinearRegression fits its own intercept.
poly_features = PolynomialFeatures(include_bias=False, degree=2)

# Learn the expansion on the training split only, then apply it to both splits.
X_poly_train = poly_features.fit(X_train).transform(X_train)
X_poly_test = poly_features.transform(X_test)

# Training Model

In [23]:
# Linear regression fitted on the degree-2 expanded training features.
poly_model = LinearRegression()
poly_model.fit(X=X_poly_train, y=y_train)

# ~~~~~~~~~~~~~~~~~~~~~~~~
# Evaluating the Polynomial Model    |
# ~~~~~~~~~~~~~~~~~~~~~~~~

In [24]:
# Predict on the polynomial-expanded test features.
p_predictions = poly_model.predict(X_poly_test)

# Root mean squared error of those predictions against the true test targets.
p_rmse = root_mean_squared_error(y_test, p_predictions)

# Coefficient of determination (R^2) on the test split.
p_r_square = poly_model.score(X_poly_test, y_test)

# The second metric is R-squared (the value returned by .score), so it is labelled as such.
print(f"Root Mean Squared Error on Test Data is: {p_rmse}")
print(f"R-squared Score for Polynomial Reg Model is: {p_r_square:.2f}")


In [25]:
# Predicted (x) vs. actual (y) exam scores for the polynomial model, with a
# second-order trend curve and no confidence band.
sns.regplot(
    data=df, x=p_predictions, y=y_test,
    order=2, ci=None,
    color='green',                  # base colour (used for the fitted curve)
    scatter_kws={'color': 'blue'},  # override colour for the points
)

plt.xlabel("Predicted")
plt.ylabel("Exam Score")
plt.title("Polynomial Regression")
plt.show()

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Conclusion
## Linear Regression vs Polynomial Regression

As seen in the code above, the test-set metrics for the Linear Model are given below:

- Root Mean Squared Error: 3.5145033503249836
- R-squared Score: 0.20513060832466712

and, for the Polynomial Model is:

- Root Mean Squared Error on Test Data is:3.5146186440091056
- R-squared Score for Polynomial Reg Model is: 0.21

Thus, the two models give nearly identical results, with hardly any difference between them, given that the only feature used is Study Hours and the target is the Exam Score.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Finding the Best Combination of Features for most accurate results
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~~~~~~~~~~~~~~~~~~~~
# Prepping Data for Modeling |
# ~~~~~~~~~~~~~~~~~~~~~

In [26]:
# List the available column names before choosing a wider feature set.
df.columns

In [27]:
# Feature matrix built from five columns of df, plus the exam-score target.
X = df[[
    'Hours_Studied',
    'Attendance',
    'Sleep_Hours',
    'Physical_Activity',
    'Tutoring_Sessions',
]]
y = df['Exam_Score']


In [28]:
# Same 80/20 split with the same seed, now applied to the wider feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of every partition.
partitions = {"X_train": X_train, "X_test": X_test,
              "y_train": y_train, "y_test": y_test}
for label, part in partitions.items():
    print(f"Shape of {label}:", part.shape)

# ~~~~~~~~~~~~~~~~~~~~~
# Visualizing Data for Modeling |
# ~~~~~~~~~~~~~~~~~~~~~

In [29]:
# Restrict the frame to the selected features plus the target, then draw
# every pairwise relationship with seaborn's pairplot.
plot_columns = ['Hours_Studied', 'Attendance', 'Sleep_Hours',
                'Physical_Activity', 'Tutoring_Sessions', 'Exam_Score']
data_to_plot = df[plot_columns]

sns.pairplot(data_to_plot)
plt.show()

In [30]:
# Pairwise correlations between the selected features and the target.
heatmap_columns = ['Hours_Studied', 'Attendance', 'Sleep_Hours',
                   'Physical_Activity', 'Tutoring_Sessions', 'Exam_Score']
correlation_matrix = df[heatmap_columns].corr()

# Annotated heatmap of the correlation matrix (values shown with two decimals).
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Heatmap of Features and Target')
plt.show()


# ~~~~~~~~~~~~~~~~~
# Training Linear Model |
# ~~~~~~~~~~~~~~~~~

In [31]:
# Ordinary least-squares regression fitted on the five-feature training split.
linear_model2 = LinearRegression()
linear_model2.fit(X=X_train, y=y_train)

# ``````````````````````
# Evaluating the Model
# ``````````````````````

In [32]:
# Predict exam scores for the held-out test features with the second linear model.
predictions2 = linear_model2.predict(X_test)

In [33]:
# Test-set predictions from the second linear model.
y_pred = linear_model2.predict(X_test)

# Root mean squared error against the true test targets.
rmse2 = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error: {}".format(rmse2))

# Coefficient of determination (R^2), printed with two decimals.
r_squared = linear_model2.score(X_test, y_test)
print("R-squared Score: {:.2f}".format(r_squared))

# ``````````````````````
# Visualizing the result
# ``````````````````````

In [34]:
# Predicted (x) vs. actual (y) exam scores for the second linear model,
# with a dashed red regression line through the points.
point_style = {"color": "blue", "alpha": 0.7}
trend_style = {"color": "red", "linestyle": "--"}

sns.regplot(data=df, x=predictions2, y=y_test,
            scatter_kws=point_style, line_kws=trend_style)

plt.title("Results Visualization")
plt.xlabel("Predicted Test Scores")
plt.ylabel("Actual Test Scores")
plt.grid(True)
plt.show()

# ````````````````````
# Transforming Data for  Poly Model |
# ````````````````````

In [35]:
# X, y and the train/test split defined for the five-feature experiment are reused.
# Degree-2 expansion; the bias column is omitted because LinearRegression fits
# its own intercept.
poly_features2 = PolynomialFeatures(degree=2, include_bias=False)

# Fit and apply the NEW transformer created above. Previously the earlier
# `poly_features` object was refitted here, which left `poly_features2` unused
# and silently changed the state of the first experiment's transformer.
# Note: X_poly_train / X_poly_test are reassigned and now hold this experiment's features.
X_poly_train = poly_features2.fit_transform(X_train)
X_poly_test = poly_features2.transform(X_test)

# ``````````````````````
# Training Polynomial |
# ``````````````````````

In [36]:
# Linear regression fitted on the polynomial-expanded training features.
poly_model2 = LinearRegression()
poly_model2.fit(X=X_poly_train, y=y_train)

# ``````````````````````
# Evaluating Model|
# ``````````````````````

In [37]:
# Predict on the polynomial-expanded test features.
p_predictions2 = poly_model2.predict(X_poly_test)

# Root mean squared error of those predictions against the true test targets.
p_rmse2 = root_mean_squared_error(y_test, p_predictions2)

# Coefficient of determination (R^2) on the test split.
p_r_square2 = poly_model2.score(X_poly_test, y_test)

# The second metric is R-squared (the value returned by .score), so it is labelled as such.
print(f"Root Mean Squared Error on Test Data is: {p_rmse2}")
print(f"R-squared Score for Polynomial Reg Model is: {p_r_square2:.2f}")


In [38]:
# Predicted (x) vs. actual (y) exam scores for the second polynomial model,
# with a second-order trend curve drawn as a dashed red line.
point_style = {'color': 'blue'}
trend_style = {"color": "red", "linestyle": "--"}

sns.regplot(data=df, x=p_predictions2, y=y_test, order=2,
            color='green', scatter_kws=point_style, line_kws=trend_style)

plt.xlabel("Predicted")
plt.ylabel("Exam Score")
plt.title("Polynomial Regression")

plt.grid(True)
plt.show()

# ````````````````````````````````````````````````````````````````````````````````````
# Note
## Additional Features

As seen in the code above, the models trained on more features are significantly more accurate than the ones trained only on study hours, giving results of roughly:

- Root Mean Squared Error: 2.5
- R-squared Score: 0.58

Now adding Features with string values by encoding them.
# ````````````````````````````````````````````````````````````````````````````````````

# `````````````````````````
#  Train Test Split 
# `````````````````````````

In [39]:
# Check the dtype of a numeric feature before mixing in text-valued columns.
df['Hours_Studied'].dtype

In [40]:
# Feature matrix with both numeric and text-valued columns, plus the target.
X = df[[
    'Hours_Studied',
    'Attendance',
    'Parental_Involvement',
    'Access_to_Resources',
    'Sleep_Hours',
    'School_Type',
    'Tutoring_Sessions',
    'Internet_Access',
    'Family_Income',
    'Physical_Activity',
]]
y = df['Exam_Score']

# Text-valued columns in this selection: Parental_Involvement, Access_to_Resources,
# School_Type, Internet_Access, Family_Income. The encoding step treats
# Parental_Involvement, Access_to_Resources and Family_Income as ordinal and the
# remaining text columns as nominal.

In [41]:
# Same 80/20 split and seed, applied to the mixed numeric/categorical feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Partition shapes, printed for reference.
for label, part in zip(("X_train", "X_test", "y_train", "y_test"),
                       (X_train, X_test, y_train, y_test)):
    print(f"Shape of {label}:", part.shape)

# ```````````````````````````````````
#  Feature Sorting for proper Encoding
# ```````````````````````````````````

In [42]:
# Group the feature columns by the encoding each group needs.
object_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(include=np.number).columns)

# Text columns with an inherent order; every other text column is treated as nominal.
ordinal_cols = ['Parental_Involvement', 'Access_to_Resources', 'Family_Income']
nominal_cols = list(filter(lambda name: name not in ordinal_cols, object_cols))

In [43]:
# Show which columns ended up in each encoding group (blank line after each entry).
for group_name, group_cols in (("Numerical", numerical_cols),
                               ("Ordinal", ordinal_cols),
                               ("Nominal", nominal_cols)):
    print(f"{group_name} Columns: {group_cols}\n")

# ````````````````````
#  Encoding Method
# ````````````````````

In [44]:
# Explicit low-to-high order for the ordinal columns. Without `categories`,
# OrdinalEncoder sorts the labels alphabetically (High=0, Low=1, Medium=2), which
# scrambles the ranking that a linear model is supposed to exploit.
# NOTE(review): assumes every ordinal column contains only these three labels —
# confirm with df[ordinal_cols] value counts; fitting raises on any other value.
ORDINAL_LEVELS = ['Low', 'Medium', 'High']

# Column-wise preprocessing:
#   - ordinal text columns   -> integer codes 0..2 following ORDINAL_LEVELS
#   - nominal text columns   -> one-hot indicators (unseen labels encode as all zeros)
#   - numerical columns      -> passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=[ORDINAL_LEVELS] * len(ordinal_cols)), ordinal_cols),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), nominal_cols),
        ('numerical', 'passthrough', numerical_cols),
    ],
    remainder='drop',  # any column not listed in a transformer above is discarded
)

In [45]:
# Learn the encoders on the training split only, then encode both splits with them.
X_train_encoded = preprocessor.fit(X_train).transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

In [46]:
# Shape of the encoded training matrix (rows, encoded feature columns).
X_train_encoded.shape

# `````````````````````````
#  Training Linear Model
# `````````````````````````

In [47]:
# Ordinary least-squares regression fitted on the encoded training features.
linear_model_max = LinearRegression()
linear_model_max.fit(X=X_train_encoded, y=y_train)

# ``````````````````````
#  Evaluating Linear Model
# ``````````````````````

In [48]:
# Predict exam scores for the encoded test features (reassigns `predictions`).
predictions = linear_model_max.predict(X_test_encoded)

In [49]:
# Mean squared error between the true test targets and the model's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error on Test Data: {}".format(mse))

In [50]:
# Test-set predictions from the linear model trained on the encoded features.
y_pred = linear_model_max.predict(X_test_encoded)

# Root mean squared error against the true test targets.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Root Mean Squared Error: {}".format(rmse))

# Coefficient of determination (R^2), printed with two decimals.
r_squared = linear_model_max.score(X_test_encoded, y_test)
print("R-squared Score: {:.2f}".format(r_squared))

# `````````````````````````
#  Training Polynomial Model
# `````````````````````````

In [51]:
# Degree-2 expansion of the encoded features; no bias column is generated.
poly_features_max = PolynomialFeatures(include_bias=False, degree=2)

# Learn the expansion on the training split only, then apply it to both splits.
X_poly_train_max = poly_features_max.fit(X_train_encoded).transform(X_train_encoded)
X_poly_test_max = poly_features_max.transform(X_test_encoded)

In [52]:
# Linear regression fitted on the polynomial expansion of the encoded training features.
poly_model_max = LinearRegression()
poly_model_max.fit(X=X_poly_train_max, y=y_train)

In [53]:
# Predict on the polynomial-expanded, encoded test features.
p_predictions_max = poly_model_max.predict(X_poly_test_max)

# `mser` holds the root mean squared error; it was previously printed under an
# "R-Squared" label, which misreported the metric.
mser = root_mean_squared_error(y_test, p_predictions_max)
print(f"Root Mean Squared Error on test data is: {mser}")

# .score() on a LinearRegression returns the coefficient of determination (R^2).
print(f"R-squared Score on the test data is: {poly_model_max.score(X_poly_test_max, y_test):.2f}")

# ````````````````````````````````````````````````````````````````````````````````````
# Final Conclusion

As seen in the code above, adding the categorical (object) features raised the accuracy by a noticeable amount, and with Polynomial Regression we obtain relatively better results.
# ````````````````````````````````````````````````````````````````````````````````````

# ``````````````````````````````
#  Testing the Model on new data
# ``````````````````````````````

In [54]:
# List the feature columns of X; the new-student frame below uses the same column names.
X.columns

In [55]:
# One hypothetical student, described with the same columns (and order) as X,
# built as a single-row DataFrame from one record.
new_student_record = {
    'Hours_Studied': 12,
    'Attendance': 95,
    'Parental_Involvement': 'High',
    'Access_to_Resources': 'High',
    'Sleep_Hours': 6,
    'School_Type': 'Public',
    'Tutoring_Sessions': 1,
    'Internet_Access': "Yes",
    'Family_Income': "Medium",
    'Physical_Activity': 3,
}
new_student = pd.DataFrame([new_student_record])


In [56]:
# Encode the new row with the already-fitted preprocessor (transform only, no refit).
new_student_encoded = preprocessor.transform(new_student)

In [57]:
# Apply the already-fitted degree-2 expansion to the encoded row.
new_student_final = poly_features_max.transform(new_student_encoded)

In [58]:
# Last stage of inference: feed the encoded and polynomial-expanded row to the
# fitted polynomial model. predict() returns an array with one value per input row.
predicted_score = poly_model_max.predict(new_student_final)

In [59]:
# Report the single predicted score, rounded to one decimal place.
print("Marks of the Student are projected to be around: {:.1f}".format(predicted_score[0]))

In [60]:
# NOTE(review): looks like leftover debugging output unrelated to the analysis — consider removing this cell.
print("Hello is this")

In [61]:
# Disabled persistence step: would save the fitted preprocessor, the polynomial
# transformer and the trained model to .joblib files.
# NOTE(review): `joblib` is not imported in the visible import cells; add
# `import joblib` before re-enabling these lines.
#
# joblib.dump(preprocessor, 'preprocessor.joblib')
# 
# # Save the polynomial features transformer
# joblib.dump(poly_features_max, 'poly_features_max.joblib')
# 
# # Save the trained linear model
# joblib.dump(poly_model_max, 'poly_model_max.joblib')
# 
# print("All model components have been saved successfully.")

In [63]:
# Display the raw (un-encoded) training features from the latest split.
X_train

In [64]:
# Display the training features after the preprocessor's encoding step.
X_train_encoded

In [65]:
# NOTE(review): X_poly_train was last assigned in the five-feature polynomial
# experiment; the expansion of the encoded features is X_poly_train_max —
# confirm which matrix is meant to be inspected here.
X_poly_train