In [59]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# **Pre-Processing**

### **1. Load and Read Data**

In [60]:
# Read the raw student-habits dataset (path is relative to the notebook's directory).
DATA_PATH = '../data/student_habits_performance.csv'
df = pd.read_csv(DATA_PATH)

### **2. Inspect Data**

In [61]:
# Summarise column names, non-null counts and dtypes to spot missing values
# and columns that are not yet numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    object 
 12  internet_quality               1000 non-null   ob

##### **The output shows that every column except *parental_education_level* has no null values.**
##### **The *student_id* column is unnecessary for our goal of predicting exam scores based on student habits.**
##### **The *gender*, *part_time_job*, *diet_quality*, *parental_education_level*, *internet_quality*, and *extracurricular_participation* columns are all non-numeric and must be converted to numbers before scaling and training.**

### **3. Fill Null Values**

##### **Fill the null-values in the parental_education_level column.**

In [62]:
# Impute missing parental education levels with the most frequent category.
# NOTE(review): pandas' read_csv parses the literal string "None" as missing
# by default; if the raw file uses "None" as a real education level, these
# nulls are a genuine category rather than missing data — confirm against the CSV.
education_column = 'parental_education_level'
education_levels = df[education_column]
filler = education_levels.mode()[0]  # mode() returns a Series; take its first entry
df[education_column] = education_levels.fillna(filler)

# Re-check the non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       1000 non-null   object 
 12  internet_quality               1000 non-null   ob

### **4. Drop Unnecessary Columns**

##### **Drop the *student_id* column because an identifier carries no information about student habits.**

In [63]:
# Remove the identifier column; it is not a student habit and should not be a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
df = df.drop(columns=['student_id'], errors='ignore')

# Confirm the column is gone and the remaining dtypes are unchanged.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1000 non-null   int64  
 1   gender                         1000 non-null   object 
 2   study_hours_per_day            1000 non-null   float64
 3   social_media_hours             1000 non-null   float64
 4   netflix_hours                  1000 non-null   float64
 5   part_time_job                  1000 non-null   object 
 6   attendance_percentage          1000 non-null   float64
 7   sleep_hours                    1000 non-null   float64
 8   diet_quality                   1000 non-null   object 
 9   exercise_frequency             1000 non-null   int64  
 10  parental_education_level       1000 non-null   object 
 11  internet_quality               1000 non-null   object 
 12  mental_health_rating           1000 non-null   in

### **5. Convert String Values**

##### **Inspect the columns with string values**

In [64]:
# Show the distinct labels (and their frequencies) of every string-valued column
# so that an integer code can be chosen for each label.
string_columns = [
    'gender',
    'part_time_job',
    'diet_quality',
    'parental_education_level',
    'internet_quality',
    'extracurricular_participation',
]

*leading_columns, final_column = string_columns
for column_name in leading_columns:
    # The trailing '\n' argument leaves a blank line between the reports.
    print(df[column_name].value_counts(), '\n')
print(df[final_column].value_counts())

gender
Female    481
Male      477
Other      42
Name: count, dtype: int64 

part_time_job
No     785
Yes    215
Name: count, dtype: int64 

diet_quality
Fair    437
Good    378
Poor    185
Name: count, dtype: int64 

parental_education_level
High School    483
Bachelor       350
Master         167
Name: count, dtype: int64 

internet_quality
Good       447
Average    391
Poor       162
Name: count, dtype: int64 

extracurricular_participation
No     682
Yes    318
Name: count, dtype: int64


##### **Convert the string values into integers**

In [65]:
# Opt in to pandas' future behaviour so that `.replace` does not silently
# downcast object columns; every column is cast to int explicitly below.
pd.set_option('future.no_silent_downcasting', True)

# String label -> integer code for each categorical column.
# NOTE(review): `gender` has no natural order, so the 0/1/2 codes impose an
# arbitrary ranking on the linear models — consider one-hot encoding; confirm.
category_codes = {
    'gender': {'Male': 0, 'Female': 1, 'Other': 2},
    'part_time_job': {'No': 0, 'Yes': 1},
    'diet_quality': {'Poor': 0, 'Fair': 1, 'Good': 2},
    'parental_education_level': {'High School': 0, 'Bachelor': 1, 'Master': 2},
    'internet_quality': {'Poor': 0, 'Average': 1, 'Good': 2},
    'extracurricular_participation': {'No': 0, 'Yes': 1},
}

for column, codes in category_codes.items():
    # Fix: `internet_quality` was the only column missing the `.astype(int)`
    # cast. With silent downcasting disabled it stayed an object column, which
    # turned the whole feature matrix into an object array. Driving every
    # column through the same line keeps them consistent.
    df[column] = df[column].replace(codes).astype(int)

### **6. Set X (Feature Matrix) and y (Target Vector)**

In [66]:
# Separate the predictors from the target column and convert both to NumPy arrays.
feature_frame = df.drop(columns=['exam_score'])
target_series = df['exam_score']

X, y = np.array(feature_frame), np.array(target_series)

### **7. Split Data**

In [67]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
split_options = dict(test_size=0.2, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# **Model Experimentation**

In [125]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Test Ridge, SGD, and Decision Tree Regressors

In [122]:
from sklearn.linear_model import Ridge, SGDRegressor
from sklearn.tree import DecisionTreeRegressor

### Ridge Regression

In [138]:
# Candidate regularisation strengths: seven powers of ten from 1e-3 to 1e3.
ridge_alphas = np.logspace(start=-3, stop=3, num=7, base=10.0)
param_grid = {"ridge__alpha": ridge_alphas}

# Scale the features first, then fit the ridge regressor.
ridge = make_pipeline(StandardScaler(), Ridge(random_state=7))

# Exhaustive 5-fold cross-validated search, scored by (negated) RMSE.
search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
search.fit(X_train, y_train)

score, params = search.best_score_, search.best_params_

In [139]:
# The scorer returns negated RMSE (higher is better), so flip the sign for display.
print("Best RMSE: ", -score)

print("Best Parameters: ")
for name in params:
    print("\t" + " - ".join((name, str(params[name]))))

Best RMSE:  5.46038519708242
Best Parameters: 
	ridge__alpha - 1.0


### SGD Regression

In [142]:
# Candidate regularisation strengths: seven powers of ten from 1e-3 to 1e3.
sgd_alphas = np.logspace(-3, 3, 7)
param_grid = {"sgdregressor__alpha": sgd_alphas}

# Scale the features first, then fit the SGD regressor.
sgd_estimator = SGDRegressor(max_iter=5_000, random_state=7)
SGD_reg = make_pipeline(StandardScaler(), sgd_estimator)

# Exhaustive 5-fold cross-validated search, scored by (negated) RMSE.
search = GridSearchCV(
    estimator=SGD_reg,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
search.fit(X_train, y_train)

score, params = search.best_score_, search.best_params_

In [143]:
# The scorer returns negated RMSE (higher is better), so flip the sign for display.
print("Best RMSE: ", -score)

print("Best Parameters: ")
for name in params:
    print("\t" + " - ".join((name, str(params[name]))))

Best RMSE:  5.462781269122381
Best Parameters: 
	sgdregressor__alpha - 0.001


### Decision Tree Regression

In [147]:
# Fix: `randint` was used below without ever being imported, so this cell
# raised NameError on a fresh kernel.
from scipy.stats import randint

# Search space for the tree. `randint(low, high)` draws integers from the
# half-open range [low, high), i.e. 2..29 and 1..14 here.
param_dist = {"max_depth": [None] + list(range(5, 31)),
              "min_samples_split": randint(2, 30),
              "min_samples_leaf": randint(1, 15),
              "max_features": [None, 1.0, 0.75, 0.5]}

dt = DecisionTreeRegressor(random_state=7)

# Randomized 5-fold cross-validated search over 30 sampled configurations,
# scored by (negated) RMSE.
# Fix: without `random_state` the sampled configurations changed on every
# run, so the reported best RMSE / parameters could not be reproduced.
search = RandomizedSearchCV(dt,
                            param_distributions=param_dist,
                            n_iter=30,
                            scoring="neg_root_mean_squared_error",
                            n_jobs=-1,
                            cv=5,
                            random_state=7)

search.fit(X_train, y_train)
score = search.best_score_
params = search.best_params_

In [148]:
# The scorer returns negated RMSE (higher is better), so flip the sign for display.
print("Best RMSE: ", -score)

print("Best Parameters: ")
for name in params:
    print("\t" + " - ".join((name, str(params[name]))))

Best RMSE:  8.723722793991879
Best Parameters: 
	max_depth - 11
	max_features - None
	min_samples_leaf - 11
	min_samples_split - 3


##### **Conclusion:** The regularized linear models (Ridge and SGD, cross-validated RMSE ≈ 5.46) predict exam scores more accurately than the decision tree (cross-validated RMSE ≈ 8.72)

In [None]:
## Test Voting

# **Training Model**

##### **Train using basic linear regression, ridge regression, lasso regression, and elastic net regression in a pipeline where data is scaled first**

##### **Test models**

##### **Attempt to improve regularized models by testing different alpha values**


##### **In the grid search above, the sweet spot is alpha set to 1.0 for ridge regression (0.001 was the best alpha for SGD regression)**

# **Decide On Final Model**

##### **The model using linear regression has the lowest RMSE value.**
##### **Therefore, the final model will use linear regression**

# **Save Model For Further Use**

In [None]:
from joblib import dump

# Serialise the chosen pipeline to disk so the predictor script can load it.
# NOTE(review): `final_pipe` is not assigned in any cell visible in this
# notebook, so this cell would raise NameError on a fresh kernel (Restart &
# Run All) — confirm where the final pipeline is built and fit it before saving.
dump(final_pipe, '../predictor_script/pipe.joblib')