In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# **Pre-Processing**

### **1. Load and Read Data**

In [None]:
# Read the raw student-habits dataset (path is relative to the notebook's directory).
df = pd.read_csv(
    filepath_or_buffer='../data/student_habits_performance.csv',
)

### **2. Inspect Data**

In [None]:
# Print each column's name, non-null count, and dtype to spot missing values
# and columns that are not yet numeric.
df.info()

##### **We can infer that all columns except *parental_education_level* have no null values.**
##### **The *student_id* column is unnecessary for our goal of predicting exam scores based on student habits.**
##### **The *gender*, *part_time_job*, *diet_quality*, *parental_education_level*, *internet_quality*, and *extracurricular_participation* columns are non-numeric and must be converted to numbers before scaling and training.**

### **3. Fill Null Values**

##### **Fill the null-values in the parental_education_level column.**

In [None]:
# Impute missing parental education with the most frequent category (the mode).
filler = df['parental_education_level'].mode()[0]
df = df.fillna({'parental_education_level': filler})

# Re-print the summary so the non-null count of the filled column can be checked.
df.info()

### **4. Drop Unnecessary Columns**

##### **Drop the *student_id* column because it is unnecessary.**

In [None]:
# Remove the identifier column; it carries no predictive information.
# Reassigning (instead of inplace=True) and using errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
df = df.drop(columns=['student_id'], errors='ignore')

# Re-print the summary to verify the column list.
df.info()

### **5. Convert String Values**

##### **Inspect the columns with string values**

In [None]:
# Show the distinct values (and their frequencies) of every string-valued
# column so the integer encodings in the next step can be chosen.
for column in (
    'gender',
    'part_time_job',
    'diet_quality',
    'parental_education_level',
    'internet_quality',
):
    print(df[column].value_counts(), '\n')

# The final column is printed without the trailing blank-line separator.
print(df['extracurricular_participation'].value_counts())

##### **Convert the string values into integers**

In [None]:
# Opt in to pandas' future behavior: replace() no longer silently downcasts
# the result, so an object column stays object after its strings are swapped
# for integers. Every column is therefore cast to int explicitly below.
pd.set_option('future.no_silent_downcasting', True)

# Binary / nominal columns.
df['gender'] = df['gender'].replace({'Male': 0, 'Female': 1, 'Other': 2}).astype(int)
df['part_time_job'] = df['part_time_job'].replace({'No': 0, 'Yes': 1}).astype(int)
df['extracurricular_participation'] = df['extracurricular_participation'].replace({'No': 0, 'Yes': 1}).astype(int)

# Ordinal columns: larger integer = higher level.
df['diet_quality'] = df['diet_quality'].replace({'Poor': 0, 'Fair': 1, 'Good': 2}).astype(int)
df['parental_education_level'] = df['parental_education_level'].replace({'High School': 0, 'Bachelor': 1, 'Master': 2}).astype(int)
# Fix: this column previously lacked the .astype(int) cast, leaving it with
# object dtype and turning the whole feature matrix into an object array.
df['internet_quality'] = df['internet_quality'].replace({'Poor': 0, 'Average': 1, 'Good': 2}).astype(int)

### **6. Set X (Feature Matrix) and y (Target Vector)**

In [None]:
# Feature matrix: every column except the target, converted to a NumPy array.
X = np.array(df.drop('exam_score', axis='columns'))

# Target vector: the exam scores, converted to a NumPy array.
y = np.array(df.loc[:, 'exam_score'])

### **7. Split Data**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

### **8. Scale Data**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then standardize the training features with those statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)