In [158]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [159]:
# Load the student exam-performance dataset from the Kaggle input directory.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)

#### Exploratory Data Analysis

In [160]:
# Preview the first five rows to see the raw column names and value formats.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [161]:
# (rows, columns) of the raw frame.
df.shape

(1000, 8)

In [162]:
# Column dtypes and non-null counts — used to spot missing values and
# to tell the categorical (object) columns from the numeric scores.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [163]:
# List the distinct education labels; these are the keys the ordinal
# mapping in the feature-engineering section has to cover.
df['parental level of education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

## Feature Engineering

In [164]:
from sklearn.preprocessing import OneHotEncoder

In [165]:
# Ordinal encoding of parental education: 1 = least schooling, 6 = most,
# so the numeric value preserves the ranking of the categories.
education_map = {
    "some high school": 1,
    "high school": 2,
    "some college": 3,
    "associate's degree": 4,
    "bachelor's degree": 5,
    "master's degree": 6,
}

df['parent_edu_level'] = df['parental level of education'].map(education_map)

# Series.map yields NaN for any label missing from the dict; fail loudly here
# instead of letting NaNs flow silently into model training.
unmapped_education = df.loc[
    df['parent_edu_level'].isna(), 'parental level of education'
].unique()
assert len(unmapped_education) == 0, (
    f"Unmapped education levels: {unmapped_education}"
)

In [166]:
# Binary-encode gender (female -> 0, male -> 1).
# Guarded on dtype so that re-running the cell is a no-op: mapping the
# already-encoded 0/1 values through the dict again would silently turn the
# whole column into NaN.
gender_codes = {'female': 0, 'male': 1}
if not pd.api.types.is_numeric_dtype(df['gender']):
    df['gender'] = df['gender'].map(gender_codes)

In [167]:
# Remove the raw education label (superseded by `parent_edu_level`) and the
# other two exam scores, leaving `math score` as the only score column.
# A single non-inplace drop replaces the mixed inplace/reassign pair, and
# errors='ignore' lets the cell be re-run after the columns are already gone.
df = df.drop(
    columns=['parental level of education', 'reading score', 'writing score'],
    errors='ignore',
)

##### Feature Engineering for LinearRegression

In [168]:
# Dense (non-sparse) output so the encoded array can be handed straight to
# pd.DataFrame.
# NOTE(review): no `drop=` is set, so every category gets its own column and
# each group of dummies sums to 1 — presumably collinear with the regression
# intercept; consider drop='first' if coefficients are to be interpreted.
one_hot_encoder = OneHotEncoder(sparse_output=False)

In [169]:
# Nominal (unordered) columns that get one-hot encoded.
cat_cols = [
    'race/ethnicity',
    'lunch',
    'test preparation course',
]

# Learn the categories and produce the indicator matrix in one step.
encoded_data = one_hot_encoder.fit_transform(df.loc[:, cat_cols])

In [170]:
# Everything except the nominal columns; their one-hot versions are joined on later.
df_without_cat = df.drop(cat_cols, axis=1)

df_without_cat.head()

Unnamed: 0,gender,math score,parent_edu_level
0,0,72,5
1,0,69,3
2,0,90,6
3,1,47,4
4,1,76,3


In [171]:
# Names of the generated one-hot columns, in the encoder's output order.
one_hot_encoder.get_feature_names_out()

array(['race/ethnicity_group A', 'race/ethnicity_group B',
       'race/ethnicity_group C', 'race/ethnicity_group D',
       'race/ethnicity_group E', 'lunch_free/reduced', 'lunch_standard',
       'test preparation course_completed',
       'test preparation course_none'], dtype=object)

In [172]:
# Wrap the indicator matrix in a DataFrame labelled with the encoder's column names.
ohe_column_names = one_hot_encoder.get_feature_names_out()
encoded_df = pd.DataFrame(data=encoded_data, columns=ohe_column_names)

In [173]:
# Join the numeric columns with the one-hot columns side by side.
# The left frame's index is reset so both parts align row-for-row on 0..n-1.
linear_parts = [df_without_cat.reset_index(drop=True), encoded_df]
linear_df = pd.concat(linear_parts, axis='columns')

linear_df.head()

Unnamed: 0,gender,math score,parent_edu_level,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,0,72,5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0,69,3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0,90,6,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1,47,4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1,76,3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


##### Feature Engineering for XGBoost & RandomForestRegressor

In [174]:
from sklearn.preprocessing import LabelEncoder

In [175]:
# One LabelEncoder instance, refit on each categorical column in turn.
encoder = LabelEncoder()

In [176]:
# Replace each remaining categorical column with integer codes, in place.
# The single encoder is refit per column, so afterwards it only remembers the
# classes of the last column processed.
# NOTE(review): LabelEncoder is documented for target labels; OrdinalEncoder is
# the feature-side equivalent and would keep every column's categories.
for column in ('race/ethnicity', 'lunch', 'test preparation course'):
    df[column] = encoder.fit_transform(df[column])

In [177]:
# Check the label-encoded frame: every remaining column should now be numeric.
df.head()

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,parent_edu_level
0,0,1,1,1,72,5
1,0,2,1,0,69,3
2,0,1,1,1,90,6
3,1,0,0,1,47,4
4,1,2,1,1,76,3


### Train/Test Split

In [178]:
from sklearn.model_selection import train_test_split

In [179]:
# One-hot feature set: every column except the target, and the target itself.
X_linear = linear_df.drop(columns='math score')
Y_linear = linear_df.loc[:, 'math score']

In [180]:
# 80/20 hold-out split of the one-hot feature set, seeded for repeatability.
linear_split = train_test_split(
    X_linear,
    Y_linear,
    test_size=0.2,
    random_state=42,
)
x_train_linear, x_test_linear, y_train_linear, y_test_linear = linear_split

In [181]:
# Label-encoded feature set: every column except the target, and the target itself.
X = df.drop(columns='math score')
Y = df.loc[:, 'math score']

In [182]:
# 80/20 hold-out split of the label-encoded feature set, seeded for repeatability.
label_split = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = label_split

#### Model Training & Predictions

In [183]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import xgboost as xgb

In [184]:
# Candidate regressors, keyed by the label used in the printed report.
# The stochastic models get a fixed seed so repeated runs give the same scores.
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'XGBoostRegressor': xgb.XGBRegressor(random_state=42),
}


def evaluate_models(x_train, x_test, y_train, y_test, estimators=None):
    """Fit every candidate regressor and rank them by held-out R^2.

    Parameters
    ----------
    x_train, x_test
        Feature matrices used for fitting and for scoring, respectively.
    y_train, y_test
        Targets matching ``x_train`` and ``x_test``.
    estimators : dict, optional
        Mapping of display name -> regressor exposing ``fit`` / ``predict``.
        Defaults to the module-level ``models`` registry, which keeps the
        original four-argument call working unchanged.

    Returns
    -------
    list of (str, float)
        ``(name, r2)`` pairs sorted from highest to lowest R^2.

    Notes
    -----
    Estimators are fitted in place, so after the call each one holds the fit
    from *this* call's training data (any earlier fit is overwritten).
    """
    if estimators is None:
        estimators = models

    results = []
    for name, estimator in estimators.items():
        estimator.fit(x_train, y_train)
        predictions = estimator.predict(x_test)
        results.append((name, r2_score(y_test, predictions)))

    # Best model first; list.sort is stable, so ties keep registry order.
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results

In [185]:
# Score all models on the label-encoded features; the printed value is R^2.
results = evaluate_models(x_train, x_test, y_train, y_test)

print("Model Performance (without One Hot Encoding):")
for model_name, r2 in results:
    print(f"{model_name}: {r2:.6f}")

Model Performance (without One Hot Encoding):
LinearRegression: 0.210425
RandomForestRegressor: -0.009825
XGBoostRegressor: -0.060474


In [186]:
# Score all models on the one-hot-encoded features; the printed value is R^2.
results = evaluate_models(
    x_train_linear,
    x_test_linear,
    y_train_linear,
    y_test_linear,
)

print("Model Performance (with One Hot Encoding):")
for model_name, r2 in results:
    print(f"{model_name}: {r2:.6f}")

Model Performance (with One Hot Encoding):
LinearRegression: 0.188486
RandomForestRegressor: -0.009507
XGBoostRegressor: -0.056487
