In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the study-performance data set into the 'df' DataFrame
# and preview its first ten rows.
df = pd.read_csv('study_performance.csv')
df.head(n=10)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [3]:
# Discard the columns that are not relevant to our goal
df = df.drop(columns=['gender', 'race_ethnicity', 'lunch'])

In [4]:
# Print a summary of the remaining columns: their dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   parental_level_of_education  1000 non-null   object
 1   test_preparation_course      1000 non-null   object
 2   math_score                   1000 non-null   int64 
 3   reading_score                1000 non-null   int64 
 4   writing_score                1000 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


In [5]:
# Preview the first ten rows of the reduced table
df.head(n=10)

Unnamed: 0,parental_level_of_education,test_preparation_course,math_score,reading_score,writing_score
0,bachelor's degree,none,72,72,74
1,some college,completed,69,90,88
2,master's degree,none,90,95,93
3,associate's degree,none,47,57,44
4,some college,none,76,78,75
5,associate's degree,none,71,83,78
6,some college,completed,88,95,92
7,some college,none,40,43,39
8,high school,completed,64,64,67
9,high school,none,38,60,50


In [6]:
# The feature "test_preparation_course" contains string values ('completed' or 'none'),
# which the Random Forest model can't handle. The model expects only numeric inputs, so
# instead of creating dummies we assign '1' to 'completed' and '0' to 'none'.
prep_course_codes = {'completed': 1, 'none': 0}

# Look each label up in the dictionary, but pass any other value through unchanged.
# A plain .map(dict) would turn the already-encoded 0/1 values into NaN if this cell
# were run a second time; this form keeps the cell safe to re-run.
df['test_preparation_course'] = df['test_preparation_course'].map(
    lambda label: prep_course_codes.get(label, label)
)

# Fail loudly on an unexpected or missing label instead of letting it reach the model.
assert df['test_preparation_course'].isin([0, 1]).all(), \
    "test_preparation_course contains values other than 'completed'/'none'"

In [7]:
# Show the revisited table
df.head(10)

Unnamed: 0,parental_level_of_education,test_preparation_course,math_score,reading_score,writing_score
0,bachelor's degree,0,72,72,74
1,some college,1,69,90,88
2,master's degree,0,90,95,93
3,associate's degree,0,47,57,44
4,some college,0,76,78,75
5,associate's degree,0,71,83,78
6,some college,1,88,95,92
7,some college,0,40,43,39
8,high school,1,64,64,67
9,high school,0,38,60,50


In this case, `df.info()` shows that none of the columns has missing values, so we know that there are no major data quality issues and we'll go ahead and build the model.

### Data Preparation and Model Building

In [8]:
# Importing train_test_split from the sklearn library
from sklearn.model_selection import train_test_split

In [9]:
# Response variable (y): the parents' level of education
y = df['parental_level_of_education']

# Feature variables (X): every remaining column
X = df.drop(columns='parental_level_of_education')

# Hold out 30% of the rows as the test set, with a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=16,
)

### Default Hyperparameters
Let's first fit a random forest model with default hyperparameters.

In [10]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Running the random forest with default hyperparameters.
# Only random_state is set: a random forest bootstraps rows and samples features at
# random, so without a seed the scores in the report change on every run.
# 16 is the same seed value already used for the train/test split.
rfc = RandomForestClassifier(random_state=16)

In [11]:
# Train the forest on the training split
rfc.fit(X=X_train, y=y_train)

In [12]:
# Predict the class of every row in the test split
predictions = rfc.predict(X=X_test)

In [13]:
# Importing classification report, confusion matrix and accuracy score from sklearn metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [14]:
# Per-class precision, recall and F1 of the default model on the test split
print(classification_report(y_true=y_test, y_pred=predictions))

                    precision    recall  f1-score   support

associate's degree       0.21      0.25      0.23        67
 bachelor's degree       0.12      0.14      0.13        29
       high school       0.28      0.30      0.29        54
   master's degree       0.00      0.00      0.00        19
      some college       0.27      0.32      0.29        71
  some high school       0.28      0.17      0.21        60

          accuracy                           0.23       300
         macro avg       0.19      0.20      0.19       300
      weighted avg       0.23      0.23      0.23       300



Since these values are poor (the accuracy is only 0.23 and "master's degree" is never predicted correctly), I will try to improve them by reducing the 6 classes to only 2:
- Group 1: degrees
- Group 2: school and college

In [15]:
# Collapse the six education levels into two groups:
#   1 -> Group 1: degrees (associate's, bachelor's, master's)
#   0 -> Group 2: school and college (high school, some college, some high school)
mapping = {
    **dict.fromkeys(["associate's degree", "bachelor's degree", "master's degree"], 1),
    **dict.fromkeys(["high school", "some college", "some high school"], 0),
}

# Add the binary target as 'level_of_education' and drop the original column in one chain
df = (
    df.assign(level_of_education=df['parental_level_of_education'].map(mapping))
      .drop(columns='parental_level_of_education')
)

In [16]:
# Preview the first ten rows with the new binary target column
df.head(n=10)

Unnamed: 0,test_preparation_course,math_score,reading_score,writing_score,level_of_education
0,0,72,72,74,1
1,1,69,90,88,0
2,0,90,95,93,1
3,0,47,57,44,1
4,0,76,78,75,0
5,0,71,83,78,1
6,1,88,95,92,0
7,0,40,43,39,0
8,1,64,64,67,0
9,0,38,60,50,0


In [17]:
# Response variable (y): the binary education group
y = df['level_of_education']

# Feature variables (X): every remaining column
X = df.drop(columns='level_of_education')

# Hold out 30% of the rows as the test set, with a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=16,
)

In [18]:
# Running the random forest with default hyperparameters.
# Only random_state is set: a random forest bootstraps rows and samples features at
# random, so without a seed the scores in the report change on every run.
# 16 is the same seed value already used for the train/test split.
rfc = RandomForestClassifier(random_state=16)

In [19]:
# Train the forest on the training split of the binary problem
rfc.fit(X=X_train, y=y_train)

In [20]:
# Predict the education group of every row in the test split
predictions = rfc.predict(X=X_test)

In [21]:
# Per-class precision, recall and F1 of the default model on the binary target
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.64      0.63      0.64       185
           1       0.42      0.43      0.42       115

    accuracy                           0.55       300
   macro avg       0.53      0.53      0.53       300
weighted avg       0.55      0.55      0.55       300



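Apparently, the idea of grouping was useful: the accuracy rose from 0.23 to 0.55, so the values now make more sense. However, the two reports are not directly comparable. With only 2 classes, always predicting the majority class (0) would already be right for 185 of the 300 test rows (≈ 0.62), so the model is still not beating a trivial baseline.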