In [122]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Turn off Altair's row-count limit so charts can be built directly from
# larger DataFrames without raising a max-rows error.
alt.data_transformers.disable_max_rows()

# Ask scikit-learn transformers to return pandas DataFrames instead of
# NumPy arrays, so column names are kept after transformation.
set_config(transform_output="pandas")

In [123]:
# Location of the Excel workbook holding the "Training_Data" and "Test_Data" sheets.
# NOTE(review): the query string carries `uuid`/`at` parameters that look
# session-specific -- confirm the link stays valid over time.
url = "https://drive.usercontent.google.com/download?id=1Px4pE2Xf1TEGfYV3ChaoRRRS0YZbKbX_&export=download&authuser=0&confirm=t&uuid=7c9d6e2b-f34f-423d-ad4f-f386faaa47d4&at=APZUnTUUdkEofob3B1bEEFJ0HcHq:1698615817259"

# Download and parse the workbook once: passing a list to `sheet_name` makes
# `read_excel` return a dict of DataFrames keyed by sheet name, instead of
# fetching the same remote file a second time for the second sheet.
sheets = pd.read_excel(url, sheet_name=["Training_Data", "Test_Data"])
data_training = sheets["Training_Data"]
data_testing = sheets["Test_Data"]

# Stack both sheets row-wise into a single frame.
data = pd.concat([data_training, data_testing])

In [124]:
# Columns in the loaded sheet that are not used in the analysis.
unused_columns = ["Attribute Information:", "Unnamed: 6", "Unnamed: 7"]

# NOTE(review): this starts from `data_training`, not from the concatenated
# `data` built in the loading cell, so that concatenated frame is replaced
# here and the "Test_Data" rows are not carried forward -- confirm intended.
data = data_training.drop(columns=unused_columns)

In [125]:
# Map the dataset's abbreviated column codes to human-readable labels.
# The key " UNS" intentionally keeps its leading space so that it matches
# the column name exactly as written here.
readable_column_names = {
    "STG": "Study Time",
    "SCG": "Repetition Time",
    "STR": "Study Time for Related Objects",
    "LPR": "Exam Performance for Related Objects",
    "PEG": "Exam Performance",
    " UNS": "Knowledge Level",
}

data = data.rename(columns=readable_column_names)

In [126]:
# Share of rows held out for testing, and the seed that makes the shuffle
# reproducible across runs.
HOLDOUT_FRACTION = 0.25
SPLIT_SEED = 111

# NOTE(review): this rebinds `data_training` / `data_testing`, which were
# first assigned when the sheets were loaded; the column-dropping cell reads
# `data_training`, so re-running that cell after this one would operate on
# the split subset rather than the loaded sheet.
data_training, data_testing = train_test_split(
    data, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)

In [127]:
# Display the training partition returned by train_test_split.
data_training

Unnamed: 0,Study Time,Repetition Time,Study Time for Related Objects,Exam Performance for Related Objects,Exam Performance,Knowledge Level
155,0.440,0.320,0.55,0.33,0.52,Middle
240,0.500,0.400,0.73,0.62,0.20,Low
209,0.850,0.050,0.91,0.80,0.68,High
156,0.450,0.299,0.63,0.36,0.51,Middle
111,0.258,0.310,0.88,0.40,0.30,Low
...,...,...,...,...,...,...
66,0.320,0.200,0.06,0.26,0.24,very_low
169,0.410,0.490,0.34,0.21,0.92,High
19,0.200,0.290,0.25,0.49,0.56,Middle
86,0.325,0.250,0.38,0.31,0.79,High


In [128]:
# Display the held-out testing partition returned by train_test_split.
data_testing

Unnamed: 0,Study Time,Repetition Time,Study Time for Related Objects,Exam Performance for Related Objects,Exam Performance,Knowledge Level
39,0.190,0.380,0.38,0.49,0.45,Middle
108,0.320,0.270,0.52,0.81,0.30,Middle
55,0.100,0.600,0.33,0.42,0.26,Low
2,0.060,0.060,0.05,0.25,0.33,Low
63,0.200,0.680,0.73,0.48,0.28,Low
...,...,...,...,...,...,...
92,0.251,0.265,0.57,0.60,0.09,very_low
6,0.100,0.100,0.43,0.29,0.56,Middle
56,0.200,0.520,0.36,0.84,0.25,Middle
44,0.115,0.350,0.65,0.27,0.04,very_low


In [129]:
# Scatter plot of the training rows: study time on the x-axis, exam
# performance on the y-axis, one colour per knowledge level.
alt.Chart(data_training).mark_point().encode(
    alt.X("Study Time"),
    alt.Y("Exam Performance"),
    alt.Color("Knowledge Level"),
)

In [130]:
# Mean and standard deviation of every remaining column, computed
# separately for each knowledge level in the training partition.
rows_by_level = data_training.groupby("Knowledge Level")
rows_by_level.agg(["mean", "std"])

Unnamed: 0_level_0,Study Time,Study Time,Repetition Time,Repetition Time,Study Time for Related Objects,Study Time for Related Objects,Exam Performance for Related Objects,Exam Performance for Related Objects,Exam Performance,Exam Performance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Knowledge Level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
High,0.43387,0.24609,0.398283,0.208605,0.499239,0.270567,0.516957,0.276935,0.764348,0.105549
Low,0.328129,0.169657,0.314548,0.177808,0.439032,0.245081,0.512419,0.238538,0.236016,0.075103
Middle,0.404536,0.212384,0.377957,0.212243,0.515942,0.218849,0.334058,0.221295,0.551304,0.12324
very_low,0.343313,0.200094,0.184375,0.147737,0.3625,0.229449,0.340625,0.197601,0.095,0.06121


In [131]:
# Number of training rows per knowledge level, most frequent first.
knowledge_levels = data_training["Knowledge Level"]
knowledge_levels.value_counts()

Middle      69
Low         62
High        46
very_low    16
Name: Knowledge Level, dtype: int64