In [162]:
# Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Simplify working with large datasets in Altair
alt.data_transformers.disable_max_rows()

# Output dataframes instead of arrays
set_config(transform_output="pandas")
np.random.seed(1)

In [163]:
url = "https://drive.usercontent.google.com/download?id=1Px4pE2Xf1TEGfYV3ChaoRRRS0YZbKbX_&export=download&authuser=0&confirm=t&uuid=7c9d6e2b-f34f-423d-ad4f-f386faaa47d4&at=APZUnTUUdkEofob3B1bEEFJ0HcHq:1698615817259"

# import from two sheets and combine into one dataframe
data_training_sheet = pd.read_excel(url, sheet_name="Training_Data")
data_testing_sheet = pd.read_excel(url, sheet_name="Test_Data")
data = pd.concat([data_training_sheet, data_testing_sheet])

In [164]:
data = data.drop(
    columns=["Attribute Information:", "Unnamed: 6", "Unnamed: 7"]
)
data


Unnamed: 0,STG,SCG,STR,LPR,PEG,UNS
0,0.00,0.00,0.00,0.00,0.00,very_low
1,0.08,0.08,0.10,0.24,0.90,High
2,0.06,0.06,0.05,0.25,0.33,Low
3,0.10,0.10,0.15,0.65,0.30,Middle
4,0.08,0.08,0.08,0.98,0.24,Low
...,...,...,...,...,...,...
140,0.90,0.78,0.62,0.32,0.89,High
141,0.85,0.82,0.66,0.83,0.83,High
142,0.56,0.60,0.77,0.13,0.32,Low
143,0.66,0.68,0.81,0.57,0.57,Middle


In [165]:
# rename columns to make them more readable
data = data.rename(
    columns={
        "STG": "Study Time",
        "SCG": "Repetition Time",
        "STR": "Study Time for Related Objects",
        "LPR": "Exam Performance for Related Objects",
        "PEG": "Exam Performance",
        " UNS": "Knowledge Level"
    }
)

In [166]:
# split data into training and testing sets
data_training, data_testing = train_test_split(
    data,
    test_size=0.25,
    random_state=111
)

In [167]:
data_training

Unnamed: 0,Study Time,Repetition Time,Study Time for Related Objects,Exam Performance for Related Objects,Exam Performance,Knowledge Level
130,0.390,0.05,0.02,0.06,0.34,Low
149,0.480,0.30,0.15,0.65,0.77,High
180,0.365,0.68,0.10,0.63,0.18,Low
46,0.100,0.39,0.75,0.31,0.62,Middle
237,0.660,0.36,0.56,0.40,0.83,High
...,...,...,...,...,...,...
17,0.000,0.25,0.50,0.09,0.07,Very Low
86,0.325,0.25,0.38,0.31,0.79,High
212,0.900,0.26,0.19,0.58,0.79,High
106,0.420,0.29,0.14,0.03,0.68,Middle


In [168]:
data_testing

Unnamed: 0,Study Time,Repetition Time,Study Time for Related Objects,Exam Performance for Related Objects,Exam Performance,Knowledge Level
137,0.640,0.580,0.14,0.32,0.21,Low
253,0.610,0.780,0.69,0.92,0.58,High
105,0.380,0.090,0.37,0.28,0.32,Low
154,0.430,0.305,0.51,0.09,0.64,Middle
176,0.390,0.420,0.83,0.65,0.19,Low
...,...,...,...,...,...,...
10,0.180,0.180,0.55,0.30,0.81,High
115,0.285,0.640,0.18,0.61,0.45,Middle
183,0.370,0.550,0.41,0.29,0.30,Low
189,0.490,0.900,0.52,0.90,0.47,High


In [169]:
# create a scatterplot of the data to visualize the relationship 
# between study time and exam performance and the knowledge level of the student
alt.Chart(data_training).mark_point().encode(
    x="Study Time",
    y="Exam Performance",
    color="Knowledge Level"
)


In [170]:
# create a table to show the mean and standard deviation of each level of knowledge
data_training.groupby("Knowledge Level").agg(["mean", "std"])

Unnamed: 0_level_0,Study Time,Study Time,Repetition Time,Repetition Time,Study Time for Related Objects,Study Time for Related Objects,Exam Performance for Related Objects,Exam Performance for Related Objects,Exam Performance,Exam Performance
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
Knowledge Level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
High,0.400293,0.237109,0.428413,0.241783,0.534333,0.256321,0.531467,0.273727,0.804667,0.108271
Low,0.325896,0.18385,0.324354,0.182445,0.409375,0.252006,0.464375,0.230555,0.24875,0.073588
Middle,0.374656,0.208222,0.373215,0.211954,0.492581,0.234613,0.384516,0.248861,0.529785,0.133904
Very Low,0.224091,0.165462,0.321364,0.181194,0.293182,0.199365,0.185045,0.137006,0.100909,0.059674
very_low,0.321813,0.195123,0.204063,0.144167,0.395625,0.194695,0.37125,0.198926,0.085625,0.062072


In [171]:
# show the number of students in each level of knowledge
data_training["Knowledge Level"].value_counts()

Low         96
Middle      93
High        75
Very Low    22
very_low    16
Name: Knowledge Level, dtype: int64