In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/masterycalc/masterycalc_dataset.csv
/kaggle/input/mastery/masterynew_questions (1).csv


## Data preprocessing

In [16]:
dataset = pd.read_csv('/kaggle/input/mastery/masterynew_questions (1).csv')
dataset

Unnamed: 0,Student ID,Age,Topic,Difficulty,Accuracy (%),Previous Mastery Score,New Mastery Score,Number of Questions
0,4,6,Letter Reading,1,100.00,0.40,0.53,5
1,89,7,Letter Reading,1,64.97,0.60,0.55,2
2,20,6,Letter Reading,3,36.58,0.46,0.46,4
3,53,6,Letter Reading,1,100.00,0.41,0.48,3
4,73,6,Letter Reading,1,53.16,0.57,0.42,5
...,...,...,...,...,...,...,...,...
490,91,10,Noun Classification,2,70.23,0.91,0.85,4
491,39,11,Noun Classification,3,46.19,0.85,0.85,1
492,62,11,Noun Classification,3,64.41,0.87,0.87,4
493,98,10,Noun Classification,3,36.66,0.88,0.88,5


In [17]:
dataset['Accuracy (%)'] /= 100

In [18]:
dataset['Topic'].unique()

array(['Letter Reading', 'Word Reading', 'Short Passage Reading',
       'Passage Comprehension', 'Noun Classification'], dtype=object)

In [20]:
read_type_map = {}

for index, topic in enumerate(dataset['Topic'].unique()):
    read_type_map[topic] = f'R{index + 1}'
    
read_type_map

{'Letter Reading': 'R1',
 'Word Reading': 'R2',
 'Short Passage Reading': 'R3',
 'Passage Comprehension': 'R4',
 'Noun Classification': 'R5'}

In [19]:
dataset = dataset.rename(columns={'Accuracy (%)': 'Accuracy'})
dataset

Unnamed: 0,Student ID,Age,Topic,Difficulty,Accuracy,Previous Mastery Score,New Mastery Score,Number of Questions
0,4,6,Letter Reading,1,1.0000,0.40,0.53,5
1,89,7,Letter Reading,1,0.6497,0.60,0.55,2
2,20,6,Letter Reading,3,0.3658,0.46,0.46,4
3,53,6,Letter Reading,1,1.0000,0.41,0.48,3
4,73,6,Letter Reading,1,0.5316,0.57,0.42,5
...,...,...,...,...,...,...,...,...
490,91,10,Noun Classification,2,0.7023,0.91,0.85,4
491,39,11,Noun Classification,3,0.4619,0.85,0.85,1
492,62,11,Noun Classification,3,0.6441,0.87,0.87,4
493,98,10,Noun Classification,3,0.3666,0.88,0.88,5


In [21]:
dataset['Topic'] = dataset['Topic'].map(read_type_map)

In [22]:
dataset

Unnamed: 0,Student ID,Age,Topic,Difficulty,Accuracy,Previous Mastery Score,New Mastery Score,Number of Questions
0,4,6,R1,1,1.0000,0.40,0.53,5
1,89,7,R1,1,0.6497,0.60,0.55,2
2,20,6,R1,3,0.3658,0.46,0.46,4
3,53,6,R1,1,1.0000,0.41,0.48,3
4,73,6,R1,1,0.5316,0.57,0.42,5
...,...,...,...,...,...,...,...,...
490,91,10,R5,2,0.7023,0.91,0.85,4
491,39,11,R5,3,0.4619,0.85,0.85,1
492,62,11,R5,3,0.6441,0.87,0.87,4
493,98,10,R5,3,0.3666,0.88,0.88,5


In [24]:
dataset.columns

Index(['Student ID', 'Age', 'Topic', 'Difficulty', 'Accuracy',
       'Previous Mastery Score', 'New Mastery Score', 'Number of Questions'],
      dtype='object')

In [50]:
X = dataset[['Age', 'Topic', 'Difficulty', 'Accuracy', 'Number of Questions',
       'Previous Mastery Score']]

y = dataset['New Mastery Score']

In [48]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [51]:
ct = ColumnTransformer(
    transformers=[
        ('topic', OneHotEncoder(sparse_output=False), ['Topic'])
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Transform the dataset
X_transformed = pd.DataFrame(ct.fit_transform(X), columns=ct.get_feature_names_out(), index=X.index)

## Model selection

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [58]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.linear_model import TweedieRegressor

In [59]:
def evaluate_model(y_test, y_pred):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2

In [71]:
def summarize_metrics(metrics_dict):
    metrics_df = pd.DataFrame(metrics_dict)
    return metrics_df

In [60]:
def xgboost_regressor(X_train, X_test, y_train, y_test, metrics_dict):
    model = XGBRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, r2 = evaluate_model(y_test, y_pred)
    metrics_dict['XGBoost'] = [rmse, r2]

In [61]:
def lightgbm_regressor(X_train, X_test, y_train, y_test, metrics_dict):
    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, r2 = evaluate_model(y_test, y_pred)
    metrics_dict['LightGBM'] = [rmse, r2]

In [62]:
def catboost_regressor(X_train, X_test, y_train, y_test, metrics_dict):
    model = CatBoostRegressor(verbose=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, r2 = evaluate_model(y_test, y_pred)
    metrics_dict['CatBoost'] = [rmse, r2]

In [63]:
def neural_network_regressor(X_train, X_test, y_train, y_test, metrics_dict):
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')

    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    y_pred = model.predict(X_test).flatten()
    rmse, r2 = evaluate_model(y_test, y_pred)
    metrics_dict['Neural Network'] = [rmse, r2]

In [64]:
def stacking_regressor(X_train, X_test, y_train, y_test, metrics_dict):
    base_learners = [
        ('xgb', XGBRegressor()),
        ('lgb', lgb.LGBMRegressor()),
        ('catboost', CatBoostRegressor(verbose=0))
    ]
    model = StackingRegressor(estimators=base_learners, final_estimator=ElasticNet())
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, r2 = evaluate_model(y_test, y_pred)
    metrics_dict['Stacking'] = [rmse, r2]

In [65]:
def tweedie_regressor(X_train, X_test, y_train, y_test, metrics_dict):
    model = TweedieRegressor(power=1.5, alpha=0.5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, r2 = evaluate_model(y_test, y_pred)
    metrics_dict['Tweedie'] = [rmse, r2]

In [66]:
def elasticnet_regressor(X_train, X_test, y_train, y_test, metrics_dict):
    model = ElasticNet(alpha=1.0, l1_ratio=0.5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse, r2 = evaluate_model(y_test, y_pred)
    metrics_dict['ElasticNet'] = [rmse, r2]

In [69]:
def run_models():
    metrics_dict = {
        'Model': ['RMSE', 'R^2 Score'],
    }

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    xgboost_regressor(X_train, X_test, y_train, y_test, metrics_dict)
    lightgbm_regressor(X_train, X_test, y_train, y_test, metrics_dict)
    catboost_regressor(X_train, X_test, y_train, y_test, metrics_dict)
    neural_network_regressor(X_train, X_test, y_train, y_test, metrics_dict)
    stacking_regressor(X_train, X_test, y_train, y_test, metrics_dict)
    tweedie_regressor(X_train, X_test, y_train, y_test, metrics_dict)
    elasticnet_regressor(X_train, X_test, y_train, y_test, metrics_dict)

    # Convert metrics_dict to DataFrame
    metrics_df = summarize_metrics(metrics_dict)
    metrics_df.set_index('Model', inplace=True)
    return metrics_df

In [72]:
metrics_df = run_models()
metrics_df

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 217
[LightGBM] [Info] Number of data points in the train set: 396, number of used features: 10
[LightGBM] [Info] Start training from score 0.696389


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 217
[LightGBM] [Info] Number of data points in the train set: 396, number of used features: 10
[LightGBM] [Info] Start training from score 0.696389
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 316, number of used features: 10
[LightGBM] [Info] Start training from score 0.695032
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train se

Unnamed: 0_level_0,XGBoost,LightGBM,CatBoost,Neural Network,Stacking,Tweedie,ElasticNet
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RMSE,0.047311,0.038765,0.029656,0.092812,0.193267,0.129597,0.193267
R^2 Score,0.939932,0.959672,0.976399,0.768834,-0.002381,0.549284,-0.002381
