# Linking Writing processes to writing quality

# Import the necessary packages needed for this analysis

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from catboost import Pool, cv

In [11]:
train_log=pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
train_log.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [12]:
train_scores=pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
train_scores.head()

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [13]:
# lets read in the test data
test=pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
test.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0


In [15]:
train_log.columns==test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [16]:
train_log.shape, test.shape

((8405898, 11), (6, 11))

In the train dataset, we observed that there were columns  that had a less frequency of occuring. here we remove them. 

In [20]:

def remove_less_frequent(column='down_event'):
    value_counts = train_log[column].value_counts()
    values_to_keep = value_counts[value_counts > 1].index
    filtered_df = train_log[train_log[column].isin(values_to_keep)]
    return filtered_df


In [17]:
train_log=remove_less_frequent(column='down_event')

In [18]:
train_log=remove_less_frequent(column='up_event')

In [19]:
train_log=remove_less_frequent(column='activity')

 What I note is that the score is the same. That means that the score is for the final paper. The other variables keep changing.  Lets confirm this by looking at the second student 
 
 In light of this, we have to aggregate the data in terms of  each student. This will make it easy in terms  of the  final predictions. 

# Aggregating the data

## lets start with aggregating the numerical data

In [21]:
# create a function that can be applied also to the test dataset

def aggregation(data):
    numeric = data.select_dtypes('int64')
    numeric['id'] = data['id']
    means = numeric.groupby('id').mean()
    stds = numeric.groupby('id').std()
    medians = numeric.groupby('id').median()
    mins = numeric.groupby('id').min()
    maxs = numeric.groupby('id').max()
    means.columns = [col + '_means' for col in means.columns]
    stds.columns = [col + '_stds' for col in stds.columns]
    medians.columns = [col + '_medians' for col in medians.columns]
    mins.columns = [col + '_mins' for col in mins.columns]
    maxs.columns = [col + '_maxs' for col in maxs.columns]
    result = pd.concat([means, stds, medians, mins, maxs], axis=1)
    return result

    

In [22]:




def character_aggregation(data):
    character_df = data.select_dtypes('object')
    character_df['id'] = data['id']
    result=character_df.groupby('id').nunique()
    return result

In [23]:
# now we creatfe a final function
def feature_extraction(data):
    numeric_df_clean= aggregation(data)
    character_df_clean=character_aggregation(data)
    result = numeric_df_clean.merge(character_df_clean, on='id', how='inner')
    return  result
train=feature_extraction(train_log)
    

In [24]:
train.head()

Unnamed: 0_level_0,event_id_means,down_time_means,up_time_means,action_time_means,cursor_position_means,word_count_means,event_id_stds,down_time_stds,up_time_stds,action_time_stds,...,event_id_maxs,down_time_maxs,up_time_maxs,action_time_maxs,cursor_position_maxs,word_count_maxs,activity,down_event,up_event,text_change
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001519c8,1277.667972,847415.811668,847532.194988,116.38332,711.590446,127.975333,737.691363,394671.889949,394665.119284,91.764711,...,2557,1801877,1801969,2259,1539,256,4,12,12,15
0022f953,1227.5,518855.347596,518967.568867,112.221271,776.205786,182.714751,708.553103,384959.404177,384952.728796,55.431189,...,2454,1788842,1788969,1758,1676,323,5,17,17,12
0042269b,2068.5,828491.775145,828593.612911,101.837766,731.611702,194.772727,1194.104686,489500.796565,489500.438784,82.383766,...,4136,1771219,1771669,3005,2291,404,4,13,18,19
0059420b,778.5,785483.02635,785604.874679,121.848329,542.537275,103.618895,449.322824,385205.014399,385206.08125,113.768226,...,1556,1404394,1404469,806,1047,206,5,15,15,10
0075873a,1266.0,713354.19755,713478.141446,123.943896,600.050968,125.082971,730.781089,405576.409034,405575.631746,62.082013,...,2531,1662390,1662472,701,1402,252,3,11,11,9


In [25]:
y=train_scores['score']
X=train

In [26]:
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2)

# Fitting a Stacking Regressor with catboost

In [27]:


# Define the base models
base_models = [
    ('catboost', CatBoostRegressor(depth=6, iterations=500, learning_rate=0.01)),
    ('cat',CatBoostRegressor(depth=6, iterations=500, learning_rate=0.01)),
    ('RF',RandomForestRegressor(n_estimators=500)),
    ('xgb',XGBRegressor(n_estimators=200)),
    ('xgb2',XGBRegressor()),
    ('cat2',CatBoostRegressor())
]
# Define the final estimator
final_estimator = SVR()  
# Create the stacking regressor
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=final_estimator
)
# Fit the stacking regressor with your training data
stacking_regressor.fit(X_train, y_train)
# Make predictions and evaluate the model
y_pred = stacking_regressor.predict(X_test )

0:	learn: 1.0128939	total: 56.5ms	remaining: 28.2s
1:	learn: 1.0083185	total: 60.2ms	remaining: 15s
2:	learn: 1.0037322	total: 64.2ms	remaining: 10.6s
3:	learn: 0.9992673	total: 67.6ms	remaining: 8.38s
4:	learn: 0.9950444	total: 70.9ms	remaining: 7.02s
5:	learn: 0.9907208	total: 74.3ms	remaining: 6.12s
6:	learn: 0.9865384	total: 77.7ms	remaining: 5.47s
7:	learn: 0.9820930	total: 81.1ms	remaining: 4.99s
8:	learn: 0.9776470	total: 85.1ms	remaining: 4.64s
9:	learn: 0.9734448	total: 88.3ms	remaining: 4.33s
10:	learn: 0.9692400	total: 91.5ms	remaining: 4.07s
11:	learn: 0.9650223	total: 95ms	remaining: 3.86s
12:	learn: 0.9610556	total: 98.9ms	remaining: 3.7s
13:	learn: 0.9570380	total: 102ms	remaining: 3.56s
14:	learn: 0.9530689	total: 106ms	remaining: 3.42s
15:	learn: 0.9490119	total: 109ms	remaining: 3.3s
16:	learn: 0.9450437	total: 112ms	remaining: 3.19s
17:	learn: 0.9413543	total: 116ms	remaining: 3.1s
18:	learn: 0.9375239	total: 119ms	remaining: 3.01s
19:	learn: 0.9339516	total: 122ms	r

In [None]:
from sklearn.ensemble import BaggingRegressor 
# Create a CatBoostRegressor as the base model
base_regressor = CatBoostRegressor(depth=6, iterations=500, learning_rate=0.01)
# Create a Bagging Regressor that uses the CatBoostRegressor as the base model
bagging_regressor = BaggingRegressor(
    base_regressor,
    n_estimators=100,  # Number of base estimators
    random_state=42  # Random seed for reproducibility
)
# Fit the Bagging Regressor to the training data
bagging_regressor.fit(X_train, y_train)
# Make predictions on the test data
y_pred = bagging_regressor.predict(X_test)


0:	learn: 1.0389628	total: 59.8ms	remaining: 29.8s
1:	learn: 1.0341357	total: 66.7ms	remaining: 16.6s
2:	learn: 1.0293817	total: 73.2ms	remaining: 12.1s
3:	learn: 1.0249355	total: 79.8ms	remaining: 9.9s
4:	learn: 1.0205739	total: 86.5ms	remaining: 8.56s
5:	learn: 1.0161594	total: 93ms	remaining: 7.66s
6:	learn: 1.0113479	total: 99.3ms	remaining: 7s
7:	learn: 1.0067061	total: 106ms	remaining: 6.52s
8:	learn: 1.0022698	total: 112ms	remaining: 6.12s
9:	learn: 0.9976701	total: 119ms	remaining: 5.81s
10:	learn: 0.9935661	total: 125ms	remaining: 5.57s
11:	learn: 0.9891034	total: 132ms	remaining: 5.36s
12:	learn: 0.9850375	total: 138ms	remaining: 5.17s
13:	learn: 0.9807460	total: 145ms	remaining: 5.03s
14:	learn: 0.9766282	total: 151ms	remaining: 4.89s
15:	learn: 0.9726270	total: 159ms	remaining: 4.8s
16:	learn: 0.9687727	total: 165ms	remaining: 4.69s
17:	learn: 0.9650116	total: 171ms	remaining: 4.59s
18:	learn: 0.9610813	total: 178ms	remaining: 4.51s
19:	learn: 0.9571130	total: 185ms	remaini

In [18]:
from sklearn.ensemble import StackingRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Define the base models with tuned hyperparameters
base_models = [
    ('catboost', CatBoostRegressor(depth=6, iterations=1000, learning_rate=0.01)),
    ('RF', RandomForestRegressor(n_estimators=1000, max_depth=10)),
    ('xgb', XGBRegressor(n_estimators=1000, max_depth=5)),
]

# Define the final estimator
final_estimator = SVR()

# Create the stacking regressor
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=final_estimator
)

# Fit the stacking regressor with your training data
stacking_regressor.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = stacking_regressor.predict(X_test)


0:	learn: 1.0324253	total: 4.59ms	remaining: 4.59s
1:	learn: 1.0277069	total: 9.01ms	remaining: 4.5s
2:	learn: 1.0229261	total: 12.9ms	remaining: 4.29s
3:	learn: 1.0182502	total: 16.9ms	remaining: 4.2s
4:	learn: 1.0139121	total: 21ms	remaining: 4.17s
5:	learn: 1.0095738	total: 25ms	remaining: 4.14s
6:	learn: 1.0053610	total: 29.1ms	remaining: 4.12s
7:	learn: 1.0006174	total: 33ms	remaining: 4.09s
8:	learn: 0.9961734	total: 36.9ms	remaining: 4.06s
9:	learn: 0.9917886	total: 40.9ms	remaining: 4.05s
10:	learn: 0.9874793	total: 44.7ms	remaining: 4.02s
11:	learn: 0.9833023	total: 48.5ms	remaining: 3.99s
12:	learn: 0.9793062	total: 52.4ms	remaining: 3.98s
13:	learn: 0.9752791	total: 56.2ms	remaining: 3.96s
14:	learn: 0.9712118	total: 60.2ms	remaining: 3.95s
15:	learn: 0.9671665	total: 64.3ms	remaining: 3.96s
16:	learn: 0.9632169	total: 68.3ms	remaining: 3.95s
17:	learn: 0.9594591	total: 72.2ms	remaining: 3.94s
18:	learn: 0.9554842	total: 76.2ms	remaining: 3.94s
19:	learn: 0.9518307	total: 80

In [28]:
mean_absolute_error(y_test, y_pred)

0.5454605271735853

In [None]:
test_data=feature_extraction(test)
preds=stacking_regressor.predict(test_data)
preds

In [None]:
submission=pd.DataFrame({'id':test_data.index, 'score':preds})
submission.to_csv('submission.csv', index=False)