# Linking Writing processes to writing quality

# Import the necessary packages needed for this analysis

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from catboost import Pool, cv
import seaborn as sns

In [26]:
train_log=pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
train_log.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [27]:
train_scores=pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
train_scores.head()

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [28]:
# lets read in the test data
test=pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')
test.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0


In [29]:
train_log.columns==test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [30]:
train_log.shape, test.shape

((8405898, 11), (6, 11))

 What I note is that the score is the same. That means that the score is for the final paper. The other variables keep changing.  Lets confirm this by looking at the second student 
 
 In light of this, we have to aggregate the data in terms of  each student. This will make it easy in terms  of the  final predictions. 

# Aggregating the data

## lets start with aggregating the numerical data

In [31]:
# create a function that can be applied also to the test dataset

def aggregation(data):
    numeric = data.select_dtypes('int64')
    numeric['id'] = data['id']
    
    means = numeric.groupby('id').mean()
    stds = numeric.groupby('id').std()
    medians = numeric.groupby('id').median()
    mins = numeric.groupby('id').min()
    maxs = numeric.groupby('id').max()
    
    means.columns = [col + '_means' for col in means.columns]
    stds.columns = [col + '_stds' for col in stds.columns]
    medians.columns = [col + '_medians' for col in medians.columns]
    mins.columns = [col + '_mins' for col in mins.columns]
    maxs.columns = [col + '_maxs' for col in maxs.columns]
    result = pd.concat([means, stds, medians, mins, maxs], axis=1)
    
    return result

    

In [32]:




def character_aggregation(data):
    character_df = data.select_dtypes('object')
    character_df['id'] = data['id']
    result=character_df.groupby('id').nunique()
    
    
    return result

In [35]:
# now we creatfe a final function
def feature_extraction(data):
    numeric_df_clean= aggregation(data)
    character_df_clean=character_aggregation(data)
    result = numeric_df_clean.merge(character_df_clean, on='id', how='inner')
    return  result
train=feature_extraction(train_log)
train=pd.merge(train, train_scores,on='id')
train.drop('id', axis=1, inplace=True)

In [38]:
train=train.dropna()

In [43]:
# Step 1: Calculate the correlation matrix
correlation_matrix = train.corr()

# Step 2: Set a threshold for a "high" correlation
threshold = 0.8  # Adjust this threshold as needed

# Step 3: Identify highly correlated pairs of columns
highly_correlated_columns = set()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) >= threshold:
            highly_correlated_columns.add(correlation_matrix.columns[i])

# Step 4: Create a list of columns to be removed, leaving only one
columns_to_remove = []

for column in highly_correlated_columns:
    columns = correlation_matrix.columns[correlation_matrix[column] >= threshold].tolist()
    if len(columns) > 1:
        columns_to_remove.extend(columns[1:])  # Keep the first column and remove the rest

# Step 5: Remove the columns from the DataFrame
train_cleaned = train.drop(columns=columns_to_remove)

# Display the cleaned DataFrame
print(train_cleaned)

      event_id_means  down_time_means  action_time_means  down_time_stds  \
0             1279.0     8.481808e+05         116.246774   395112.665961   
1             1227.5     5.188553e+05         112.221271   384959.404177   
2             2068.5     8.284918e+05         101.837766   489500.796565   
3              778.5     7.854830e+05         121.848329   385205.014399   
4             1266.0     7.133542e+05         123.943896   405576.409034   
...              ...              ...                ...             ...   
2466          2370.0     7.361019e+05         105.437856   503882.020411   
2467          1302.5     8.419662e+05          82.266129   512744.745940   
2468          1532.0     1.229015e+06          75.605615   514320.848199   
2469          1621.5     5.765185e+05          89.277915   334477.976640   
2470          1810.0     1.076844e+06          83.237082   581107.759299   

      action_time_stds  event_id_mins  down_time_mins  action_time_mins  \
0           

In [47]:
y=train_cleaned['score']
X=train_cleaned.drop(['score'], axis=1)

In [48]:
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2)

# Fitting a Stacking Regressor with catboost

In [49]:
from sklearn.ensemble import StackingRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor

# Define the base models
base_models = [
    ('catboost', CatBoostRegressor(depth=6, iterations=500, learning_rate=0.01)),
    ('cat',  CatBoostRegressor()),
    ('RF3',RandomForestRegressor()),
    ('svr', XGBRegressor()),
    ('xgb', XGBRegressor(n_estimators=200))
]
# Define the final estimator
final_estimator = SVR()  
# Create the stacking regressor
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=final_estimator
)
# Fit the stacking regressor with your training data
stacking_regressor.fit(X_train, y_train)
# Make predictions and evaluate the model
y_pred = stacking_regressor.predict(X_test)

0:	learn: 1.0225823	total: 1.82ms	remaining: 906ms
1:	learn: 1.0193801	total: 3.6ms	remaining: 898ms
2:	learn: 1.0156378	total: 4.92ms	remaining: 816ms
3:	learn: 1.0122622	total: 6.39ms	remaining: 792ms
4:	learn: 1.0086839	total: 7.63ms	remaining: 755ms
5:	learn: 1.0049746	total: 8.91ms	remaining: 734ms
6:	learn: 1.0014443	total: 10.2ms	remaining: 715ms
7:	learn: 0.9978613	total: 11.5ms	remaining: 708ms
8:	learn: 0.9941365	total: 12.8ms	remaining: 698ms
9:	learn: 0.9907117	total: 14.1ms	remaining: 689ms
10:	learn: 0.9873594	total: 15.4ms	remaining: 684ms
11:	learn: 0.9842736	total: 16.7ms	remaining: 680ms
12:	learn: 0.9809468	total: 18.1ms	remaining: 676ms
13:	learn: 0.9776521	total: 19.3ms	remaining: 670ms
14:	learn: 0.9743807	total: 20.6ms	remaining: 665ms
15:	learn: 0.9712717	total: 21.8ms	remaining: 660ms
16:	learn: 0.9680292	total: 23.1ms	remaining: 656ms
17:	learn: 0.9649371	total: 24.3ms	remaining: 651ms
18:	learn: 0.9615733	total: 25.6ms	remaining: 647ms
19:	learn: 0.9585022	to

In [50]:
y_preds=stacking_regressor.predict(X_test)
mean_absolute_error(y_test, y_preds)

0.5389324332806544

In [51]:
test_data=feature_extraction(test)
preds=stacking_regressor.predict(test_data)
preds

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- action_time_maxs
- action_time_medians
- cursor_position_maxs
- cursor_position_means
- cursor_position_medians
- ...


In [46]:
submission=pd.DataFrame({'id':test_data.index, 'score':preds})
submission.to_csv('submission.csv', index=False)