# Linking Writing Processes to Writing Quality

# Import the packages needed for this analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training logs and preview the first rows.
train_log = pd.read_csv(
    '/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv'
)
train_log.head()

In [None]:
# Load the training scores and preview the first rows.
train_scores = pd.read_csv(
    '/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv'
)
train_scores.head()

In [None]:
# Load the test logs and preview the first rows.
test = pd.read_csv(
    '/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv'
)
test.head()

In [None]:
# Element-wise comparison of the column labels of the train and test logs;
# every entry should be True if both files share the same columns in the same order.
train_log.columns==test.columns

In [None]:
# (rows, columns) of the train logs followed by (rows, columns) of the test logs.
train_log.shape, test.shape

 What I note is that the score is the same for every row belonging to a given student. That means the score is for the final paper, while the other variables keep changing from row to row. Let's confirm this by looking at the second student.
 
 In light of this, we have to aggregate the data per student. This will make the final predictions easier, because each student then has exactly one row of features.

# Aggregating the data

## Let's start by aggregating the numerical data

In [None]:
# create a function that can be applied also to the test dataset

def aggregation(data):
    """Summarise every ``int64`` column of ``data`` per ``id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``id`` column. Only its ``int64`` columns are
        summarised; ``data`` itself is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` (the index). For each summarised column ``c`` the
        result holds ``c_means``, ``c_stds``, ``c_medians``, ``c_mins`` and
        ``c_maxs``; columns are grouped by statistic, in that order.
    """
    # Explicit copy: adding ``id`` below must not write into (or raise a
    # SettingWithCopyWarning about) a selection derived from the caller's frame.
    numeric = data.select_dtypes('int64').copy()
    numeric['id'] = data['id']

    # Build the groupby once and reuse it for every statistic.
    grouped = numeric.groupby('id')
    summaries = [
        (grouped.mean(), '_means'),
        (grouped.std(), '_stds'),
        (grouped.median(), '_medians'),
        (grouped.min(), '_mins'),
        (grouped.max(), '_maxs'),
    ]
    for frame, suffix in summaries:
        frame.columns = [col + suffix for col in frame.columns]

    return pd.concat([frame for frame, _ in summaries], axis=1)

    

In [None]:




def character_aggregation(data):
    """Count the distinct values of every ``object`` column of ``data`` per ``id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``id`` column. Only its ``object``-dtype columns
        are counted; ``data`` itself is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` (the index) holding, for each object column, the
        number of unique values seen for that ``id``.
    """
    # Explicit copy: adding ``id`` below must not write into (or raise a
    # SettingWithCopyWarning about) a selection derived from the caller's frame.
    character_df = data.select_dtypes('object').copy()
    character_df['id'] = data['id']

    return character_df.groupby('id').nunique()

In [None]:
# Final step: combine both per-id summaries into a single feature table.
def feature_extraction(data):
    """Return the numeric and character aggregations of ``data`` joined on ``id``."""
    numeric_features = aggregation(data)
    character_features = character_aggregation(data)
    return pd.merge(numeric_features, character_features, on='id', how='inner')


train = feature_extraction(train_log)
    

In [None]:
# Preview the first rows of the per-id feature table built from the train logs.
train.head()

In [None]:
# ``train`` is indexed by id in the order produced by the groupby, whereas
# ``train_scores`` is in file order. Look the scores up by id so that every
# feature row is paired with its own score instead of relying on both tables
# happening to share the same row order.
# NOTE(review): assumes ``train_scores`` has an ``id`` column matching the log
# ids; a missing id raises a KeyError here rather than silently misaligning.
X = train
y = train_scores.set_index('id').loc[X.index, 'score']

In [None]:
# Hold out 20% of the students for validation. The seed is fixed so that the
# split, and therefore the reported validation error, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fitting a Stacking Regressor with CatBoost

In [None]:
# Meta-learner that blends the predictions of the base models.
meta_regressor = LinearRegression()

# Two CatBoost models with identical settings and the same default seed are
# expected to produce the same predictions, which would hand the meta-learner
# duplicate, perfectly collinear inputs. Distinct seeds make the two base
# learners differ; verbose=0 silences the per-iteration training log.
# The estimator names are kept as they were.
base_regressors = [
    ('RF1', CatBoostRegressor(random_seed=0, verbose=0)),
    ('RF', CatBoostRegressor(random_seed=1, verbose=0)),
]

# Create the Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=base_regressors,
    final_estimator=meta_regressor,
)

# Train the stacking regressor on the training split
stacking_regressor.fit(X_train, y_train)



In [None]:
# Mean absolute error of the stacked model on the held-out split.
y_preds = stacking_regressor.predict(X_test)
mean_absolute_error(y_test, y_preds)

In [None]:
# Display the raw test logs before feature extraction.
test

In [None]:
# Build the same per-id features for the test logs, then predict their scores.
test_data = feature_extraction(test)
preds = stacking_regressor.predict(test_data)
preds

In [None]:
# One row per test id (the index of the feature table) with its predicted score.
submission = pd.DataFrame(
    {
        'id': test_data.index,
        'score': preds,
    }
)
submission.to_csv('submission.csv', index=False)