In [1]:
import polars as pl
import pandas as pd
import numpy as np
import gc
import re,torch,os
from collections import Counter,defaultdict
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import Lasso, ElasticNet, BayesianRidge, LassoLarsIC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from scipy.stats import skew, kurtosis
import warnings
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.metrics import mean_squared_error
warnings.filterwarnings("ignore")
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import lightgbm as lgb


import random
seed=2023
np.random.seed(seed)
random.seed(seed)

In [2]:
train_logs=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv")
print(f"len(train_logs):{len(train_logs)}")
train_logs=train_logs.sort_values(by=['id', 'down_time'])
train_logs = train_logs.reset_index(drop=True)
train_logs['event_id'] = train_logs.groupby('id').cumcount() + 1
train_scores=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv")
test_logs=pd.read_csv("/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv")
print(f"len(test_logs):{len(test_logs)}")
test_logs=test_logs.sort_values(by=['id', 'down_time'])
test_logs = test_logs.reset_index(drop=True)
test_logs['event_id'] = test_logs.groupby('id').cumcount() + 1
test_logs.to_csv("test_logs.csv",index=None)

len(train_logs):8405898
len(test_logs):6


In [3]:
def getEssays(df):
    textInputDf = df[['id', 'activity', 'cursor_position', 'text_change']]
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']
    valCountsArr = textInputDf['id'].value_counts(sort=False).values
    lastIndex = 0
    essaySeries = pd.Series()
    
    for index, valCount in enumerate(valCountsArr):
        currTextInput = textInputDf[['activity', 'cursor_position', 'text_change']].iloc[lastIndex : lastIndex + valCount]
        lastIndex += valCount
        essayText = ""
        
        for Input in currTextInput.values:
            if Input[0] == 'Replace':
                replaceTxt = Input[2].split(' => ')
                essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
                continue
            if Input[0] == 'Paste':
                essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
                continue
            if Input[0] == 'Remove/Cut':
                essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
                continue
            
            if "M" in Input[0]:
                croppedTxt = Input[0][10:]
                splitTxt = croppedTxt.split(' To ')
                valueArr = [item.split(', ') for item in splitTxt]
                moveData = (int(valueArr[0][0][1:]), 
                            int(valueArr[0][1][:-1]), 
                            int(valueArr[1][0][1:]), 
                            int(valueArr[1][1][:-1]))
                
                if moveData[0] != moveData[2]:
                    if moveData[0] < moveData[2]:
                        essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] +\
                        essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                    else:
                        essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] +\
                        essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                continue
            
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
        
        essaySeries[index] = essayText
    
    essaySeries.index =  textInputDf['id'].unique()
    return pd.DataFrame(essaySeries, columns=['essay']).reset_index().rename(columns={"index":'id'})


In [4]:
def q1(x):
    return x.quantile(0.25)

def q3(x):
    return x.quantile(0.75)

AGGREGATIONS = ['count', 'mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, 'skew', kurtosis, 'sum']

def split_essays_into_words(df):
    essay_df = df
    essay_df['word'] = essay_df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!', x))
    essay_df = essay_df.explode('word')
    essay_df['word_len'] = essay_df['word'].apply(lambda x: len(x))
    essay_df = essay_df[essay_df['word_len'] != 0]
    return essay_df

def compute_word_aggregations(word_df):
    word_agg_df = word_df[['id','word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    
    for word_l in [5, 6, 7, 8, 9, 10, 11, 12]:
        word_agg_df[f'word_len_ge_{word_l}_count'] = word_df[word_df['word_len'] >= word_l].groupby(['id']).count().iloc[:, 0]
        word_agg_df[f'word_len_ge_{word_l}_count'] = word_agg_df[f'word_len_ge_{word_l}_count'].fillna(0)
    
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df

def split_essays_into_sentences(df):
    essay_df = df
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!', x))
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n', '').strip())
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.sent_len != 0].reset_index(drop=True)
    return essay_df

def compute_sentence_aggregations(df):
    sent_agg_df = pd.concat(
        [df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    
    for sent_l in [50, 60, 75, 100]:
        sent_agg_df[f'sent_len_ge_{sent_l}_count'] = df[df['sent_len'] >= sent_l].groupby(['id']).count().iloc[:, 0]
        sent_agg_df[f'sent_len_ge_{sent_l}_count'] = sent_agg_df[f'sent_len_ge_{sent_l}_count'].fillna(0)
    
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df

def split_essays_into_paragraphs(df):
    essay_df = df
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
    essay_df = essay_df.explode('paragraph')
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x)) 
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.paragraph_len != 0].reset_index(drop=True)
    return essay_df

def compute_paragraph_aggregations(df):
    paragraph_agg_df = pd.concat(
        [df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    ) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df


In [5]:
print("train_essays")
train_essays = pd.read_csv('/kaggle/input/writing-quality-challenge-constructed-essays/train_essays_fast.csv')
print("train_word_agg_df")
train_word_agg_df = compute_word_aggregations(split_essays_into_words(train_essays))
print("train_sent_agg_df")
train_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(train_essays))
print("train_paragraph_agg_df")
train_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(train_essays))
print("test_essays")
test_essays = getEssays(test_logs)
test_essays_copy=test_essays.copy()
print("test_word_agg_df")
test_word_agg_df = compute_word_aggregations(split_essays_into_words(test_essays))
print("test_sent_agg_df")
test_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(test_essays))
print("test_paragraph_agg_df")
test_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(test_essays))

train_essays
train_word_agg_df
train_sent_agg_df
train_paragraph_agg_df
test_essays
test_word_agg_df
test_sent_agg_df
test_paragraph_agg_df


In [6]:
class Preprocessor:
    
    def __init__(self):
        self.activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste', 'Move From']
        self.events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',',
                       'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
        self.text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
        self.punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/', '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
        self.gaps = [1, 2, 3, 5, 10, 20, 50, 100]
        self.idf = defaultdict(float)

    def activity_counts(self, df):
        tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
        ret = list()
        for li in tmp_df['activity'].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.activities:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols
        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = np.log(df.shape[0] / (ret[col].sum() + 1))
                self.idf[col] = idf
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret

    def event_counts(self, df, colname):
        tmp_df = df.groupby('id').agg({colname: list}).reset_index()
        ret = list()
        for li in tmp_df[colname].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.events:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols
        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf

            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret

    def text_change_counts(self, df):
        tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
        ret = list()
        for li in tmp_df['text_change'].values:
            items = list(Counter(li).items())
            di = dict()
            for k in self.text_changes:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols
        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf

            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret

    def match_punctuations(self, df):
        tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
        ret = list()
        for li in tmp_df['down_event'].values:
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in self.punctuations:
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
        return ret

    def get_input_words(self, df):
        tmp_df = df[(~df['text_change'].str.contains('=>')) & (df['text_change'] != 'NoChange')].reset_index(drop=True)
        tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
        tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
        tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df.drop(['text_change'], axis=1, inplace=True)
        return tmp_df

    def make_feats(self, df):
        print("Starting to engineer features")
        feats = pd.DataFrame({'id': df['id'].unique().tolist()})
        print("Engineering time data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'up_time_shift{gap}'] = df.groupby('id')['up_time'].shift(gap)
            df[f'action_time_gap{gap}'] = df['down_time'] - df[f'up_time_shift{gap}']
        df.drop(columns=[f'up_time_shift{gap}' for gap in self.gaps], inplace=True)

        print("Engineering cursor position data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'cursor_position_shift{gap}'] = df.groupby('id')['cursor_position'].shift(gap)
            df[f'cursor_position_change{gap}'] = df['cursor_position'] - df[f'cursor_position_shift{gap}']
            df[f'cursor_position_abs_change{gap}'] = np.abs(df[f'cursor_position_change{gap}'])
        df.drop(columns=[f'cursor_position_shift{gap}' for gap in self.gaps], inplace=True)

        print("Engineering word count data")
        for gap in self.gaps:
            print(f"-> for gap {gap}")
            df[f'word_count_shift{gap}'] = df.groupby('id')['word_count'].shift(gap)
            df[f'word_count_change{gap}'] = df['word_count'] - df[f'word_count_shift{gap}']
            df[f'word_count_abs_change{gap}'] = np.abs(df[f'word_count_change{gap}'])
        df.drop(columns=[f'word_count_shift{gap}' for gap in self.gaps], inplace=True)

        print("Engineering statistical summaries for features")
        feats_stat = [
            ('event_id', ['max']),
            ('down_time',['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum']),
            ('up_time',['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum']),
            ('action_time', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt,'last', 'first','median']),
            ('activity', ['nunique']),
            ('down_event', ['nunique']),
            ('up_event', ['nunique']),
            ('text_change', ['nunique']),
            ('cursor_position', ['nunique', 'max', 'quantile', 'sem', 'mean', 'std', 'min','last', 'first',  'median', 'sum']),
            ('word_count', ['nunique', 'max', 'quantile', 'sem', 'mean', 'std', 'min', 'last', 'first','median', 'sum'])]
        for gap in self.gaps:
            feats_stat.extend([
                (f'action_time_gap{gap}', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt]),
                (f'cursor_position_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt]),
                (f'word_count_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew', pd.DataFrame.kurt])
            ])

        pbar = feats_stat
        for item in pbar:
            colname, methods = item[0], item[1]
            for method in methods:
                if isinstance(method, str):
                    method_name = method
                else:
                    method_name = method.__name__
                tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method_name}'})
                feats = feats.merge(tmp_df, on='id', how='left')

        print("Engineering activity counts data")
        tmp_df = self.activity_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        print("Engineering event counts data")
        tmp_df = self.event_counts(df, 'down_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        tmp_df = self.event_counts(df, 'up_event')
        feats = pd.concat([feats, tmp_df], axis=1)

        print("Engineering text change counts data")
        tmp_df = self.text_change_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)

        print("Engineering punctuation counts data")
        tmp_df = self.match_punctuations(df)
        feats = pd.concat([feats, tmp_df], axis=1)

        print("Engineering input words data")
        tmp_df = self.get_input_words(df)
        feats = pd.merge(feats, tmp_df, on='id', how='left')

        print("Engineering ratios data")
        feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
        feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
        feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
        feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']
        
        print("Done!")
        return feats

preprocessor = Preprocessor()
print("Engineering features for training data")
train_feats = preprocessor.make_feats(train_logs)
print("-" * 25)
print("Engineering features for test data")
test_feats = preprocessor.make_feats(test_logs)


Engineering features for training data
Starting to engineer features
Engineering time data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering cursor position data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering word count data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering statistical summaries for features
Engineering activity counts data
Engineering event counts data
Engineering text change counts data
Engineering punctuation counts data
Engineering input words data
Engineering ratios data
Done!
-------------------------
Engineering features for test data
Starting to engineer features
Engineering time data
-> for gap 1
-> for gap 2
-> for gap 3
-> for gap 5
-> for gap 10
-> for gap 20
-> for gap 50
-> for gap 100
Engineering cursor position data
-> for gap 1
-> for gap 2

In [7]:
data = []

for logs in [train_logs, test_logs]:
    logs['up_time_lagged'] = logs.groupby('id')['up_time'].shift(1).fillna(logs['down_time'])
    logs['time_diff'] = abs(logs['down_time'] - logs['up_time_lagged']) / 1000

    group = logs.groupby('id')['time_diff']
    largest_lantency = group.max()
    smallest_lantency = group.min()
    median_lantency = group.median()
    initial_pause = logs.groupby('id')['down_time'].first() / 1000
    pauses_half_sec = group.apply(lambda x: ((x > 0.5) & (x <= 1)).sum())
    pauses_1_sec = group.apply(lambda x: ((x > 1) & (x <= 1.5)).sum())
    pauses_1_half_sec = group.apply(lambda x: ((x > 1.5) & (x <= 2)).sum())
    pauses_2_sec = group.apply(lambda x: ((x > 2) & (x <= 3)).sum())
    pauses_3_sec = group.apply(lambda x: (x > 3).sum())

    data.append(pd.DataFrame({
        'id': logs['id'].unique(),
        'largest_lantency': largest_lantency,
        'smallest_lantency': smallest_lantency,
        'median_lantency': median_lantency,
        'initial_pause': initial_pause,
        'pauses_half_sec': pauses_half_sec,
        'pauses_1_sec': pauses_1_sec,
        'pauses_1_half_sec': pauses_1_half_sec,
        'pauses_2_sec': pauses_2_sec,
        'pauses_3_sec': pauses_3_sec,
    }).reset_index(drop=True))

train_eD592674, test_eD592674 = data

gc.collect()

train_feats = train_feats.merge(train_eD592674, on='id', how='left')
test_feats = test_feats.merge(test_eD592674, on='id', how='left')
train_feats = train_feats.merge(train_scores, on='id', how='left')


In [8]:
train_feats=train_feats.merge(train_word_agg_df,on='id', how='left')
train_feats=train_feats.merge(train_sent_agg_df,on='id', how='left')
train_feats=train_feats.merge(train_paragraph_agg_df,on='id', how='left')

test_feats=test_feats.merge(test_word_agg_df,on='id', how='left')
test_feats=test_feats.merge(test_sent_agg_df,on='id', how='left')
test_feats=test_feats.merge(test_paragraph_agg_df,on='id', how='left')

In [9]:
keys=train_feats.keys().values
unique_cols=[key for key in keys if train_feats[key].nunique()<2]
print(f"unique_cols:{unique_cols}")
train_feats = train_feats.drop(columns=unique_cols)
test_feats = test_feats.drop(columns=unique_cols)

unique_cols:['cursor_position_min', 'word_count_change1_quantile', 'word_count_change2_quantile', 'activity_5_count', 'smallest_lantency']


In [10]:
# train_feats = train_feats.drop('score',axis=1)
# train_feats = pd.concat([train_feats, train_scores['score']], axis=1)

# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras import layers, models 
# from tensorflow.keras.layers import Dense, Reshape, LeakyReLU
# from tensorflow.keras.optimizers import Adam 
# from tensorflow.keras.losses import MeanSquaredError

# generator = Sequential()
# generator.add(Dense(128, input_shape=(100,)))
# generator.add(LeakyReLU(alpha=0.01)) 
# generator.add(Dense(256))
# generator.add(LeakyReLU(alpha=0.01))
# generator.add(Dense(404)) 

# discriminator = Sequential()
# discriminator.add(Dense(256, input_shape=(404,)))
# discriminator.add(LeakyReLU(alpha=0.01))
# discriminator.add(Dense(128))
# discriminator.add(LeakyReLU(alpha=0.01))
# discriminator.add(Dense(1))

# gan_model = Sequential()
# gan_model.add(generator)
# discriminator.trainable = False
# gan_model.add(discriminator)

# discriminator.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.0002, beta_1=0.5))
# gan_model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.0002, beta_1=0.5))

# epochs = 1000
# batch_size = 128

# for epoch in range(epochs):
#     idx = np.random.randint(0, train_feats.shape[0], batch_size)
#     real_samples = train_feats.drop(columns=['id']).iloc[idx].values
#     fake_samples = generator.predict(np.random.rand(batch_size, 100))
#     real_labels = real_samples 
#     fake_labels = fake_samples
#     real_samples_tf = tf.convert_to_tensor(real_samples, dtype=tf.float32)
#     fake_samples_tf = tf.convert_to_tensor(fake_samples, dtype=tf.float32)
    
    
#     d_loss_real = discriminator.train_on_batch(real_samples_tf, real_labels)
#     d_loss_fake = discriminator.train_on_batch(fake_samples_tf, fake_labels )
#     d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
    
#     noise = np.random.rand(batch_size, 100)
#     valid_labels = noise
#     g_loss = gan_model.train_on_batch(noise, valid_labels)
    
#     if epoch % 100 == 0:
#         print(f"Epoch {epoch}, D Loss: {d_loss}, G_Loss: {g_loss}")

 

# generated_feats = generator.predict(np.random.rand(2471,100))

# generated_ids = range(1, len(generated_feats) + 1)
# df_generated = pd.DataFrame(generated_feats, columns=train_feats.columns[1:])
# df_generated['id'] = [f"{i:08d}" for i in generated_ids]

# train_feats = pd.concat([train_feats, df_generated], axis=0, ignore_index=True)

In [11]:
def make_pipeline(model):
    return Pipeline([
        ('remove_infs', FunctionTransformer(lambda x: np.nan_to_num(x, nan=np.nan, posinf=0, neginf=0))),
        ('imputer', SimpleImputer(strategy='mean')),
        ('normalizer', FunctionTransformer(lambda x: np.log1p(np.abs(x)))),
        ('scaler', RobustScaler()),
        ('model', model)
    ])

def train_valid_split(data_x, data_y, train_idx, valid_idx):
    x_train = data_x.iloc[train_idx]
    y_train = data_y[train_idx]
    x_valid = data_x.iloc[valid_idx]
    y_valid = data_y[valid_idx]
    return x_train, y_train, x_valid, y_valid

In [12]:
lasso = make_pipeline(
    Lasso(
        alpha=0.001,
        fit_intercept=True,
        max_iter=50,
        selection='random',
        random_state=42
    )
)

ENet = make_pipeline(
    ElasticNet(
        alpha=0.1,
        fit_intercept=True,
        l1_ratio=0.1,
        max_iter=100,
        random_state=42
    )
)

GBoost = make_pipeline(
    GradientBoostingRegressor(
        learning_rate=0.1,
        max_depth=3,
        min_samples_leaf=4,
        min_samples_split=2,
        n_estimators=50,
        subsample=0.8,
        random_state=42
    )
)

ridge = make_pipeline(
    Ridge(
        alpha=1.0,
        fit_intercept=True,
        solver='svd'
    )
)

lr = make_pipeline(
    LinearRegression(
        fit_intercept=True,
        positive=False
    )
)

rf = make_pipeline(
    RandomForestRegressor(
        bootstrap=True,
        max_depth=20,
        max_features='auto',
        min_samples_leaf=2,
        min_samples_split=10,
        n_estimators=200,
        random_state=42
    )
)

svr = make_pipeline(
    SVR(
        kernel='rbf',
        C=100,
        epsilon=0.1
    )
)


In [13]:
cb_param = {
    'n_estimators': 2424,
    'depth': 7,
    'od_wait': 15,
    'l2_leaf_reg': 5.7698693615696754,
    'colsample_bylevel': 0.7197709029314262,
    'subsample': 0.46524030005499395,
    'learning_rate': 0.009934387110797558,
    'bagging_temperature': 0.3562146562395841,
    'border_count': 255,
    'min_data_in_leaf': 79,
    'random_state': 42,
    'verbose': 0
}
cbr = CatBoostRegressor(**cb_param)

xgb_param = {
    'objective': 'reg:squarederror',
    'n_estimators': 1002,
    'learning_rate': 0.02670836201283028,
    'max_depth': 4,
    'subsample': 0.487752178166985,
    'colsample_bytree': 0.45093000793609206,
    'colsample_bylevel': 0.71,
    'eta': 0.011957851912348194,
    'gamma': 0.0022864202534752924,
    'reg_alpha': 0.02437079001480029,
    'reg_lambda': 6.834478468800099,
    'max_delta_step': 8,
    'min_child_weight': 0.32060573402630715,
    'random_state': 42,
    'eval_metric': 'rmse',
    'verbose' : 1,
}          
xgbr = XGBRegressor(**xgb_param)

lgbm_param = {
    'reg_alpha': 0.5087856916700971,
    'reg_lambda': 0.2405984837342317,
    'colsample_bytree': 0.8378257913675838,
    'subsample': 0.5411528279742139,
    'learning_rate': 0.016140199269271273,
    'max_depth': 40,
    'num_leaves': 15,
    'min_child_samples': 13,
    'bagging_fraction': 0.8410863683391213,
    'lambda_l1': 1.8151102699226822e-06,
    'lambda_l2': 0.017602890537921387,
    'bagging_freq': 2,
    'n_estimators': 1024,
    'metric': 'rmse',
    'random_state': 42,
    'force_col_wise': True,
    'verbosity': 1,
}
lgbmr = LGBMRegressor(**lgbm_param)

In [14]:
stack_gen = StackingRegressor(
    estimators=[
        ('enet', ENet),
        ('lasso', lasso),
        ('ridge', ridge),
        ('gboost', GBoost),
        ('rf', rf),
        ('cbr', cbr),
        ('svr', svr) 
    ], 
    final_estimator=lr, 
    passthrough=False,
)

In [15]:
train_feats.replace([np.inf, -np.inf], np.nan, inplace=True)

x              = train_feats.drop(['id', 'score'], axis=1)
y              = train_feats['score'].values

test_ids = test_feats['id'].values
testin_x = test_feats.drop(['id'], axis=1)

In [16]:
n_splits = 10 
random_state = 42
early_stopping_rounds = 400
skf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

In [17]:
xgb_oof = []  
df_importance_list_xgb = []  
ycol = 'score'
feature_names = list(filter(lambda x: x not in [ycol, 'id'], train_feats.columns))

for i, (train_index, valid_index) in enumerate(skf.split(x, y.astype(str))):
    train_x, train_y, valid_x, valid_y = train_valid_split(x, y, train_index, valid_index)
    
    print('\nFold_{} Training'.format(i+1))
    xgbr.fit(train_x, 
              train_y,
              eval_set=[(train_x, train_y), (valid_x, valid_y)],
              early_stopping_rounds=400,
              verbose=10)
    xgbr_pred = xgbr.predict(valid_x)
    
    df_xgb_oof = train_feats.iloc[valid_index][['id', ycol]].copy()
    df_xgb_oof['pred'] = xgbr_pred
    xgb_oof.append(df_xgb_oof)
    
    xgb_preds = xgbr.predict(testin_x)
    df_importance_xgb = pd.DataFrame({
        'column': feature_names,
        'importance': xgbr.feature_importances_,
    })
    df_importance_list_xgb.append(df_importance_xgb)

df_importance_xgb = pd.concat(df_importance_list_xgb)
df_importance_xgb = df_importance_xgb.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()



Fold_1 Training
[0]	validation_0-rmse:1.00887	validation_1-rmse:1.01504
[10]	validation_0-rmse:0.87988	validation_1-rmse:0.89172
[20]	validation_0-rmse:0.78590	validation_1-rmse:0.80209
[30]	validation_0-rmse:0.71682	validation_1-rmse:0.73715
[40]	validation_0-rmse:0.66830	validation_1-rmse:0.69464
[50]	validation_0-rmse:0.63241	validation_1-rmse:0.66401
[60]	validation_0-rmse:0.60426	validation_1-rmse:0.64434
[70]	validation_0-rmse:0.58415	validation_1-rmse:0.63024
[80]	validation_0-rmse:0.56817	validation_1-rmse:0.62203
[90]	validation_0-rmse:0.55491	validation_1-rmse:0.61333
[100]	validation_0-rmse:0.54347	validation_1-rmse:0.60851
[110]	validation_0-rmse:0.53433	validation_1-rmse:0.60455
[120]	validation_0-rmse:0.52593	validation_1-rmse:0.60249
[130]	validation_0-rmse:0.51842	validation_1-rmse:0.59990
[140]	validation_0-rmse:0.51135	validation_1-rmse:0.59861
[150]	validation_0-rmse:0.50553	validation_1-rmse:0.59652
[160]	validation_0-rmse:0.49958	validation_1-rmse:0.59612
[170]	va

In [18]:
lgb_oof = []  
df_importance_list_lgb = []  
ycol = 'score'
feature_names = list(filter(lambda x: x not in [ycol, 'id'], train_feats.columns))

for i, (train_index, valid_index) in enumerate(skf.split(x, y.astype(str))):
    train_x, train_y, valid_x, valid_y = train_valid_split(x, y, train_index, valid_index)
    
    print('\nFold_{} Training'.format(i+1))
    lgbmr.fit(train_x, 
              train_y,
              eval_metric='rmse',
              eval_set=[(train_x, train_y), (valid_x, valid_y)],
              early_stopping_rounds=400,
              verbose=10)
    
    lgbmr_pred = lgbmr.predict(valid_x)
    
    df_lgb_oof = train_feats.iloc[valid_index][['id', ycol]].copy()  
    df_lgb_oof['pred'] = lgbmr_pred
    lgb_oof.append(df_lgb_oof)  
    pred_test_lgb = lgbmr.predict(testin_x)
    
    df_importance_lgb = pd.DataFrame({
        'column': feature_names,
        'importance': lgbmr.feature_importances_,
    })
    
    df_importance_list_lgb.append(df_importance_lgb)

df_importance_lgb = pd.concat(df_importance_list_lgb)
df_importance_lgb = df_importance_lgb.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()


Fold_1 Training
[LightGBM] [Info] Total Bins 86881
[LightGBM] [Info] Number of data points in the train set: 2223, number of used features: 398
[LightGBM] [Info] Start training from score 3.710976
[10]	training's rmse: 0.929748	valid_1's rmse: 0.94261
[20]	training's rmse: 0.852879	valid_1's rmse: 0.870942
[30]	training's rmse: 0.790783	valid_1's rmse: 0.813414
[40]	training's rmse: 0.739501	valid_1's rmse: 0.76667
[50]	training's rmse: 0.697762	valid_1's rmse: 0.729205
[60]	training's rmse: 0.663776	valid_1's rmse: 0.697687
[70]	training's rmse: 0.635938	valid_1's rmse: 0.673727
[80]	training's rmse: 0.612001	valid_1's rmse: 0.655508
[90]	training's rmse: 0.592646	valid_1's rmse: 0.641934
[100]	training's rmse: 0.576151	valid_1's rmse: 0.631156
[110]	training's rmse: 0.562293	valid_1's rmse: 0.622034
[120]	training's rmse: 0.549926	valid_1's rmse: 0.613795
[130]	training's rmse: 0.538579	valid_1's rmse: 0.607248
[140]	training's rmse: 0.528672	valid_1's rmse: 0.602414
[150]	training'

In [19]:
stack_oof = []
df_importance_list_stack = []

ycol = 'score'
feature_names = list(filter(lambda x: x not in [ycol, 'id'], train_feats.columns))

skf1 = StratifiedKFold(n_splits=6, random_state=random_state, shuffle=True)

# stack_gen.fit(x,y)
# pred_test_stack = stack_gen.predict(testin_x)

for i, (train_index, valid_index) in enumerate(skf1.split(x, y.astype(str))):
    train_x, train_y, valid_x, valid_y = train_valid_split(x, y, train_index, valid_index)
    
    print('\nFold_{} Training (Stacking)'.format(i+1))
    stack_gen.fit(train_x, train_y)
    stack_gen_pred = stack_gen.predict(valid_x)
    
    df_stack_oof = train_feats.iloc[valid_index][['id', ycol]].copy()
    df_stack_oof['pred'] = stack_gen_pred
    stack_oof.append(df_stack_oof)
    pred_test_stack = stack_gen.predict(testin_x)


Fold_1 Training (Stacking)

Fold_2 Training (Stacking)

Fold_3 Training (Stacking)

Fold_4 Training (Stacking)

Fold_5 Training (Stacking)

Fold_6 Training (Stacking)


In [20]:
df_xgb_oof = pd.concat(xgb_oof)
rmse = mean_squared_error(df_xgb_oof[ycol], df_xgb_oof['pred'], squared=False)
print('xgb rmse:', rmse)

df_lgb_oof = pd.concat(lgb_oof)
rmse = mean_squared_error(df_lgb_oof[ycol], df_lgb_oof['pred'], squared=False)
print('lgb rmse:', rmse)

df_stack_oof = pd.concat(stack_oof)
rmse = mean_squared_error(df_stack_oof[ycol], df_stack_oof['pred'], squared=False)
print('stack rmse:', rmse)

xgb rmse: 0.6098904186645924
lgb rmse: 0.6095817394643686
stack rmse: 0.609468539705985


In [21]:
xgb_oof_pred = df_xgb_oof['pred'].values
lgb_oof_pred = df_lgb_oof['pred'].values
stack_oof_pred = df_stack_oof['pred'].values

margin = 1000

current_RMSE = mean_squared_error(y, (xgb_oof_pred + lgb_oof_pred + stack_oof_pred) / 3, squared=False)

best_i = 0
best_j = 0

for i in range(0, margin):
    for j in range(0, margin - i):
        blend_oof_pred = (i * xgb_oof_pred + j * lgb_oof_pred + (margin - i - j) * stack_oof_pred) / margin
        
        if mean_squared_error(y, blend_oof_pred, squared=False) < current_RMSE:
            print(f"Current RMSE: {current_RMSE}")
            current_RMSE = mean_squared_error(y, blend_oof_pred, squared=False)
            best_i = i
            best_j = j

blending_weights = {
    'xgb': best_i / margin,
    'lgb': best_j / margin,
    'stack': (margin - best_i - best_j) / margin
}

print(f"Blending weights: {blending_weights}")

Current RMSE: 1.1876888060430648
Current RMSE: 1.1875364201864274
Current RMSE: 1.1873726614533784
Current RMSE: 1.1872099799480302
Current RMSE: 1.1870483761132755
Current RMSE: 1.1868878503893134
Current RMSE: 1.186728403213644
Current RMSE: 1.1865700350210617
Current RMSE: 1.186412746243651
Current RMSE: 1.1862565373107785
Current RMSE: 1.186101408649089
Current RMSE: 1.1859473606825002
Current RMSE: 1.1857943938321944
Current RMSE: 1.185642508516616
Current RMSE: 1.1854917051514637
Current RMSE: 1.1853419841496862
Current RMSE: 1.1851933459214763
Current RMSE: 1.1850457908742649
Current RMSE: 1.1848993194127166
Current RMSE: 1.1847539319387232
Current RMSE: 1.184609628851399
Current RMSE: 1.1844664105470755
Current RMSE: 1.184324277419295
Current RMSE: 1.1841832298588073
Current RMSE: 1.1840432682535622
Current RMSE: 1.1839043929887059
Current RMSE: 1.183766604446575
Current RMSE: 1.1836299030066921
Current RMSE: 1.1834942890457594
Current RMSE: 1.1833597629376547
Current RMSE: 1.1

In [22]:
blended_preds = (
    blending_weights['xgb'] * xgb_preds +
    blending_weights['lgb'] * pred_test_lgb +
    blending_weights['stack'] * pred_test_stack
)

blended_preds = np.clip(a=blended_preds, a_min=0.0, a_max=6.0)

sub1 = pd.DataFrame({'id': test_ids, 'score': blended_preds})
sub1.to_csv('submission.csv', index=False)
sub1

Unnamed: 0,id,score
0,0000aaaa,1.773148
1,2222bbbb,2.689286
2,4444cccc,2.598804


# SILVER BULLET 

In [23]:
DEBUG = True
SEED = 42
N_FOLD = 5

In [24]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
seed_everything(SEED)

In [25]:
num_cols = ['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count']
activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
text_changes = ['q', ' ', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']

def count_by_values(df, colname, values):
    fts = df.select(pl.col('id').unique(maintain_order=True))
    for i, value in enumerate(values):
        tmp_df = df.group_by('id').agg(pl.col(colname).is_in([value]).sum().alias(f'{colname}_{i}_cnt'))
        fts  = fts.join(tmp_df, on='id', how='left') 
    return fts

def dev_feats(df):
    
    print("< Count by values features >")
    
    feats = count_by_values(df, 'activity', activities)
    feats = feats.join(count_by_values(df, 'text_change', text_changes), on='id', how='left') 
    feats = feats.join(count_by_values(df, 'down_event', events), on='id', how='left') 
    feats = feats.join(count_by_values(df, 'up_event', events), on='id', how='left') 

    print("< Input words stats features >")
    temp = df.filter((~pl.col('text_change').str.contains('=>')) & (pl.col('text_change') != 'NoChange'))
    temp = temp.group_by('id').agg(pl.col('text_change').str.concat('').str.extract_all(r'q+'))
    temp = temp.with_columns(input_word_count = pl.col('text_change').list.lengths(),
                             input_word_length_mean = pl.col('text_change').apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_max = pl.col('text_change').apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_std = pl.col('text_change').apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_median = pl.col('text_change').apply(lambda x: np.median([len(i) for i in x] if len(x) > 0 else 0)),
                             input_word_length_skew = pl.col('text_change').apply(lambda x: skew([len(i) for i in x] if len(x) > 0 else 0)))
    temp = temp.drop('text_change')
    feats = feats.join(temp, on='id', how='left') 

    print("< Numerical columns features >")

    temp = df.group_by("id").agg(pl.sum('action_time').suffix('_sum'), pl.mean(num_cols).suffix('_mean'), pl.std(num_cols).suffix('_std'),
                                 pl.median(num_cols).suffix('_median'), pl.min(num_cols).suffix('_min'), pl.max(num_cols).suffix('_max'),
                                 pl.quantile(num_cols, 0.5).suffix('_quantile'))
    feats = feats.join(temp, on='id', how='left') 


    print("< Categorical columns features >")
    temp  = df.group_by("id").agg(pl.n_unique(['activity', 'down_event', 'up_event', 'text_change']))
    feats = feats.join(temp, on='id', how='left') 

    print("< Idle time features >")
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.group_by("id").agg(inter_key_largest_lantency = pl.max('time_diff'),
                                   inter_key_median_lantency = pl.median('time_diff'),
                                   mean_pause_time = pl.mean('time_diff'),
                                   std_pause_time = pl.std('time_diff'),
                                   total_pause_time = pl.sum('time_diff'),
                                   pauses_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 0.5) & (pl.col('time_diff') < 1)).count(),
                                   pauses_1_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1) & (pl.col('time_diff') < 1.5)).count(),
                                   pauses_1_half_sec = pl.col('time_diff').filter((pl.col('time_diff') > 1.5) & (pl.col('time_diff') < 2)).count(),
                                   pauses_2_sec = pl.col('time_diff').filter((pl.col('time_diff') > 2) & (pl.col('time_diff') < 3)).count(),
                                   pauses_3_sec = pl.col('time_diff').filter(pl.col('time_diff') > 3).count(),)
    feats = feats.join(temp, on='id', how='left') 
    
    print("< P-bursts features >")
    temp = df.with_columns(pl.col('up_time').shift().over('id').alias('up_time_lagged'))
    temp = temp.with_columns((abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000).fill_null(0).alias('time_diff'))
    temp = temp.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.with_columns(pl.col('time_diff')<2)
    temp = temp.with_columns(pl.when(pl.col("time_diff") & pl.col("time_diff").is_last()).then(pl.count()).over(pl.col("time_diff").rle_id()).alias('P-bursts'))
    temp = temp.drop_nulls()
    temp = temp.group_by("id").agg(pl.mean('P-bursts').suffix('_mean'), pl.std('P-bursts').suffix('_std'), pl.count('P-bursts').suffix('_count'),
                                   pl.median('P-bursts').suffix('_median'), pl.max('P-bursts').suffix('_max'),
                                   pl.first('P-bursts').suffix('_first'), pl.last('P-bursts').suffix('_last'))
    feats = feats.join(temp, on='id', how='left') 

    print("< R-bursts features >")
    temp = df.filter(pl.col('activity').is_in(['Input', 'Remove/Cut']))
    temp = temp.with_columns(pl.col('activity').is_in(['Remove/Cut']))
    temp = temp.with_columns(pl.when(pl.col("activity") & pl.col("activity").is_last()).then(pl.count()).over(pl.col("activity").rle_id()).alias('R-bursts'))
    temp = temp.drop_nulls()
    temp = temp.group_by("id").agg(pl.mean('R-bursts').suffix('_mean'), pl.std('R-bursts').suffix('_std'), 
                                   pl.median('R-bursts').suffix('_median'), pl.max('R-bursts').suffix('_max'),
                                   pl.first('R-bursts').suffix('_first'), pl.last('R-bursts').suffix('_last'))
    feats = feats.join(temp, on='id', how='left')
    
    return feats

In [26]:
AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']

def word_feats(df):
    essay_df = df
    df['word'] = df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!',x))
    df = df.explode('word')
    df['word_len'] = df['word'].apply(lambda x: len(x))
    df = df[df['word_len'] != 0]

    word_agg_df = df[['id','word_len']].groupby(['id']).agg(AGGREGATIONS)
    word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
    word_agg_df['id'] = word_agg_df.index
    word_agg_df = word_agg_df.reset_index(drop=True)
    return word_agg_df

def sent_feats(df):
    df['sent'] = df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
    df = df.explode('sent')
    df['sent'] = df['sent'].apply(lambda x: x.replace('\n','').strip())
    df['sent_len'] = df['sent'].apply(lambda x: len(x))
    df['sent_word_count'] = df['sent'].apply(lambda x: len(x.split(' ')))
    df = df[df.sent_len!=0].reset_index(drop=True)

    sent_agg_df = pd.concat([df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), 
                             df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1)
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df

def parag_feats(df):
    df['paragraph'] = df['essay'].apply(lambda x: x.split('\n'))
    df = df.explode('paragraph')
    df['paragraph_len'] = df['paragraph'].apply(lambda x: len(x)) 
    df['paragraph_word_count'] = df['paragraph'].apply(lambda x: len(x.split(' ')))
    df = df[df.paragraph_len!=0].reset_index(drop=True)
    
    paragraph_agg_df = pd.concat([df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), 
                                  df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df

def product_to_keys(logs, essays):
    essays['product_len'] = essays.essay.str.len()
    tmp_df = logs[logs.activity.isin(['Input', 'Remove/Cut'])].groupby(['id']).agg({'activity': 'count'}).reset_index().rename(columns={'activity': 'keys_pressed'})
    essays = essays.merge(tmp_df, on='id', how='left')
    essays['product_to_keys'] = essays['product_len'] / essays['keys_pressed']
    return essays[['id', 'product_to_keys']]

def get_keys_pressed_per_second(logs):
    temp_df = logs[logs['activity'].isin(['Input', 'Remove/Cut'])].groupby(['id']).agg(keys_pressed=('event_id', 'count')).reset_index()
    temp_df_2 = logs.groupby(['id']).agg(min_down_time=('down_time', 'min'), max_up_time=('up_time', 'max')).reset_index()
    temp_df = temp_df.merge(temp_df_2, on='id', how='left')
    temp_df['keys_per_second'] = temp_df['keys_pressed'] / ((temp_df['max_up_time'] - temp_df['min_down_time']) / 1000)
    return temp_df[['id', 'keys_per_second']]

In [27]:
train_logs    = pl.scan_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
train_feats   = dev_feats(train_logs)
train_feats   = train_feats.collect().to_pandas()

print('< Essay Reconstruction >')
train_logs             = train_logs.collect().to_pandas()
train_essays           = pd.read_csv('/kaggle/input/writing-quality-challenge-constructed-essays/train_essays_fast.csv')
train_feats            = train_feats.merge(word_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(sent_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(parag_feats(train_essays), on='id', how='left')
train_feats            = train_feats.merge(get_keys_pressed_per_second(train_logs), on='id', how='left')
train_feats            = train_feats.merge(product_to_keys(train_logs, train_essays), on='id', how='left')

keys=train_feats.keys().values
unique_cols=[key for key in keys if train_feats[key].nunique()<2]
print(f"unique_cols:{unique_cols}")
train_feats = train_feats.drop(columns=unique_cols)

print('< Mapping >')
train_scores   = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
data           = train_feats.merge(train_scores, on='id', how='left')
x              = data.drop(['id', 'score'], axis=1)
y              = data['score'].values

print(f'Number of features: {len(x.columns)}')

print('< Testing Data >')
test_logs   = pl.scan_csv('/kaggle/working/test_logs.csv')
test_feats  = dev_feats(test_logs)
test_feats  = test_feats.collect().to_pandas()

test_logs             = test_logs.collect().to_pandas()
test_essays           = test_essays_copy
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
test_feats            = test_feats.merge(get_keys_pressed_per_second(test_logs), on='id', how='left')
test_feats            = test_feats.merge(product_to_keys(test_logs, test_essays), on='id', how='left')

test_feats = test_feats.drop(columns=unique_cols)

test_ids = test_feats['id'].values
testin_x = test_feats.drop(['id'], axis=1)

< Count by values features >
< Input words stats features >
< Numerical columns features >
< Categorical columns features >
< Idle time features >
< P-bursts features >
< R-bursts features >
< Essay Reconstruction >
unique_cols:['cursor_position_min']
< Mapping >
Number of features: 164
< Testing Data >
< Count by values features >
< Input words stats features >
< Numerical columns features >
< Categorical columns features >
< Idle time features >
< P-bursts features >
< R-bursts features >


In [28]:
# train_feats = pd.concat([train_feats, train_scores['score']], axis=1)

# generator = Sequential()
# generator.add(Dense(128, input_shape=(100,)))
# generator.add(LeakyReLU(alpha=0.01)) 
# generator.add(Dense(256))
# generator.add(LeakyReLU(alpha=0.01))
# generator.add(Dense(165)) 

# discriminator = Sequential()
# discriminator.add(Dense(256, input_shape=(165,)))
# discriminator.add(LeakyReLU(alpha=0.01))
# discriminator.add(Dense(128))
# discriminator.add(LeakyReLU(alpha=0.01))
# discriminator.add(Dense(1))

# gan_model = Sequential()
# gan_model.add(generator)
# discriminator.trainable = False
# gan_model.add(discriminator)

# discriminator.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.0002, beta_1=0.5))
# gan_model.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.0002, beta_1=0.5))

# epochs = 1000
# batch_size = 128

# for epoch in range(epochs):
#     idx = np.random.randint(0, train_feats.shape[0], batch_size)
#     real_samples = train_feats.drop(columns=['id']).iloc[idx].values
#     fake_samples = generator.predict(np.random.rand(batch_size, 100))
#     real_labels = real_samples 
#     fake_labels = fake_samples
#     real_samples_tf = tf.convert_to_tensor(real_samples, dtype=tf.float32)
#     fake_samples_tf = tf.convert_to_tensor(fake_samples, dtype=tf.float32)
    
    
#     d_loss_real = discriminator.train_on_batch(real_samples_tf, real_labels)
#     d_loss_fake = discriminator.train_on_batch(fake_samples_tf, fake_labels )
#     d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

#     noise = np.random.rand(batch_size, 100)
#     valid_labels = noise
#     g_loss = gan_model.train_on_batch(noise, valid_labels)

#     if epoch % 100 == 0:
#         print(f"Epoch {epoch}, D Loss: {d_loss}, G_Loss: {g_loss}")
        

# generated_feats = generator.predict(np.random.rand(2471,100))

# generated_ids = range(1, len(generated_feats) + 1)
# df_generated = pd.DataFrame(generated_feats, columns=train_feats.columns[1:])
# df_generated['id'] = [f"{i:08d}" for i in generated_ids]

# train_feats = pd.concat([train_feats, df_generated], axis=0, ignore_index=True)

In [29]:
class ModelTrainer():
    def __init__(self, model_name, **params):
        # Model
        self.model_name = model_name
        self.params = params
        self.create_model()
        
        self.X = x
        self.Y = y       
        print(f'Number of features: {len(self.X.columns)}')
        
    
    def make_pipeline(self, model):
        return Pipeline([
            ('remove_infs', FunctionTransformer(lambda x: np.nan_to_num(x, nan=np.nan, posinf=0, neginf=0))),
            ('imputer', SimpleImputer(strategy='mean')),
            ('normalizer', FunctionTransformer(lambda x: np.log1p(np.abs(x)))),
            ('scaler', RobustScaler()),
            ('model', model)
        ])
    
    # Create the model
    def create_model(self):
        match model_name:
            case "lgbm":
                self.model = LGBMRegressor(**self.params)
            case "xgb":
                self.model = XGBRegressor(**self.params)
            case "catboost":
                self.model = CatBoostRegressor(**self.params)
            case 'rfr':
                self.model = self.make_pipeline(RandomForestRegressor(**self.params))
            case "svr":
                self.model = self.make_pipeline(SVR(**self.params))
            case 'lasso':
                self.model = self.make_pipeline(Lasso(**self.params))
            case 'ridge':
                self.model = self.make_pipeline(Ridge(**self.params))
            case other:
                print("Not implemented")
                sys.exit(-1)
    
    # Get the trained model        
    def get_model(self):
        return self.model
        
    # Train the model with 5-fold CV
    def train_model(self):        
        early_stopping_callback = lgb.early_stopping(200, first_metric_only=True, verbose=False)
        verbose_callback = lgb.log_evaluation(100)        
        # Split the training data into 5 fold
        skf = StratifiedKFold(n_splits=N_FOLD, random_state=SEED, shuffle=True)
        fold_rmses = []
        for fold, (train_index, valid_index) in enumerate(skf.split(self.X, self.Y.astype(str))):
            train_x = self.X.iloc[train_index]
            train_y = self.Y[train_index]
            valid_x = self.X.iloc[valid_index]
            valid_y = self.Y[valid_index]
            if model_name == 'lgbm':
                # Train the model with early stop of 100 
                self.model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
                          callbacks=[
                                lgb.callback.early_stopping(stopping_rounds=100),
                                lgb.callback.log_evaluation(period=100),
                          ])  
            else:
                # Fit the model with train x and train y
                self.model.fit(train_x, train_y)            
            predictions = self.model.predict(valid_x)
            rmse = mean_squared_error(y_true=valid_y, y_pred=predictions, squared=False) # Return RMSE
            fold_rmses.append(rmse)
        avg_rmse = np.mean(fold_rmses)
        print(f"Average rmse: {avg_rmse}") 
        return avg_rmse
    
    # Evaluate the model with entire X data
    def evaluation(self):
        preds = self.predict(self.X)
        rmse = mean_squared_error(y_true=self.Y, y_pred=preds, squared=False)
        return rmse
        
    # Predict the test data. 
    def predict(self, test_x):
        # Prediction loop
        tests_y = np.zeros((len(test_x), N_FOLD))
        for fold in range(N_FOLD):
            preds = self.model.predict(test_x)
            tests_y[:, fold] = preds
            #print(f"Fold = {fold} Prediction = {preds[:5]}")
        test_y = np.mean(tests_y, axis=1)
        return test_y# Average the prediction of each fold model
    
    # Clear the memory
    def clear_memory(self):
        del self.model
        libc.malloc_trim(0)
        torch.cuda.empty_cache()
        gc.collect()


In [30]:
params_dict ={}
# CatBoostRegressor
params_dict['catboost'] =  {
    "iterations": 5000,
    "early_stopping_rounds": 50,
    "depth": 6,
    "loss_function": "RMSE",
    "random_seed": SEED,
    "silent": True
}

## Best parameters of LGBM
params_dict['lgbm'] = {
    'n_estimators': 1024,
    'learning_rate': 0.005,
    'metric': 'rmse',
    'random_state': SEED,
    'force_col_wise': True,
    'verbosity': 0,
}

# XGBRegressor
params_dict['xgb'] = {
    "max_depth": 4,
    "learning_rate": 0.1,
    "objective": "reg:squarederror",
    "num_estimators": 1000,
    "num_boost_round": 1000,
    "eval_metric": "rmse",
    "seed": SEED
}
# svr
params_dict['svr'] = {
    'kernel':'rbf',
    'C':1.0,
    'epsilon': 0.1
}
# rfr 
params_dict['rfr'] = {
    'max_depth': 6,
    'max_features': 'sqrt',
    'min_impurity_decrease': 0.0016295128631816343,
    'n_estimators': 200,
    'random_state': SEED,
    }
# Ridge
params_dict['ridge'] = {
    'alpha': 1,
    'random_state': SEED,
    'solver': 'auto'
    }
# Lasso
params_dict['lasso'] = {
    'alpha': 0.04198227921905038, 
    'max_iter': 2000, 
    'random_state': SEED,
    }

In [31]:
best_score = 1.0
# Find the optimal learning rate
def objective(trial, model_name):
    global params_dict
    # Parameters
    params = params_dict[model_name] # Load the default parameters
    # set the trial for tunable parameters
    if model_name == 'xgb':
        # Parameters for 'xgb' model
        params['learning_rate'] = trial.suggest_loguniform('learning_rate', 1e-4, 0.5)
        params['max_depth'] = trial.suggest_int('max_depth', 2, 64)
    elif model_name == 'catboost':
        params['depth'] = trial.suggest_int('depth', 2, 30)
    elif model_name == 'svr':
        params['epsilon'] = trial.suggest_float('epsilon', 0.01, 1)
    elif model_name == 'ridge':
        params['alpha'] = trial.suggest_loguniform('alpha', 1e-3, 10.0)
    elif model_name == 'lgbm':
        params['learning_rate'] = trial.suggest_loguniform('learning_rate', 1e-4, 0.5)
        params['reg_alpha'] = trial.suggest_loguniform('reg_alpha', 1e-3, 10.0)
        params['reg_lambda'] = trial.suggest_loguniform('reg_lambda', 1e-3, 10.0)
        params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 1)
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1)
        params['num_leaves'] = trial.suggest_int('num_leaves', 8, 64)
        params['min_child_samples'] = trial.suggest_int('min_child_samples', 1, 100)
    # Experiment the parameters
    trainer = ModelTrainer(model_name, **params)
    avg_score = trainer.train_model()
    # Save the model is the avg score > current best score
    global best_score
    if avg_score < best_score:
        best_score = avg_score
    # Clean up
    trainer.clear_memory()
    del trainer    
    print(f"Average result {avg_score} and the best score {best_score}")
    return avg_score

def run_optuna(model_name):
    study_name = f"{model_name}_study"
    study_file_path = f"/kaggle/working/{study_name}.db"
    if os.path.exists(study_file_path):
        os.remove(study_file_path)
    # # Create a study to find the optimal hyper-parameters    
    study = optuna.create_study(direction="minimize", study_name=study_name,
                                storage="sqlite:///" + f"{study_file_path}", # Storage path of the database keeping the study results
                                load_if_exists=False)  # Resume the existing study
    # Set up the timeout to avoid runing out of quote
    # n_jobs =-1 is CPU bounded
    study.optimize(lambda trial: objective(trial, model_name), 
                   n_jobs=4, n_trials=1000,
                   show_progress_bar=True, gc_after_trial=True)
    ## Print the best parameters    
    best_trial = study.best_trial
    best_params = study.best_params
    # Print out the experiment results
    print(f"Best parameters: {best_params}\n\n"
          f"Number of finished trials: {len(study.trials)}\n\n"
          f"Best trial:{best_trial}")    
    return study

In [32]:
# Train the model and make the predictions
def train_model(model_name, is_loaded=True):
    best_params = params_dict[model_name]
    # If is_loaded is True, load the best parameters.
    # Otherwise, initiate an Optuna study to optimize parameters.
    if is_loaded:  # Loaded the best parameters that are found from previous experiments
        study_name = f"{model_name}_study"
        study_file_path = f"/kaggle/input/writing-quality-dataset/{study_name}.db"
        if os.path.isfile(study_file_path):
            loaded_study = optuna.load_study(study_name=study_name,
                                         storage="sqlite:///" + f"{study_file_path}")
            best_params.update(loaded_study.best_params)
            print(f"Best parameters: {best_params}\n\n")
    else:
        study = run_optuna(model_name)
        best_params.update(study.best_params)
        # Print out the experiment results
        print(f"Best parameters: {best_params}\n\n")
    ## Parameters for LGBMRegressor model
    trainer = ModelTrainer(model_name, **best_params)
    trainer.train_model()
    rmse = trainer.evaluation()
    model = trainer.get_model()
    print(f"Complete training {model_name} RMSE = {rmse}")
    return model

# Collect all the models
models = []
model_names = ['ridge', 'svr', 'catboost', 'lgbm', 'xgb'] # 5 models 
# model_names = ['lasso', 'ridge', 'rfr', 'svr', 'catboost', 'lgbm', 'xgb'] # 7 models
preds_y = []
tests_y = []
for model_name in model_names:
    is_loaded = True
#     if 'lgbm' == model_name: # Enable optuna
#         is_loaded = False
    model = train_model(model_name, is_loaded)
    models.append((model_name, model))
print(models)

Number of features: 164
Average rmse: 0.6288597873858064
Complete training ridge RMSE = 0.591842865049595
Number of features: 164
Average rmse: 0.6443191872700945
Complete training svr RMSE = 0.5196104868817799
Number of features: 164
Average rmse: 0.6211000678151847
Complete training catboost RMSE = 0.3092264620227419
Number of features: 164
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.803681
[200]	valid_0's rmse: 0.69716
[300]	valid_0's rmse: 0.647156
[400]	valid_0's rmse: 0.623434
[500]	valid_0's rmse: 0.611678
[600]	valid_0's rmse: 0.605905
[700]	valid_0's rmse: 0.603699
[800]	valid_0's rmse: 0.602694
Early stopping, best iteration is:
[788]	valid_0's rmse: 0.602495
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.803785
[200]	valid_0's rmse: 0.693624
[300]	valid_0's rmse: 0.644339
[400]	valid_0's rmse: 0.620483
[500]	valid_0's rmse: 0.610325
[600]	valid_0's rmse: 0.604234
[700]	valid_0's rmse: 0.601556
[

In [33]:
def evaluate_models(models, x, y):
    # split the full train data (data_X and data_Y) into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(x, y,
                                                      test_size=0.2, random_state=SEED)
    # fit and evaluate the models
    weights = list()
    for name, model in models:
        # fit the model
        model.fit(X_train, y_train)
        # evaluate the model
        y_preds = model.predict(X_val)
        # Calculate the 
        rmse = mean_squared_error(y_true=y_val, y_pred=y_preds, squared=False)
        # store the performance
        weights.append(rmse)
    # report model performance
    print(f"Weight = {weights}")
    return weights

In [34]:
try:
    weights = evaluate_models(models, x, y)
    # Use the weights (scores) as a weighting for the ensemble
    ensemble = VotingRegressor(estimators=models, weights=weights)
    ensemble.fit(x, y)
    test_y = ensemble.predict(testin_x)
    print(test_y)
except Exception as e: 
    print(e)

Weight = [0.561877767961382, 0.5787348739568923, 0.5442215367246644, 0.5413674268821377, 0.5367460131530352]
[1.13043868 0.34338526 0.22959284]


In [35]:
sub2= pd.DataFrame({'id': test_ids, 'score': test_y})
sub2.to_csv('submission.csv', index=False)
sub2

Unnamed: 0,id,score
0,0000aaaa,1.130439
1,2222bbbb,0.343385
2,4444cccc,0.229593


# TOKENIZATION


In [36]:
def getEssays(df):
  
    # 'id', 'activity', 'cursor_position', 'text_change' 열만 선택한 DataFrame 복사
    textInputDf = df[['id', 'activity', 'cursor_position', 'text_change']].copy()
    
    # 'activity' 열에서 'Nonproduction'인 행을 제외
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']

    # 각 'id'별로 발생한 활동 수를 계산하여 배열로 저장
    valCountsArr = textInputDf['id'].value_counts(sort=False).values

    lastIndex = 0

    # 결과를 저장할 Pandas Series 생성
    essaySeries = pd.Series()

    for index, valCount in enumerate(valCountsArr):

        currTextInput = textInputDf[['activity', 'cursor_position', 'text_change']].iloc[lastIndex : lastIndex + valCount]
        lastIndex += valCount
        essayText = ""

        for Input in currTextInput.values:
            
            # Input[0] = activity
            # Input[2] = cursor_position
            # Input[3] = text_change
            
            if Input[0] == 'Replace':
                # '=>' 문자열을 기준으로 text_change를 분할
                replaceTxt = Input[2].split(' => ')
                
                # DONT TOUCH
                essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
                continue

                
            # If activity = Paste    
            if Input[0] == 'Paste':
                # DONT TOUCH
                essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
                continue

                
            # If activity = Remove/Cut
            if Input[0] == 'Remove/Cut':
                # DONT TOUCH
                essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
                continue

                
            # If activity = Move...
            if "M" in Input[0]:
                # "Move from to" 텍스트를 제거
                croppedTxt = Input[0][10:]
                
                # ' To '를 기준으로 문자열을 분할
                splitTxt = croppedTxt.split(' To ')
                
                # 문자열을 다시 ', '를 기준으로 분할하여 배열로 저장
                valueArr = [item.split(', ') for item in splitTxt]
                
                # Move from [2, 4] To [5, 7] = (2, 4, 5, 7)
                moveData = (int(valueArr[0][0][1:]), int(valueArr[0][1][:-1]), int(valueArr[1][0][1:]), int(valueArr[1][1][:-1]))

                # 같은 위치로 이동하는 경우 건너뛰기
                if moveData[0] != moveData[2]:
                    # 텍스트를 앞으로 이동시키는 경우 (다른 경우)
                    if moveData[0] < moveData[2]:
                        # DONT TOUCH
                        essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                    else:
                        # DONT TOUCH
                        essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                continue
                         
            # If just input
            # DONT TOUCH
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]

        # 결과 시리즈의 해당 인덱스에 에세이 텍스트를 설정  
        essaySeries[index] = essayText
     
    # 결과 시리즈의 인덱스를 고유한 'id' 값으로 설정
    essaySeries.index =  textInputDf['id'].unique()
    
    return essaySeries

In [37]:
traindf = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
train_scores = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')
testdf = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')

In [38]:
%%time
train_essays = getEssays(traindf)

CPU times: user 8min 24s, sys: 3min 33s, total: 11min 57s
Wall time: 11min 58s


In [39]:
%%time
test_essays = getEssays(testdf)

CPU times: user 6.81 ms, sys: 0 ns, total: 6.81 ms
Wall time: 6.84 ms


In [40]:
train_essaysdf = pd.DataFrame({'id': train_essays.index, 'essay': train_essays.values})
test_essaysdf = pd.DataFrame({'id': test_essays.index, 'essay': test_essays.values})

merged_data = train_essaysdf.merge(train_scores, on='id')

In [41]:
from sklearn.feature_extraction.text import CountVectorizer


In [42]:
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_tokenizer_train = count_vectorizer.fit_transform(merged_data['essay'])
X_tokenizer_test = count_vectorizer.transform(test_essaysdf['essay'])
count_vectorizer.get_feature_names_out() #ADDED
y = merged_data['score']

In [43]:
df_train = pd.DataFrame()
df_test = pd.DataFrame()

In [44]:
X_tokenizer_train = X_tokenizer_train.todense()
X_tokenizer_test = X_tokenizer_test.todense()

for i in range(X_tokenizer_train.shape[1]) : 
    L = list(X_tokenizer_train[:,i])
    li = [int(x) for x in L ]
    df_train[f'feature {i}'] = li
    
for i in range(X_tokenizer_test.shape[1]) : 
    L = list(X_tokenizer_test[:,i])
    li = [int(x) for x in L ]
    df_test[f'feature {i}'] = li    

In [45]:
df_train_index = train_essaysdf['id']
df_test_index = test_essaysdf['id']

df_train.loc[:, 'id'] = df_train_index
df_test.loc[:, 'id'] = df_test_index

In [46]:
train_agg_fe_df = traindf.groupby("id")[['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count']].agg(['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum'])
train_agg_fe_df.columns = ['_'.join(x) for x in train_agg_fe_df.columns]
train_agg_fe_df = train_agg_fe_df.add_prefix("tmp_")
train_agg_fe_df.reset_index(inplace=True)

test_agg_fe_df = testdf.groupby("id")[['down_time', 'up_time', 'action_time', 'cursor_position', 'word_count']].agg(['mean', 'std', 'min', 'max', 'last', 'first', 'sem', 'median', 'sum'])
test_agg_fe_df.columns = ['_'.join(x) for x in test_agg_fe_df.columns]
test_agg_fe_df = test_agg_fe_df.add_prefix("tmp_")
test_agg_fe_df.reset_index(inplace=True)

In [47]:
from collections import defaultdict

class Preprocessor:
    
    def __init__(self, seed):
        self.seed = seed
        
        self.activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
        self.events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
              'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
        self.text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']
        self.punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                        '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
        self.gaps = [1, 2, 3, 5, 10, 20, 50, 100]
        
        self.idf = defaultdict(float)
#         self.gaps = [1, 2]
    
    def activity_counts(self, df):
        tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['activity'].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.activities:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'activity_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf

            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret


    def event_counts(self, df, colname):
        tmp_df = df.groupby('id').agg({colname: list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df[colname].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.events:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'{colname}_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf
            
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf

        return ret


    def text_change_counts(self, df):
        tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['text_change'].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.text_changes:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
        ret = pd.DataFrame(ret)
        cols = [f'text_change_{i}_count' for i in range(len(ret.columns))]
        ret.columns = cols

        cnts = ret.sum(1)

        for col in cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf
            
            ret[col] = 1 + np.log(ret[col] / cnts)
            ret[col] *= idf
            
        return ret

    def match_punctuations(self, df):
        tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['down_event'].values):
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in self.punctuations:
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
        return ret


    def get_input_words(self, df):
        tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
        tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
        tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
        tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df.drop(['text_change'], axis=1, inplace=True)
        return tmp_df
    
    def make_feats(self, df):
        
        print("Starting to engineer features")
        
        # initialize features dataframe
        feats = pd.DataFrame({'id': df['id'].unique().tolist()})
        
        # get shifted features
        # time shift
        print("Engineering time data")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'up_time_shift{gap}'] = df.groupby('id')['up_time'].shift(gap)
            df[f'action_time_gap{gap}'] = df['down_time'] - df[f'up_time_shift{gap}']
        df.drop(columns=[f'up_time_shift{gap}' for gap in self.gaps], inplace=True)

        # cursor position shift
        print("Engineering cursor position data")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'cursor_position_shift{gap}'] = df.groupby('id')['cursor_position'].shift(gap)
            df[f'cursor_position_change{gap}'] = df['cursor_position'] - df[f'cursor_position_shift{gap}']
            df[f'cursor_position_abs_change{gap}'] = np.abs(df[f'cursor_position_change{gap}'])
        df.drop(columns=[f'cursor_position_shift{gap}' for gap in self.gaps], inplace=True)

        # word count shift
        print("Engineering word count data")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'word_count_shift{gap}'] = df.groupby('id')['word_count'].shift(gap)
            df[f'word_count_change{gap}'] = df['word_count'] - df[f'word_count_shift{gap}']
            df[f'word_count_abs_change{gap}'] = np.abs(df[f'word_count_change{gap}'])
        df.drop(columns=[f'word_count_shift{gap}' for gap in self.gaps], inplace=True)
        
        # get aggregate statistical features
        print("Engineering statistical summaries for features")
        # [(feature name, [ stat summaries to add ])]
        feats_stat = [
            ('event_id', ['max']),
            ('up_time', ['max']),
            ('action_time', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew']),
            ('activity', ['nunique']),
            ('down_event', ['nunique']),
            ('up_event', ['nunique']),
            ('text_change', ['nunique']),
            ('cursor_position', ['nunique', 'max', 'quantile', 'sem', 'mean']),
            ('word_count', ['nunique', 'max', 'quantile', 'sem', 'mean'])]
        for gap in self.gaps:
            feats_stat.extend([
                (f'action_time_gap{gap}', ['max', 'min', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew']),
                (f'cursor_position_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew']),
                (f'word_count_change{gap}', ['max', 'mean', 'std', 'quantile', 'sem', 'sum', 'skew'])
            ])
        
        pbar = tqdm(feats_stat)
        for item in pbar:
            colname, methods = item[0], item[1]
            for method in methods:
                pbar.set_postfix()
                if isinstance(method, str):
                    method_name = method
                else:
                    method_name = method.__name__
                    
                pbar.set_postfix(column=colname, method=method_name)
                tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method_name}'})
                feats = feats.merge(tmp_df, on='id', how='left')

        # counts
        print("Engineering activity counts data")
        tmp_df = self.activity_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering event counts data")
        tmp_df = self.event_counts(df, 'down_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        tmp_df = self.event_counts(df, 'up_event')
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering text change counts data")
        tmp_df = self.text_change_counts(df)
        feats = pd.concat([feats, tmp_df], axis=1)
        
        print("Engineering punctuation counts data")
        tmp_df = self.match_punctuations(df)
        feats = pd.concat([feats, tmp_df], axis=1)

        # input words
        print("Engineering input words data")
        tmp_df = self.get_input_words(df)
        feats = pd.merge(feats, tmp_df, on='id', how='left')

        # compare feats
        print("Engineering ratios data")
        feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
        feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
        feats['event_time_ratio'] = feats['event_id_max']  / feats['up_time_max']
        feats['idle_time_ratio'] = feats['action_time_gap1_sum'] / feats['up_time_max']
        
        print("Done!")
        return feats

In [48]:
from tqdm import tqdm


In [49]:
preprocessor = Preprocessor(seed=42)

print("Engineering features for training data")

other_train_feats = preprocessor.make_feats(traindf)

print()
print("-"*25)
print("Engineering features for test data")
print("-"*25)
other_test_feats = preprocessor.make_feats(testdf)

Engineering features for training data
Starting to engineer features
Engineering time data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering cursor position data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering word count data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering statistical summaries for features


100%|██████████| 33/33 [04:20<00:00,  7.90s/it, column=word_count_change100, method=skew]


Engineering activity counts data


100%|██████████| 2471/2471 [00:00<00:00, 4938.76it/s]


Engineering event counts data


100%|██████████| 2471/2471 [00:00<00:00, 4406.80it/s]
100%|██████████| 2471/2471 [00:00<00:00, 4802.46it/s]


Engineering text change counts data


100%|██████████| 2471/2471 [00:00<00:00, 4835.59it/s]


Engineering punctuation counts data


100%|██████████| 2471/2471 [00:00<00:00, 4807.58it/s]


Engineering input words data
Engineering ratios data
Done!

-------------------------
Engineering features for test data
-------------------------
Starting to engineer features
Engineering time data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering cursor position data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering word count data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 100
Engineering statistical summaries for features


100%|██████████| 33/33 [00:01<00:00, 17.62it/s, column=word_count_change100, method=skew]


Engineering activity counts data


100%|██████████| 3/3 [00:00<00:00, 19388.15it/s]


Engineering event counts data


100%|██████████| 3/3 [00:00<00:00, 17476.27it/s]
100%|██████████| 3/3 [00:00<00:00, 19269.39it/s]


Engineering text change counts data


100%|██████████| 3/3 [00:00<00:00, 17722.41it/s]


Engineering punctuation counts data


100%|██████████| 3/3 [00:00<00:00, 20004.63it/s]


Engineering input words data
Engineering ratios data
Done!


In [50]:
df_train_all = pd.DataFrame()
df_test_all = pd.DataFrame()

df_train_all = df_train.merge(train_agg_fe_df,on='id')
df_test_all = df_test.merge(test_agg_fe_df,on='id')

In [51]:
def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)

In [52]:
AGGREGATIONS = ['count', 'mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, 'skew', 'sum']

def split_essays_into_sentences(df):
    essay_df = df
    essay_df['id'] = essay_df.index
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',str(x)))
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n','').strip())
    # Number of characters in sentences
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    # Number of words in sentences
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.columns.tolist()].reset_index(drop=True)
    return essay_df

def compute_sentence_aggregations(df):
    sent_agg_df = pd.concat(
        [df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df

def split_essays_into_paragraphs(df):
    essay_df = df
    essay_df['id'] = essay_df.index
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: str(x).split('\n'))
    essay_df = essay_df.explode('paragraph')
    # Number of characters in paragraphs
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x)) 
    # Number of words in paragraphs
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.paragraph_len!=0].reset_index(drop=True)
    return essay_df

def compute_paragraph_aggregations(df):
    paragraph_agg_df = pd.concat(
        [df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    ) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df

In [53]:
train_sent_df = split_essays_into_sentences(train_essaysdf)
train_sent_agg_df = compute_sentence_aggregations(train_sent_df)

train_paragraph_df = split_essays_into_paragraphs(train_essaysdf)
train_paragraph_agg_df = compute_paragraph_aggregations(train_paragraph_df)

test_sent_agg_df = compute_sentence_aggregations(split_essays_into_sentences(test_essaysdf))
test_paragraph_agg_df = compute_paragraph_aggregations(split_essays_into_paragraphs(test_essaysdf))

train_paragraph_agg_df.loc[:, 'id'] = df_train_index
train_sent_agg_df.loc[:, 'id'] = df_train_index

test_paragraph_agg_df.loc[:, 'id'] = df_test_index
test_sent_agg_df.loc[:, 'id'] = df_test_index

In [54]:
new_train_feats = pd.DataFrame()
new_test_feats = pd.DataFrame()

new_train_feats = train_paragraph_agg_df.merge(df_train_all,on='id')
new_train_feats = new_train_feats.merge(train_sent_agg_df,on='id')

new_test_feats = test_paragraph_agg_df.merge(df_test_all,on='id')
new_test_feats = new_test_feats.merge(test_sent_agg_df,on='id')

train_feats = pd.DataFrame()
test_feats = pd.DataFrame()

train_feats = new_train_feats.merge(other_train_feats,on='id')
test_feats = new_test_feats.merge(other_test_feats,on='id')

In [55]:
data = []

for logs in [traindf, testdf]:
    logs['up_time_lagged'] = logs.groupby('id')['up_time'].shift(1).fillna(logs['down_time'])
    logs['time_diff'] = abs(logs['down_time'] - logs['up_time_lagged']) / 1000

    group = logs.groupby('id')['time_diff']
    largest_lantency = group.max()
    smallest_lantency = group.min()
    median_lantency = group.median()
    initial_pause = logs.groupby('id')['down_time'].first() / 1000
    pauses_half_sec = group.apply(lambda x: ((x > 0.5) & (x < 1)).sum())
    pauses_1_sec = group.apply(lambda x: ((x > 1) & (x < 1.5)).sum())
    pauses_1_half_sec = group.apply(lambda x: ((x > 1.5) & (x < 2)).sum())
    pauses_2_sec = group.apply(lambda x: ((x > 2) & (x < 3)).sum())
    pauses_3_sec = group.apply(lambda x: (x > 3).sum())

    data.append(pd.DataFrame({
        'id': logs['id'].unique(),
        'largest_lantency': largest_lantency,
        'smallest_lantency': smallest_lantency,
        'median_lantency': median_lantency,
        'initial_pause': initial_pause,
        'pauses_half_sec': pauses_half_sec,
        'pauses_1_sec': pauses_1_sec,
        'pauses_1_half_sec': pauses_1_half_sec,
        'pauses_2_sec': pauses_2_sec,
        'pauses_3_sec': pauses_3_sec,
    }).reset_index(drop=True))

train_eD592674, test_eD592674 = data

train_feats = train_feats.merge(train_eD592674, on='id', how='left')
test_feats = test_feats.merge(test_eD592674, on='id', how='left')
train_feats = train_feats.merge(train_scores, on='id', how='left')

In [56]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

train_feats['score_class'] = le.fit_transform(train_feats['score'])

In [57]:
target_col = ['score']

drop_cols = ['id', 'score_class']
train_cols = list()

train_cols = [col for col in train_feats.columns if col not in target_col + drop_cols]

In [58]:
nan_cols = train_feats.columns[train_feats.isna().any()].tolist()

# 결측치 최빈값으로 처리 
for col in nan_cols:
    mode_value_train = train_feats[col].mode()[0] #최빈값 여러 개 일 경우 첫 번째꺼 선택 
    train_feats[col].fillna(mode_value_train, inplace=True)
    
for col in test_feats.columns[test_feats.isna().any()].tolist():
    # Find the most frequent value in the training set for the current feature
    most_frequent_value_train = train_feats[col].mode()[0]
    
    # Fill missing values in the test set with the most frequent value from the training set
    test_feats[col].fillna(most_frequent_value_train, inplace=True)

train_feats.shape, test_feats.shape   

((2471, 659), (3, 657))

In [59]:
set(train_feats.columns) - set(test_feats.columns)

{'score', 'score_class'}

In [60]:
def clean_memory():
    import gc
    gc.collect()


In [61]:
clean_memory()

In [62]:
from sklearn import model_selection
from sklearn import metrics


In [63]:
models_dict = {}
scores = []

test_predict_list = []
best_params = {'boosting_type': 'gbdt', 
               'metric': 'rmse',
               'reg_alpha': 0.003188447814669599, 
               'reg_lambda': 0.0010228604507564066, 
               'colsample_bytree': 0.5420247656839267, 
               'subsample': 0.9778252382803456, 
               'feature_fraction': 0.8,
               'bagging_freq': 1,
               'bagging_fraction': 0.75,
               'learning_rate': 0.01716485155812008, 
               'num_leaves': 19, 
               'min_child_samples': 46,
               'verbosity': -1,
               'random_state': 42,
               'n_estimators': 500,
               'device_type': 'cpu'}

for i in range(5): 
    kf = model_selection.KFold(n_splits=10, random_state=42 + i, shuffle=True)

    oof_valid_preds = np.zeros(train_feats.shape[0], )

    X_test = test_feats[train_cols]


    for fold, (train_idx, valid_idx) in enumerate(kf.split(train_feats)):

        print("==-"* 50)
        print("Fold : ", fold)

        X_train, y_train = train_feats.iloc[train_idx][train_cols], train_feats.iloc[train_idx][target_col]
        X_valid, y_valid = train_feats.iloc[valid_idx][train_cols], train_feats.iloc[valid_idx][target_col]

        print("Trian :", X_train.shape, y_train.shape)
        print("Valid :", X_valid.shape, y_valid.shape)

        params = {
            "objective": "regression",
            "metric": "rmse",
            'random_state': 42,
            "n_estimators" : 12001,
            "verbosity": -1,
            "device_type": "cpu",
            **best_params
        }

        model = lgb.LGBMRegressor(**params)

        early_stopping_callback = lgb.early_stopping(200, first_metric_only=True, verbose=False)
        verbose_callback = lgb.callback.record_evaluation({})

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],  
                  callbacks=[early_stopping_callback, verbose_callback],
        )

        valid_predict = model.predict(X_valid)
        oof_valid_preds[valid_idx] = valid_predict

        test_predict = model.predict(X_test)
        test_predict_list.append(test_predict)

        score = metrics.mean_squared_error(y_valid, valid_predict, squared=False)
        print("Fold RMSE Score : ", score)

        models_dict[f'{fold}_{i}'] = model


    oof_score = metrics.mean_squared_error(train_feats[target_col], oof_valid_preds, squared=False)
    scores.append(oof_score)
    print("OOF RMSE Score : ", oof_score)

==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-
Fold :  0
Trian : (2223, 656) (2223, 1)
Valid : (248, 656) (248, 1)
Fold RMSE Score :  0.5761343114934508
==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-
Fold :  1
Trian : (2224, 656) (2224, 1)
Valid : (247, 656) (247, 1)
Fold RMSE Score :  0.5164412370397702
==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-
Fold :  2
Trian : (2224, 656) (2224, 1)
Valid : (247, 656) (247, 1)
Fold RMSE Score :  0.6792416920435415
==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-==-
Fold :  3
Trian : (2224, 656) (2224, 1)
Valid : (247, 656) (247, 1)
Fold RMSE 

In [64]:
np.mean(scores)

0.6099770226513618

In [65]:
test_feats['score'] = np.mean(test_predict_list, axis=0)
sub3 = test_feats[['id', 'score']]
sub3

Unnamed: 0,id,score
0,0000aaaa,1.500086
1,2222bbbb,1.471744
2,4444cccc,1.474145


In [66]:
sub1.rename(columns={'score': 'score_1'}, inplace=True)
sub2.rename(columns={'score': 'score_2'}, inplace=True)
sub3.rename(columns={'score': 'score_3'}, inplace=True)

submission = pd.merge(sub1, sub2, on='id')
submission = pd.merge(submission, sub3, on='id')

submission['score'] = (submission['score_1']*0.2 +  #LGBM + NN (Weighted search for "print(W)")
                       submission['score_2']*0.4 +  #LGBM Public
                       submission['score_3']*0.4)   #Fusion

submission_final = submission[['id', 'score']]

In [67]:
submission_final.to_csv('submission.csv', index=False)

In [68]:
submission_final

Unnamed: 0,id,score
0,0000aaaa,1.406839
1,2222bbbb,1.263909
2,4444cccc,1.201256
