In [1]:
import itertools
import os

import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [4]:
# Directory that holds the competition CSVs (train.csv / test.csv).
# Set the AES_DATA_DIR environment variable to run the notebook on another
# machine; the original author's local path is kept as the fallback so
# existing setups keep working unchanged.
path_data = os.environ.get(
    "AES_DATA_DIR",
    "/home/schlemuel/OneDrive/Documentos/Bancos de dados/Kaggle/automated-essay-scoring",
)

# Analyzing and Cleaning

In [5]:
# Read the two CSV files that live directly under the data directory.
df_train = pd.read_csv(f'{path_data}/train.csv')
df_test = pd.read_csv(f'{path_data}/test.csv')

## Local Only: Hold-Out Split

In [6]:
# Carve a 20% hold-out set out of the training frame (fixed seed 37 so the
# split is repeatable). Both names are rebound: df_train becomes the 80% part
# and df_test, previously loaded from test.csv, is replaced by the hold-out.
# NOTE(review): because df_train is both input and output, re-running this
# cell without reloading the CSVs splits the already reduced frame again.
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=37)

In [7]:
# Show (rows, columns) of the train and hold-out frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(13845, 3)
(3462, 3)


## Returning to the Main Flow

In [8]:
# Summary statistics (count, mean, std, quartiles) for the training frame.
df_train.describe()

Unnamed: 0,score
count,13845.0
mean,2.948068
std,1.050022
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,6.0


In [9]:
# Peek at the first 10 training rows before any text cleaning is applied.
df_train.head(10)

Unnamed: 0,essay_id,full_text,score
12928,be12d44,Its just a natural landform because it says in...,2
6154,5c0dcf4,"In the article ""Making Mona Lisa Smile"", it de...",2
1062,10be2a0,Most people would think of an electoral collag...,2
4574,442f36d,The author has a point that we should study Ve...,3
14789,d9fce09,In recent years car ownership has decreased wi...,3
14927,dbcda24,Many car companies are working on driverless c...,3
4566,4408d03,"In the article ""Driverless Cars Are Coming,"" t...",3
1813,1bf8cee,When you're driving you are harming the planet...,3
4632,45076e3,"""So, if you're a NASA scientist, you should be...",3
2188,212f8b4,In todays world students often get board in a ...,3


In [10]:
def clean(text):
    """Return ``text`` with every newline character replaced by one space.

    Parameters
    ----------
    text : str
        Raw essay text.

    Returns
    -------
    str
        The text with each ``'\\n'`` swapped for ``' '``; nothing else changes.
    """
    # Substrings that are swapped for a space; extend the tuple to strip more.
    for unwanted in ('\n',):
        # split + join is equivalent to str.replace(unwanted, ' ').
        text = ' '.join(text.split(unwanted))
    return text
    

In [11]:
# Clean the essay text of both frames; the function is passed directly,
# no wrapping lambda is needed.
df_train['full_text'] = df_train['full_text'].apply(clean)
df_test['full_text'] = df_test['full_text'].apply(clean)

In [12]:
# Sanity check: cleaning only rewrites text, so the shape should be unchanged.
df_train.shape

(13845, 3)

In [13]:
# Print how many training essays received each score, in order of first
# appearance of the score in the frame.
for value in df_train['score'].unique():
    n_rows = int((df_train['score'] == value).sum())
    print(f'{value}: {n_rows}')

2: 3796
3: 4985
4: 3142
6: 135
5: 776
1: 1011


In [14]:
# Re-inspect the first 10 training rows now that newlines have been replaced.
df_train.head(10)

Unnamed: 0,essay_id,full_text,score
12928,be12d44,Its just a natural landform because it says in...,2
6154,5c0dcf4,"In the article ""Making Mona Lisa Smile"", it de...",2
1062,10be2a0,Most people would think of an electoral collag...,2
4574,442f36d,The author has a point that we should study Ve...,3
14789,d9fce09,In recent years car ownership has decreased wi...,3
14927,dbcda24,Many car companies are working on driverless c...,3
4566,4408d03,"In the article ""Driverless Cars Are Coming,"" t...",3
1813,1bf8cee,When you're driving you are harming the planet...,3
4632,45076e3,"""So, if you're a NASA scientist, you should be...",3
2188,212f8b4,In todays world students often get board in a ...,3


# Training

In [15]:
# Bag-of-words features. lowercase=False keeps the original casing of tokens.
count_vect = CountVectorizer(lowercase=False)

# The vocabulary is learned from the training essays only; the hold-out
# essays are merely transformed with it. Counts are cast to float64.
x_train = count_vect.fit_transform(df_train['full_text']).astype(np.float64)
x_test = count_vect.transform(df_test['full_text']).astype(np.float64)

# Shift the scores down by one so the class labels start at 0; predictions
# are shifted back by +1 before scoring.
y_train = df_train['score'].values - 1

In [16]:
# Keyword arguments forwarded to LGBMClassifier when the model is built.
parameter = dict(
    learning_rate=0.03,
    n_estimators=600,
    max_depth=10,
    subsample=1,
    subsample_freq=9,
    n_jobs=-1,
    importance_type='split',
)

In [17]:
%%time

# Build the classifier from the `parameter` dict; verbose = -1 turns down
# LightGBM's logging (per LightGBM docs, values < 0 show fatal messages only).
lgbm_model = LGBMClassifier(**parameter, verbose = -1)
# Train on the bag-of-words matrix and the 0-based score labels.
lgbm_model.fit(x_train, y_train)

Wall time: 1min 49s


## Accuracy

In [18]:
# Predict class labels for the hold-out essays. The model was trained on
# scores shifted by -1, so these labels are 0-based.
predictions = lgbm_model.predict(x_test)

In [19]:
# Predictions are 0-based labels; add 1 to compare them with the original
# scores stored in df_test.
accuracy = accuracy_score(
    y_true=df_test['score'],
    y_pred=predictions + 1,
)
print(f"Model Accuracy: {round(accuracy * 100, 4)}%")

Model Accuracy: 60.4275%


# Many Parameter Combinations (Grid Search)

In [16]:
# Hyper-parameter grid: each key maps to the list of candidate values to try.
# Every key used when building the model must be listed here. The original
# grid omitted 'subsample', 'subsample_freq', 'n_jobs' and 'importance_type',
# which made the search loop fail with KeyError / NameError.
parameters = {
    'learning_rate': [0.03],
    'n_estimators': [600],
    'max_depth': [10],
    'subsample': [1],
    'subsample_freq': [9],
    'n_jobs': [-1],
    'importance_type': ['split']
}

# Bag-of-words features (case preserved); the vocabulary is fitted on the
# training essays only. Scores are shifted by -1 so class labels start at 0.
count_vect = CountVectorizer(lowercase=False)
x_train = count_vect.fit_transform(df_train['full_text']).astype(np.float64)
y_train = df_train['score'].values - 1
x_test = count_vect.transform(df_test['full_text']).astype(np.float64)

# Best accuracy seen so far and, once a model beats it, the parameters used.
best = {'score': 0}

# Walk the Cartesian product of all candidate lists. Keys keep the insertion
# order of `parameters`, so combinations are visited in the same order as
# nested loops written in that key order (last key varies fastest).
grid_keys = list(parameters)
for combination in itertools.product(*(parameters[key] for key in grid_keys)):
    parameter = dict(zip(grid_keys, combination))

    lgbm_model = LGBMClassifier(**parameter, verbose=-1)
    lgbm_model.fit(x_train, y_train)

    # Predictions are 0-based labels; add 1 to compare with the real scores.
    predictions = lgbm_model.predict(x_test)
    accuracy = accuracy_score(df_test['score'], predictions + 1)

    # Strict '>' keeps the first combination when several tie.
    # NOTE(review): the winner is picked on the same hold-out rows used to
    # report accuracy, so best['score'] is an optimistic estimate.
    if accuracy > best['score']:
        best['score'] = accuracy
        best['parameter'] = parameter

In [17]:
# Show the best accuracy found and the parameter combination that produced it.
print(best)

{'score': 0.6042749855574813, 'parameter': {'learning_rate': 0.03, 'n_estimators': 600, 'max_depth': 10, 'subsample': 1, 'subsample_freq': 9, 'n_jobs': -1, 'importance_type': 'split'}}


{'score': 0.6042749855574813, 'parameter': {'learning_rate': 0.03, 'n_estimators': 600, 'max_depth': 10, 'subsample': 1, 'subsample_freq': 9, 'n_jobs': -1, 'importance_type': 'split'}}