In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

from xgboost import XGBClassifier



In [2]:
# Directory that holds train.csv and test.csv.
# Set the AES_DATA_DIR environment variable to point at another copy of the
# data; when it is unset the original local path is used unchanged.
path_data = os.environ.get(
    "AES_DATA_DIR",
    "/home/schlemuel/OneDrive/Documentos/Bancos de dados/Kaggle/automated-essay-scoring",
)

# Cleaning

In [3]:
# Read the scored training essays and the test file from the data directory.
df_train = pd.read_csv(f"{path_data}/train.csv")
df_test = pd.read_csv(f"{path_data}/test.csv")

In [4]:
# Summary statistics for the numeric columns of the training frame.
df_train.describe()

Unnamed: 0,score
count,17307.0
mean,2.948402
std,1.044899
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,6.0


In [5]:
# Preview the first 10 training rows before any text cleaning is applied.
df_train.head(10)

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
5,0030e86,If I were to choose between keeping the electo...,4
6,0033037,The posibilty of a face reconizing computer wo...,2
7,0033bf4,What is the Seagoing Cowboys progam?\n\nIt was...,3
8,0036253,The challenge of exploring Venus\n\nThis stori...,2
9,0040e27,There are many reasons why you should join sea...,3


In [6]:
def clean(text, delete=('\n',)):
    """Replace unwanted substrings in ``text`` with a space.

    Parameters
    ----------
    text : str
        The text to clean.
    delete : iterable of str, optional
        Substrings to replace. Defaults to the newline character only,
        which is exactly what the function did before this parameter
        existed, so existing ``clean(text)`` calls behave the same.

    Returns
    -------
    str
        ``text`` with every occurrence of each substring in ``delete``
        replaced by ``' '``. Each occurrence becomes its own space, so
        consecutive matches produce consecutive spaces.
    """
    for token in delete:
        text = text.replace(token, ' ')

    return text
    

In [7]:
# Run clean() over every essay in both frames; the function is passed
# directly, so no wrapper lambda is needed.
df_train['full_text'] = df_train['full_text'].apply(clean)
df_test['full_text'] = df_test['full_text'].apply(clean)

In [8]:
# Same 10-row preview, now that clean() has been applied to full_text.
df_train.head(10)

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator This is a letter to argue...",3
5,0030e86,If I were to choose between keeping the electo...,4
6,0033037,The posibilty of a face reconizing computer wo...,2
7,0033bf4,What is the Seagoing Cowboys progam? It was t...,3
8,0036253,The challenge of exploring Venus This storie ...,2
9,0040e27,There are many reasons why you should join sea...,3


## Local Only: hold-out split for local validation

In [9]:
# Hold out 20% of the training rows as a local evaluation set (fixed seed 37).
# NOTE(review): this rebinds both df_train and df_test. The frame loaded from
# test.csv is no longer reachable under df_test afterwards, and re-running this
# cell splits the already-reduced df_train again, so run it once per kernel.
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=37)

In [10]:
# Row/column counts of the two splits, one per line.
print(df_train.shape, df_test.shape, sep='\n')

(13845, 3)
(3462, 3)


# Training

In [11]:
# Targets: shift the scores down by one so the class labels are zero-based
# (the shift is undone with `+ 1` when the predictions are scored).
y_train = df_train['score'].values - 1

# Case-sensitive bag-of-words features; the vocabulary is fitted on the
# training essays only and then reused to encode the held-out essays.
count_vect = CountVectorizer(lowercase=False)
x_train = count_vect.fit_transform(df_train['full_text'])
x_test = count_vect.transform(df_test['full_text'])

In [61]:
# Keyword arguments passed to XGBClassifier in the training cell.
parameter = dict(
    learning_rate=0.05,
    n_estimators=800,
    max_depth=8,
    subsample=0.7,
)

In [62]:
%%time

# Build the classifier from the `parameter` dict and fit it on the
# bag-of-words matrix and the zero-based labels. The %%time magic at the top
# of this cell reports how long the fit takes.
xgb_model = XGBClassifier(**parameter)
xgb_model.fit(x_train, y_train)

CPU times: user 38min 7s, sys: 4.35 s, total: 38min 12s
Wall time: 10min 33s


# Accuracy

In [63]:
# Predicted labels for the held-out essays. They are on the shifted scale
# used for y_train (score - 1), not on the original score scale.
predictions = xgb_model.predict(x_test)

In [64]:
# Shift the predictions back onto the original score scale before comparing
# them with the true scores of the held-out essays.
accuracy = accuracy_score(y_true=df_test['score'], y_pred=predictions + 1)
print(f"Model Accuracy: {round(accuracy * 100, 4)}%")

Model Accuracy: 60.2831%


### The Best

60.2831%;

CountVectorizer(lowercase=False);

  {'learning_rate': 0.05,
   'n_estimators': 800,
   'max_depth': 8,
   'subsample': 0.7}

# Many: evaluating several parameter sets

In [11]:
# Candidate XGBClassifier hyper-parameter sets; add one dict per trial.
parameters = [
    {
        'learning_rate': 0.05,
        'n_estimators': 800,
        'max_depth': 8,
        'subsample': 0.7,
    },
]

# Accuracy a candidate must exceed (strictly) to be recorded in `better`.
# The value sits just below the best score printed by this notebook
# (0.6028307336799538), so a configuration that merely ties that score is
# recorded as well.
# NOTE(review): confirm whether ties are meant to count as "better".
BASELINE_ACCURACY = 0.602830

# Collected [accuracy, parameter dict] pairs that beat BASELINE_ACCURACY.
better = []

# Case-sensitive bag-of-words features, fitted on the training essays only.
count_vect = CountVectorizer(lowercase=False)

x_train = count_vect.fit_transform(df_train['full_text'])
y_train = df_train['score'].values - 1  # zero-based class labels
x_test = count_vect.transform(df_test['full_text'])

for parameter in parameters:
    xgb_model = XGBClassifier(**parameter)
    xgb_model.fit(x_train, y_train)

    predictions = xgb_model.predict(x_test)

    # `+ 1` undoes the label shift applied to y_train.
    accuracy = accuracy_score(df_test['score'], predictions + 1)
    print(accuracy)

    if accuracy > BASELINE_ACCURACY:
        better.append([accuracy, parameter])

0.6028307336799538
0.5999422299248989
0.5921432697862508
0.5898324667822068
0.5930098209127672
0.5967648757943386
0.5924321201617562
0.5956094742923166
0.5912767186597343
0.5938763720392837
0.5895436164067013
0.5883882149046794
0.6028307336799538
0.5999422299248989
0.5918544194107452
0.5898324667822068
0.5930098209127672
0.5967648757943386


In [12]:
better

[[0.6028307336799538,
  {'learning_rate': 0.05,
   'n_estimators': 800,
   'max_depth': 8,
   'subsample': 0.7,
   'gamma': 0,
   'min_child_weight': 1,
   'lambda': 1,
   'alpha': 0}],
 [0.6028307336799538,
  {'learning_rate': 0.05,
   'n_estimators': 800,
   'max_depth': 8,
   'subsample': 0.7,
   'gamma': 0,
   'min_child_weight': 1,
   'lambda': 1,
   'alpha': 0,
   'tree_method': 'approx'}]]