In [13]:
import numpy as np
import pandas as pd
import re

import nltk
import spacy
from nltk.corpus import stopwords

nlp = spacy.load('en')

In [14]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
!pip install fasttext

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 2.9MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3019556 sha256=247a667413e172504420082eeef33dd3256d0ace836371c72dd4e27fc5328cbb
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592
Successfully built fasttext
Installing collected packages:

In [15]:
col_names = ['questions', 'a', 'b']
data_df = pd.read_csv("https://raw.githubusercontent.com/VIthulan/travel-text-classification/master/data/5000TravelQuestionsDataset.csv", error_bad_lines=False,header=None, names=col_names, encoding='latin-1')

In [16]:
data_df.head()

Unnamed: 0,questions,a,b
0,What are the special things we (husband and me...,TTD,TTDSIG
1,What are the companies which organize shark fe...,TTD,TTDOTH
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA
3,What are the best places around Cape Town for ...,TTD,TTDSIG
4,What are the best places to stay for a family ...,ACM,ACMOTH


In [17]:
data_df['a'].value_counts()

TGU      1217
TTD      1139
TRS      1011
ACM       720
FOD       521
ENT       214
WTH       172
TGU\n       3
\nENT       2
TTD\n       1
Name: a, dtype: int64

# Text Preprocessing

In [18]:
stop_words = set(stopwords.words('english')) 

def remove_stopwords(text):
  word_tokens = nltk.word_tokenize(text) 
  filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  return " ".join(filtered_sentence)

In [19]:
# Remove all the special characters
data_df['processed_questions'] = data_df['questions'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_questions'] = data_df['processed_questions'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_questions'] = data_df['questions'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_questions'] = data_df['questions'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_questions'] = data_df['questions'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_questions'] = data_df['questions'].str.strip()
# Stop word removal
data_df['sw_removed_questions'] = data_df.processed_questions.apply(remove_stopwords)

In [20]:
# Remove all the special characters
data_df['processed_a'] = data_df['a'].str.replace(r'\W', ' ')
    # remove all single characters
data_df['processed_a'] = data_df['a'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    # Remove single characters from the start
data_df['processed_a'] = data_df['a'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    # Substituting multiple spaces with single space
data_df['processed_a'] = data_df['a'].str.replace(r'\s+', ' ')
    # Removing prefixed 'b'
data_df['processed_a'] = data_df['a'].str.replace(r'^b\s+', '')
    # Remove leading, trailing spaces
data_df['processed_a'] = data_df['a'].str.strip()

## Lemmatizing


In [21]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
  lem = [lemmatizer.lemmatize(w, pos="v") for w in nltk.word_tokenize(text)]
  return " ".join(lem)

In [22]:
data_df["question_lemmatized"] = data_df.processed_questions.apply(lemmatize_text)

In [23]:
data_df["question_lemmatized_sw"] = data_df.sw_removed_questions.apply(lemmatize_text)

In [24]:
data_df.head()

Unnamed: 0,questions,a,b,processed_questions,sw_removed_questions,processed_a,question_lemmatized,question_lemmatized_sw
0,What are the special things we (husband and me...,TTD,TTDSIG,What are the special things we (husband and me...,What special things ( husband ) 5 day stay Cap...,TTD,What be the special things we ( husband and me...,What special things ( husband ) 5 day stay Cap...
1,What are the companies which organize shark fe...,TTD,TTDOTH,What are the companies which organize shark fe...,What companies organize shark feeding events s...,TTD,What be the company which organize shark feed ...,What company organize shark feed events scuba ...
2,Is it safe for female traveller to go alone to...,TGU,TGUHEA,Is it safe for female traveller to go alone to...,Is safe female traveller go alone Cape Town ?,TGU,Is it safe for female traveller to go alone to...,Is safe female traveller go alone Cape Town ?
3,What are the best places around Cape Town for ...,TTD,TTDSIG,What are the best places around Cape Town for ...,What best places around Cape Town safari ?,TTD,What be the best place around Cape Town for sa...,What best place around Cape Town safari ?
4,What are the best places to stay for a family ...,ACM,ACMOTH,What are the best places to stay for a family ...,What best places stay family stay away nightli...,ACM,What be the best place to stay for a family to...,What best place stay family stay away nightlife ?


In [25]:
data_df['processed_a'].value_counts()

TGU    1220
TTD    1140
TRS    1011
ACM     720
FOD     521
ENT     216
WTH     172
Name: processed_a, dtype: int64

In [63]:
def sent_tokenize(sent):
    temp=nlp(sent)
    return [str(token) for token in temp if not token.is_stop]

In [65]:
X=[sent_tokenize(sent) for sent in data_df.question_lemmatized_sw]

In [86]:
X[:3]

[['special',
  'things',
  '(',
  'husband',
  ')',
  '5',
  'day',
  'stay',
  'Cape',
  'Town',
  '?'],
 ['company', 'organize', 'shark', 'feed', 'events', 'scuba', 'divers', '?'],
 ['safe', 'female', 'traveller', 'Cape', 'Town', '?']]

# FastText

In [26]:
# !pwd
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

In [27]:
!ls
!ls /content/cc.en.300.bin.gz

cc.en.300.bin.gz  sample_data
/content/cc.en.300.bin.gz


In [28]:
# from gensim.models.wrappers import FastText
# model = FastText.load_fasttext_format('cc.en.300.bin.gz')


In [29]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')



In [30]:
he = ft["teacher"]

array([-3.37157771e-02,  1.86038446e-02, -2.43569463e-02,  2.98006386e-02,
       -5.41805997e-02, -7.43865520e-02,  9.18784738e-02,  5.72655760e-02,
       -3.24940979e-02, -2.79742619e-03, -4.66130748e-02, -6.81853816e-02,
       -3.45561430e-02,  1.01253584e-01,  2.45907363e-02,  1.91315524e-02,
       -6.44066092e-03, -3.63045335e-02,  6.24461137e-02,  5.49906157e-02,
        1.62455009e-03,  2.67509967e-02,  7.10245520e-02,  2.77895182e-02,
       -3.60139966e-04, -2.80277729e-02,  1.63025986e-02,  3.47395032e-03,
        4.01230343e-02,  8.95214677e-02, -2.33499762e-02, -6.42535910e-02,
       -1.32474136e-02,  1.71577297e-02, -1.54677406e-02,  1.85072795e-02,
        1.22898594e-02, -5.67965209e-02, -2.69526951e-02, -5.92068210e-03,
       -1.43350456e-02, -1.55877015e-02, -8.71555954e-02,  1.33018717e-01,
       -7.59121031e-02, -1.79398712e-02,  3.16942446e-02,  2.57759914e-02,
       -2.78638862e-02, -9.29461643e-02,  3.61621156e-02,  8.17714036e-02,
       -4.87468205e-02,  

In [41]:
def get_embedding(word):
  embedding=ft[word]
  return embedding

In [54]:
def get_embedding_sent(word):
  embedding=ft.get_sentence_vector(word)
  return embedding

In [53]:
get_embedding('Is it safe for female traveller to go alone')

array([ 4.61856660e-04,  5.30219497e-03,  2.81710224e-03,  2.11101957e-02,
        8.62310175e-03, -2.36063134e-02, -2.60064751e-03,  2.36293320e-02,
       -1.81833152e-02, -3.58765433e-03,  5.69610018e-03, -5.22228237e-03,
        1.50106935e-04, -3.42817348e-03, -1.85040745e-03, -1.97240766e-02,
        1.63222030e-02, -1.50003396e-02, -8.22589733e-03,  3.29664797e-02,
       -1.33504858e-02, -2.08280631e-03,  4.92887199e-03, -1.24442643e-02,
        1.78519115e-02,  2.33267806e-02,  5.20195812e-03,  5.43088745e-03,
        2.98720249e-03,  4.55845371e-02, -9.50518704e-04, -1.02203898e-02,
        7.97673594e-03,  2.25421065e-03,  1.58228129e-02,  3.16840503e-03,
        2.10847650e-02, -2.70279148e-03, -1.10337965e-03,  2.15005632e-02,
       -1.28924493e-02,  6.60970295e-03,  7.61919189e-03, -7.86937773e-03,
        1.89654492e-02,  2.48041749e-02, -3.07901087e-03,  2.39925515e-02,
        9.64020845e-03,  1.35175057e-03,  9.43492539e-03, -1.42091857e-02,
       -7.95707386e-03,  

In [55]:
get_embedding_sent('Is it safe for female traveller to go alone')

array([ 1.99353285e-02, -2.51277350e-02, -1.29354326e-02,  3.10847759e-02,
       -3.50363031e-02, -1.14462841e-02,  3.08552980e-02,  9.53554083e-03,
       -1.79380423e-03, -1.49666965e-02, -1.86224245e-02, -1.50225013e-02,
        2.10175174e-03,  5.27123967e-03, -6.96123624e-03,  4.69980612e-02,
        5.19246678e-04, -1.87728945e-02, -2.00445130e-02,  8.59542862e-02,
       -9.66099184e-03,  5.33650443e-03,  3.23488307e-03, -3.14749330e-02,
       -3.98532860e-02, -2.82699596e-02, -2.55425237e-02, -1.06854122e-02,
       -3.87414801e-03,  7.34022781e-02, -3.08823287e-02,  2.22750474e-03,
        7.48492777e-03,  1.34861572e-02,  5.56118460e-03, -3.13137134e-04,
       -2.36102808e-02,  1.56331845e-02,  3.00640408e-02,  9.66661237e-03,
       -4.77534207e-03, -1.48985190e-02,  7.59487692e-03,  1.63618047e-02,
        1.23426944e-04, -1.32260844e-02, -2.22837441e-02,  1.83735415e-02,
        5.57468832e-03, -1.22733796e-02,  1.92157421e-02,  7.16022588e-03,
       -5.08490438e-03, -

In [56]:
X_train_embeddings=[np.mean(np.array(list(map(get_embedding,tok_sent))),axis=0) for tok_sent in data_df["question_lemmatized"]]
X_train_sent_embedding=[np.mean(np.array(list(map(get_embedding_sent,tok_sent))),axis=0) for tok_sent in data_df["question_lemmatized"]]

In [66]:
X_train_wt_embedding=[np.mean(np.array(list(map(get_embedding_sent,tok_sent))),axis=0) for tok_sent in X]

In [88]:
X_wt_emb=[np.mean(np.array(list(map(get_embedding,tok_sent))),axis=0) for tok_sent in X]

In [43]:
len(X_train_embeddings)
X_train_embeddings[10].shape

(300,)

In [52]:
x_encoded = np.array(X_train_embeddings)
x_encoded.shape

(5000, 300)

In [67]:
x_encoded_wt = np.array(X_train_wt_embedding)
x_encoded_wt.shape

(5000, 300)

In [89]:
x_encoded_wt_word = np.array(X_wt_emb)
x_encoded_wt_word.shape

(5000, 300)

In [45]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y_encoded = le.fit_transform(data_df['processed_a'])

In [51]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(x_encoded):
    fold += 1
    X_train, X_test = x_encoded[train_index], x_encoded[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    print("Beginning fold: ", fold)
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM1 = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM1, y_test)*100
    accuracies.append(acc)
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Beginning fold:  1
K-Fold: 1 - Accuracy:  - 37.20
Beginning fold:  2
K-Fold: 2 - Accuracy:  - 41.20
Beginning fold:  3
K-Fold: 3 - Accuracy:  - 37.00
Beginning fold:  4
K-Fold: 4 - Accuracy:  - 41.80
Beginning fold:  5
K-Fold: 5 - Accuracy:  - 39.40
Beginning fold:  6
K-Fold: 6 - Accuracy:  - 42.20
Beginning fold:  7
K-Fold: 7 - Accuracy:  - 37.80
Beginning fold:  8
K-Fold: 8 - Accuracy:  - 39.40
Beginning fold:  9
K-Fold: 9 - Accuracy:  - 39.40
Beginning fold:  10
K-Fold: 10 - Accuracy:  - 37.80
Mean 39.32 Std 1.80


In [57]:
x_encoded_sent = np.array(X_train_sent_embedding)
x_encoded_sent.shape

(5000, 300)

In [62]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(x_encoded_sent):
    fold += 1
    X_train, X_test = x_encoded_sent[train_index], x_encoded_sent[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    print("Beginning fold: ", fold)
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM1 = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM1, y_test)*100
    accuracies.append(acc)
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Beginning fold:  1
K-Fold: 1 - Accuracy:  - 23.00
Beginning fold:  2
K-Fold: 2 - Accuracy:  - 28.60
Beginning fold:  3
K-Fold: 3 - Accuracy:  - 27.00
Beginning fold:  4
K-Fold: 4 - Accuracy:  - 27.80
Beginning fold:  5
K-Fold: 5 - Accuracy:  - 27.20
Beginning fold:  6
K-Fold: 6 - Accuracy:  - 33.40
Beginning fold:  7
K-Fold: 7 - Accuracy:  - 23.00
Beginning fold:  8
K-Fold: 8 - Accuracy:  - 24.80
Beginning fold:  9
K-Fold: 9 - Accuracy:  - 27.80
Beginning fold:  10
K-Fold: 10 - Accuracy:  - 23.20
Mean 26.58 Std 3.07


In [68]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(x_encoded_wt):
    fold += 1
    X_train, X_test = x_encoded_wt[train_index], x_encoded_wt[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    print("Beginning fold: ", fold)
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train,y_train)
    predictions_SVM1 = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM1, y_test)*100
    accuracies.append(acc)
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Beginning fold:  1
K-Fold: 1 - Accuracy:  - 77.40
Beginning fold:  2
K-Fold: 2 - Accuracy:  - 76.00
Beginning fold:  3
K-Fold: 3 - Accuracy:  - 76.60
Beginning fold:  4
K-Fold: 4 - Accuracy:  - 76.20
Beginning fold:  5
K-Fold: 5 - Accuracy:  - 77.00
Beginning fold:  6
K-Fold: 6 - Accuracy:  - 78.80
Beginning fold:  7
K-Fold: 7 - Accuracy:  - 76.80
Beginning fold:  8
K-Fold: 8 - Accuracy:  - 78.60
Beginning fold:  9
K-Fold: 9 - Accuracy:  - 77.20
Beginning fold:  10
K-Fold: 10 - Accuracy:  - 77.60
Mean 77.22 Std 0.88


# Hyperparamter tuning

In [78]:
from sklearn.model_selection import GridSearchCV 
from sklearn import svm

param_grid = {'C': [10,11,12,13,14,15,16,17,18,19,20],  
              'gamma': [0.01, 0.001, 0.005, 0.008]}

In [79]:
grid = GridSearchCV(svm.SVC( break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), param_grid, refit = True, verbose = 2, cv=3, n_jobs=4) 

grid.fit(x_encoded_wt[:1000], y_encoded[:1000])

Fitting 3 folds for each of 44 candidates, totalling 132 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    7.9s
[Parallel(n_jobs=4)]: Done 132 out of 132 | elapsed:   23.0s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3, gamma=0.1,
                           kernel='linear', max_iter=-1, probability=False,
                           random_state=None, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=4,
             param_grid={'C': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
                         'gamma': [0.01, 0.001, 0.005, 0.008]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [80]:
grid.best_estimator_

SVC(C=12, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Model

In [81]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


cv = KFold(n_splits=10, random_state=1, shuffle=True)
fold = 0
accuracies = []
for train_index, test_index in cv.split(x_encoded_wt):
    fold += 1
    X_train, X_test = x_encoded_wt[train_index], x_encoded_wt[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    print("Beginning fold: ", fold)
    SVM = SVC(C=12, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
    SVM.fit(X_train,y_train)
    predictions_SVM1 = SVM.predict(X_test)
    acc = accuracy_score(predictions_SVM1, y_test)*100
    accuracies.append(acc)
    print("K-Fold: {} - {} - {:.2f}".format(fold, "Accuracy: ",acc))
    
print("Mean {:.2f} Std {:.2f}".format(np.mean(accuracies), np.std(accuracies)))

Beginning fold:  1
K-Fold: 1 - Accuracy:  - 80.40
Beginning fold:  2
K-Fold: 2 - Accuracy:  - 78.00
Beginning fold:  3
K-Fold: 3 - Accuracy:  - 77.40
Beginning fold:  4
K-Fold: 4 - Accuracy:  - 80.20
Beginning fold:  5
K-Fold: 5 - Accuracy:  - 78.60
Beginning fold:  6
K-Fold: 6 - Accuracy:  - 78.20
Beginning fold:  7
K-Fold: 7 - Accuracy:  - 80.00
Beginning fold:  8
K-Fold: 8 - Accuracy:  - 79.00
Beginning fold:  9
K-Fold: 9 - Accuracy:  - 79.00
Beginning fold:  10
K-Fold: 10 - Accuracy:  - 82.60
Mean 79.34 Std 1.43


In [82]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def print_report(y_test, y_pred):
    # Classification Report
    print(classification_report(y_test,y_pred))

    acc = accuracy_score(y_test, y_pred)*100
    print('Accuracy score: %.3f' % acc)

    print('F1 Score: %.3f' % f1_score(y_test, y_pred, average='weighted'))
    
    cm = confusion_matrix(y_test, y_pred)
    print("Confustion matrix: \n{}".format(cm))

In [84]:
print_report(predictions_SVM1,y_test)

              precision    recall  f1-score   support

           0       0.88      0.85      0.86        79
           1       0.78      0.69      0.73        26
           2       0.93      0.90      0.92        62
           3       0.73      0.80      0.77       102
           4       0.86      0.88      0.87        96
           5       0.78      0.76      0.77       117
           6       1.00      0.94      0.97        18

    accuracy                           0.83       500
   macro avg       0.85      0.83      0.84       500
weighted avg       0.83      0.83      0.83       500

Accuracy score: 82.600
F1 Score: 0.827
Confustion matrix: 
[[67  0  1  4  3  4  0]
 [ 1 18  0  3  1  3  0]
 [ 2  0 56  2  0  2  0]
 [ 1  2  0 82  6 11  0]
 [ 0  1  0  7 84  4  0]
 [ 5  2  3 14  4 89  0]
 [ 0  0  0  0  0  1 17]]


# Summary

I was able to get the F1 score of 82.6% with FastText. Used `cc.en.300.bin.gz` Below is the best results. 
```
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        79
           1       0.78      0.69      0.73        26
           2       0.93      0.90      0.92        62
           3       0.73      0.80      0.77       102
           4       0.86      0.88      0.87        96
           5       0.78      0.76      0.77       117
           6       1.00      0.94      0.97        18

    accuracy                           0.83       500
   macro avg       0.85      0.83      0.84       500
weighted avg       0.83      0.83      0.83       500
```

* Accuracy score: 82.600
* F1 Score: 0.827
* Confustion matrix: 
```
[[67  0  1  4  3  4  0]
 [ 1 18  0  3  1  3  0]
 [ 2  0 56  2  0  2  0]
 [ 1  2  0 82  6 11  0]
 [ 0  1  0  7 84  4  0]
 [ 5  2  3 14  4 89  0]
 [ 0  0  0  0  0  1 17]]
 ```