# Content

1. [Import](#1.-Import)
2. [Read data](#2.-Read-data)
3. [Data research](#3.-Data-research)
4. [Data preparation](#4.-Data-preparation)
    * [4.1. Text preparation functions](#4.1.-Text-preparation-functions)
    * [4.2. Text preparation](#4.2.-Text-preparation)
    * [4.3. Split test and train data](#4.3.-Split-test-and-train-data)
5. [Modeling](#5.-Modeling)
    * [5.1. Modeling import](#5.1.-Modeling-import)
    * [5.2. BernoulliNB](#5.2.-BernoulliNB)
    * [5.3. Decision Tree Classifier](#5.3.-Decision-Tree-Classifier)
    * [5.4. Extra Trees Classifier](#5.4.-ExtraTreesClassifier)
    * [5.5. KNeighbors Classifier](#5.5.-KNeighbors-Classifier)
    * [5.6. LinearSVC](#5.6.-LinearSVC)
    * [5.7. Logistic RegressionCV](#5.7.-LogisticRegressionCV)
    * [5.8. MLPClassifier](#5.8.-MLPClassifier)
    * [5.9. Random Forest Classifier](#5.9.-Random-Forest-Classifier)
    * [5.10. Ridge Classifier](#5.10.-Ridge-Classifier)
    * [5.11. RidgeClassifierCV](#5.11.-RidgeClassifierCV)
    * [5.12. SVC](#5.12.-SVC)
    * [5.13. Gradient Boosting Classifier](#5.13.-Gradient-Boosting-Classifier)
    * [5.14. LinearSVC](#5.14.-LinearSVC)
    * [5.15. LogisticRegression](#5.15.-LogisticRegression)
    * [5.16. SGDClassifier](#5.16.-SGDClassifier)
    * [5.17. Perceptron](#5.17.-Perceptron)
    * [5.18. Passive Aggressive Classifier](#5.18.-Passive-Aggressive-Classifier)
    * [5.19. Model evaluation](#5.19.-Model-evaluation)


### Work in progress...

# 1. Import

In [59]:
import os
import re
import string

In [60]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [61]:
# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [63]:
from uk_stemmer import UkStemmer

In [64]:
stemmer = UkStemmer()

def stemming(text):    
    prepared_text = text.lower()
    words = re.split(r'(\W)', prepared_text)
    words = [word for word in words if word != '']
    for i in range(len(words)):
        words[i] = stemmer.stem_word(words[i])
    stemmed_text = ''.join(words)
    return stemmed_text

# 2. Read data

In [65]:
train = pd.read_csv('data/data-simply-cleaned.csv')

In [66]:
train.text = train.text.apply(stemming)

# 3. Data research

In [67]:
train.head()

Unnamed: 0,text,target
0,"forty-one thousand, one hundred and fifteen за...",positive
1,замовил globex gu803 чохол globex eight thirty...,negative
2,замовил нов посудомийн машин 1102доставил one ...,negative
3,замов намет в неділ ввечер мен вон потрібн вже...,negative
4,замов навушник на розетц через оплат частин пе...,negative


In [68]:
train.shape

(3034, 2)

In [69]:
train.isna().sum()

text      0
target    0
dtype: int64

In [70]:
train.target.unique()

array(['positive', 'negative'], dtype=object)

In [71]:
text = train.text
target = train.target

# 4. Data preparation

In [72]:
def do_nothing(tokens):
    return tokens

# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=do_nothing, 
    preprocessor=None,
    lowercase=False,
    # ngram_range=(1, 2)
)

In [73]:
text_counts = tfidf_vectorizer.fit_transform(text)

### 4.3. Split test and train data

In [74]:
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, 
    target, 
    test_size=0.3, 
    random_state=1
)

In [75]:
print("X train shape: {0}".format(X_train.shape))
print("Y train shape: {0}".format(y_train.shape))
print("X test shape: {0}".format(X_test.shape))
print("Y test shape: {0}".format(y_test.shape))

X train shape: (2123, 78)
Y train shape: (2123,)
X test shape: (911, 78)
Y test shape: (911,)


# 5. Modeling

### 5.1. Modeling import

In [76]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV

In [77]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

In [78]:
def model_scoring(clf, X_test, y_test):
    predicted= clf.predict(X_test)
    print(classification_report(y_test, predicted))

### 5.2. BernoulliNB

In [79]:
%%time
# BernoulliNB

clf = BernoulliNB().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_bernouli_nb = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.75      0.69      0.72       590
    positive       0.50      0.56      0.53       321

    accuracy                           0.65       911
   macro avg       0.62      0.63      0.63       911
weighted avg       0.66      0.65      0.65       911

CPU times: user 42.4 ms, sys: 3.14 ms, total: 45.5 ms
Wall time: 46.4 ms


### 5.3. Decision Tree Classifier

In [80]:
%%time
# DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_decision_tree = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.73      0.71      0.72       590
    positive       0.49      0.52      0.50       321

    accuracy                           0.64       911
   macro avg       0.61      0.61      0.61       911
weighted avg       0.64      0.64      0.64       911

CPU times: user 217 ms, sys: 2.46 ms, total: 219 ms
Wall time: 231 ms


### 5.4. ExtraTreesClassifier

In [81]:
%%time
# ExtraTreesClassifier

clf = ExtraTreesClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_extra_tree = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.74      0.88      0.81       590
    positive       0.67      0.43      0.53       321

    accuracy                           0.73       911
   macro avg       0.71      0.66      0.67       911
weighted avg       0.72      0.73      0.71       911

CPU times: user 2.44 s, sys: 17.8 ms, total: 2.46 s
Wall time: 2.52 s


### 5.5. KNeighbors Classifier

In [82]:
%%time
# KNeighborsClassifier

clf = KNeighborsClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_knn = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.73      0.81      0.77       590
    positive       0.56      0.45      0.49       321

    accuracy                           0.68       911
   macro avg       0.64      0.63      0.63       911
weighted avg       0.67      0.68      0.67       911

CPU times: user 465 ms, sys: 35.3 ms, total: 501 ms
Wall time: 517 ms


### 5.6. LinearSVC

In [83]:
%%time
# LinearSVC  (setting multi_class=”crammer_singer”)

clf = LinearSVC(multi_class="crammer_singer").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_linear_svc = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.73      0.84      0.78       590
    positive       0.59      0.42      0.49       321

    accuracy                           0.69       911
   macro avg       0.66      0.63      0.63       911
weighted avg       0.68      0.69      0.68       911

CPU times: user 97.7 ms, sys: 3.1 ms, total: 101 ms
Wall time: 118 ms




### 5.7. LogisticRegressionCV

In [84]:
%%time
# LogisticRegressionCV(setting multi_class=”multinomial”)

clf = LogisticRegressionCV(multi_class="multinomial").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_logistic_cv = round(clf.score(X_test, y_test), 2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

    negative       0.78      0.83      0.80       590
    positive       0.64      0.56      0.60       321

    accuracy                           0.74       911
   macro avg       0.71      0.70      0.70       911
weighted avg       0.73      0.74      0.73       911

CPU times: user 3.65 s, sys: 18.3 ms, total: 3.67 s
Wall time: 3.7 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 5.8. MLPClassifier

In [85]:
%%time
# MLPClassifier

clf = MLPClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_mlp = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.79      0.77      0.78       590
    positive       0.60      0.63      0.61       321

    accuracy                           0.72       911
   macro avg       0.69      0.70      0.70       911
weighted avg       0.72      0.72      0.72       911

CPU times: user 8.73 s, sys: 671 ms, total: 9.4 s
Wall time: 4.88 s




### 5.9. Random Forest Classifier

In [86]:
%%time
# RandomForestClassifier()

clf = RandomForestClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_random_forest = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.74      0.86      0.80       590
    positive       0.64      0.45      0.52       321

    accuracy                           0.72       911
   macro avg       0.69      0.65      0.66       911
weighted avg       0.70      0.72      0.70       911

CPU times: user 2.01 s, sys: 26.1 ms, total: 2.03 s
Wall time: 1.95 s


### 5.10. Ridge Classifier

In [87]:
%%time
# RidgeClassifier

clf = RidgeClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_ridge = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.75      0.86      0.80       590
    positive       0.64      0.47      0.55       321

    accuracy                           0.72       911
   macro avg       0.70      0.67      0.67       911
weighted avg       0.71      0.72      0.71       911

CPU times: user 59.6 ms, sys: 6.54 ms, total: 66.1 ms
Wall time: 69 ms


### 5.11. RidgeClassifierCV

In [88]:
%%time
# RidgeClassifierCV

clf = RidgeClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_ridge_cv = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.75      0.86      0.80       590
    positive       0.64      0.47      0.55       321

    accuracy                           0.72       911
   macro avg       0.70      0.67      0.67       911
weighted avg       0.71      0.72      0.71       911

CPU times: user 52 ms, sys: 4.73 ms, total: 56.8 ms
Wall time: 60.8 ms


### 5.12. SVC

In [89]:
%%time
# SVC

clf = SVC().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_svc = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.73      0.82      0.77       590
    positive       0.57      0.44      0.49       321

    accuracy                           0.68       911
   macro avg       0.65      0.63      0.63       911
weighted avg       0.67      0.68      0.67       911

CPU times: user 1.09 s, sys: 9.71 ms, total: 1.1 s
Wall time: 1.13 s


### 5.13. Gradient Boosting Classifier

In [90]:
%%time
# GradientBoostingClassifier

clf = GradientBoostingClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_gbc = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.76      0.82      0.79       590
    positive       0.62      0.53      0.57       321

    accuracy                           0.72       911
   macro avg       0.69      0.67      0.68       911
weighted avg       0.71      0.72      0.71       911

CPU times: user 3.07 s, sys: 11.1 ms, total: 3.08 s
Wall time: 3.1 s


### 5.14. LinearSVC

In [91]:
%%time
# LinearSVC

clf = LinearSVC(multi_class = "ovr").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_linear_svc2 = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.76      0.85      0.80       590
    positive       0.65      0.50      0.56       321

    accuracy                           0.73       911
   macro avg       0.70      0.67      0.68       911
weighted avg       0.72      0.73      0.72       911

CPU times: user 71.1 ms, sys: 3.82 ms, total: 74.9 ms
Wall time: 76 ms


### 5.15. LogisticRegression

In [92]:
%%time
# LogisticRegression multi_class=”ovr”

clf = LogisticRegression(multi_class="ovr").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_logistic_reg = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.73      0.86      0.79       590
    positive       0.61      0.40      0.49       321

    accuracy                           0.70       911
   macro avg       0.67      0.63      0.64       911
weighted avg       0.69      0.70      0.68       911

CPU times: user 71.8 ms, sys: 2.97 ms, total: 74.8 ms
Wall time: 77.9 ms


### 5.16. SGDClassifier

In [93]:
%%time
# SGDClassifier

clf = SGDClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_sgd = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.78      0.77      0.78       590
    positive       0.59      0.60      0.59       321

    accuracy                           0.71       911
   macro avg       0.68      0.68      0.68       911
weighted avg       0.71      0.71      0.71       911

CPU times: user 59.6 ms, sys: 2.97 ms, total: 62.6 ms
Wall time: 63.3 ms


### 5.17. Perceptron

In [94]:
%%time
# Perceptron

clf = Perceptron().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_perceptron = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.87      0.53      0.66       590
    positive       0.50      0.85      0.63       321

    accuracy                           0.64       911
   macro avg       0.68      0.69      0.64       911
weighted avg       0.74      0.64      0.65       911

CPU times: user 45.5 ms, sys: 2.74 ms, total: 48.2 ms
Wall time: 48.7 ms


### 5.18. Passive Aggressive Classifier

In [95]:
%%time
# PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_pac = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.77      0.83      0.80       590
    positive       0.64      0.55      0.59       321

    accuracy                           0.73       911
   macro avg       0.71      0.69      0.70       911
weighted avg       0.73      0.73      0.73       911

CPU times: user 48.4 ms, sys: 2.41 ms, total: 50.8 ms
Wall time: 50.8 ms


### 5.19. Model evaluation

In [96]:
# evaluation

model_results = pd.DataFrame({
    'Models': [
        'BernoulliNB',
        'Decision Tree',
        'Extra Tree',
        'KNN',
        'Linear SVC',
        'Logistic Regression CV',
        'MLP',
        'Random Forest',
        'Ridge',
        'Ridge CV',
        'SVC',
        'GBC',
        'Linear SVC 2',
        'Logistic Regression',
        'SGDC',
        'Perceptron',
        'PAC'
    ],
    'Scores': [
        acc_bernouli_nb,
        acc_decision_tree,
        acc_extra_tree,
        acc_knn,
        acc_linear_svc,
        acc_logistic_cv,
        acc_mlp,
        acc_random_forest,
        acc_ridge,
        acc_ridge_cv,
        acc_svc,
        acc_gbc,
        acc_linear_svc2,
        acc_logistic_reg,
        acc_sgd,
        acc_perceptron,
        acc_pac
    ]
})
model_results.sort_values(by='Scores', ascending=False)

Unnamed: 0,Models,Scores
5,Logistic Regression CV,0.74
16,PAC,0.73
2,Extra Tree,0.73
12,Linear SVC 2,0.73
7,Random Forest,0.72
11,GBC,0.72
9,Ridge CV,0.72
8,Ridge,0.72
6,MLP,0.72
14,SGDC,0.71
