# Content

1. [Import](#1.-Import)
2. [Read data](#2.-Read-data)
3. [Data research](#3.-Data-research)
4. [Data preparation](#4.-Data-preparation)
    * [4.1. Text preparation functions](#4.1.-Text-preparation-functions)
    * [4.2. Text preparation](#4.2.-Text-preparation)
    * [4.3. Split test and train data](#4.3.-Split-test-and-train-data)
5. [Modeling](#5.-Modeling)
    * [5.1. Modeling import](#5.1.-Modeling-import)
    * [5.2. BernoulliNB](#5.2.-BernoulliNB)
    * [5.3. Decision Tree Classifier](#5.3.-Decision-Tree-Classifier)
    * [5.4. Extra Trees Classifier](#5.4.-ExtraTreesClassifier)
    * [5.5. KNeighbors Classifier](#5.5.-KNeighbors-Classifier)
    * [5.6. LinearSVC](#5.6.-LinearSVC)
    * [5.7. Logistic RegressionCV](#5.7.-LogisticRegressionCV)
    * [5.8. MLPClassifier](#5.8.-MLPClassifier)
    * [5.9. Random Forest Classifier](#5.9.-Random-Forest-Classifier)
    * [5.10. Ridge Classifier](#5.10.-Ridge-Classifier)
    * [5.11. RidgeClassifierCV](#5.11.-RidgeClassifierCV)
    * [5.12. SVC](#5.12.-SVC)
    * [5.13. Gradient Boosting Classifier](#5.13.-Gradient-Boosting-Classifier)
    * [5.14. LinearSVC](#5.14.-LinearSVC)
    * [5.15. LogisticRegression](#5.15.-LogisticRegression)
    * [5.16. SGDClassifier](#5.16.-SGDClassifier)
    * [5.17. Perceptron](#5.17.-Perceptron)
    * [5.18. Passive Aggressive Classifier](#5.18.-Passive-Aggressive-Classifier)
    * [5.19. Model evaluation](#5.19.-Model-evaluation)


### Work in progress...

# 1. Import

In [1]:
import os
import re
import string

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [5]:
from uk_stemmer import UkStemmer

In [6]:
stemmer = UkStemmer()

def stemming(text):    
    prepared_text = text.lower()
    words = re.split(r'(\W)', prepared_text)
    words = [word for word in words if word != '']
    for i in range(len(words)):
        words[i] = stemmer.stem_word(words[i])
    stemmed_text = ''.join(words)
    return stemmed_text

# 2. Read data

In [7]:
train = pd.read_csv('data/simply-cleaned-data-v2.csv')

In [8]:
train.text = train.text.apply(stemming)

# 3. Data research

In [9]:
train.head()

Unnamed: 0,text,target
0,замов ноутбук lenovo b мишк подарунок слов оп...,positive
1,замовил globex gu чохол globex клавіатур шкі...,negative
2,замовил нов посудомиин машин доставил доставк...,negative
3,замов намет неділ ввечер потрібн вже четвер са...,negative
4,замов навушник розетц оплат частин перш платіж...,negative


In [10]:
train.shape

(3034, 2)

In [11]:
train.isna().sum()

text      0
target    0
dtype: int64

In [12]:
train.target.unique()

array(['positive', 'negative'], dtype=object)

In [13]:
text = train.text
target = train.target

# 4. Data preparation

In [14]:
def do_nothing(tokens):
    return tokens

# TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=do_nothing, 
    preprocessor=None,
    lowercase=False,
    # ngram_range=(1, 2)
)

In [15]:
text_counts = tfidf_vectorizer.fit_transform(text)

### 4.3. Split test and train data

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, 
    target, 
    test_size=0.3, 
    random_state=1
)

In [17]:
print("X train shape: {0}".format(X_train.shape))
print("Y train shape: {0}".format(y_train.shape))
print("X test shape: {0}".format(X_test.shape))
print("Y test shape: {0}".format(y_test.shape))

X train shape: (2123, 62)
Y train shape: (2123,)
X test shape: (911, 62)
Y test shape: (911,)


# 5. Modeling

### 5.1. Modeling import

In [18]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV 
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV

In [19]:
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

In [20]:
def model_scoring(clf, X_test, y_test):
    predicted= clf.predict(X_test)
    print(classification_report(y_test, predicted))

### 5.2. BernoulliNB

In [21]:
%%time
# BernoulliNB

clf = BernoulliNB().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_bernouli_nb = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.68      0.53      0.60       590
    positive       0.39      0.55      0.45       321

    accuracy                           0.54       911
   macro avg       0.54      0.54      0.53       911
weighted avg       0.58      0.54      0.55       911

CPU times: user 40.6 ms, sys: 3.07 ms, total: 43.6 ms
Wall time: 45.4 ms


### 5.3. Decision Tree Classifier

In [22]:
%%time
# DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_decision_tree = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.68      0.64      0.66       590
    positive       0.40      0.45      0.43       321

    accuracy                           0.57       911
   macro avg       0.54      0.55      0.54       911
weighted avg       0.58      0.57      0.58       911

CPU times: user 178 ms, sys: 2.98 ms, total: 181 ms
Wall time: 181 ms


### 5.4. ExtraTreesClassifier

In [23]:
%%time
# ExtraTreesClassifier

clf = ExtraTreesClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_extra_tree = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.68      0.94      0.79       590
    positive       0.65      0.20      0.30       321

    accuracy                           0.68       911
   macro avg       0.67      0.57      0.55       911
weighted avg       0.67      0.68      0.62       911

CPU times: user 2.56 s, sys: 36.3 ms, total: 2.6 s
Wall time: 2.71 s


### 5.5. KNeighbors Classifier

In [24]:
%%time
# KNeighborsClassifier

clf = KNeighborsClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_knn = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.68      0.83      0.75       590
    positive       0.48      0.29      0.36       321

    accuracy                           0.64       911
   macro avg       0.58      0.56      0.55       911
weighted avg       0.61      0.64      0.61       911

CPU times: user 466 ms, sys: 37 ms, total: 504 ms
Wall time: 545 ms


### 5.6. LinearSVC

In [25]:
%%time
# LinearSVC  (setting multi_class=”crammer_singer”)

clf = LinearSVC(multi_class="crammer_singer").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_linear_svc = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.65      1.00      0.79       590
    positive       0.86      0.02      0.04       321

    accuracy                           0.65       911
   macro avg       0.75      0.51      0.41       911
weighted avg       0.72      0.65      0.52       911

CPU times: user 142 ms, sys: 5.17 ms, total: 147 ms
Wall time: 180 ms




### 5.7. LogisticRegressionCV

In [26]:
%%time
# LogisticRegressionCV(setting multi_class=”multinomial”)

clf = LogisticRegressionCV(multi_class="multinomial").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_logistic_cv = round(clf.score(X_test, y_test), 2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

              precision    recall  f1-score   support

    negative       0.73      0.88      0.80       590
    positive       0.64      0.41      0.50       321

    accuracy                           0.71       911
   macro avg       0.69      0.64      0.65       911
weighted avg       0.70      0.71      0.69       911

CPU times: user 2.99 s, sys: 25.3 ms, total: 3.02 s
Wall time: 3.14 s


### 5.8. MLPClassifier

In [27]:
%%time
# MLPClassifier

clf = MLPClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_mlp = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.74      0.85      0.79       590
    positive       0.61      0.44      0.51       321

    accuracy                           0.70       911
   macro avg       0.67      0.64      0.65       911
weighted avg       0.69      0.70      0.69       911

CPU times: user 7.47 s, sys: 574 ms, total: 8.04 s
Wall time: 4.18 s




### 5.9. Random Forest Classifier

In [28]:
%%time
# RandomForestClassifier()

clf = RandomForestClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_random_forest = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.69      0.92      0.79       590
    positive       0.63      0.26      0.37       321

    accuracy                           0.68       911
   macro avg       0.66      0.59      0.58       911
weighted avg       0.67      0.68      0.64       911

CPU times: user 1.85 s, sys: 26.8 ms, total: 1.88 s
Wall time: 1.83 s


### 5.10. Ridge Classifier

In [29]:
%%time
# RidgeClassifier

clf = RidgeClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_ridge = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.70      0.92      0.80       590
    positive       0.65      0.28      0.39       321

    accuracy                           0.69       911
   macro avg       0.68      0.60      0.59       911
weighted avg       0.68      0.69      0.65       911

CPU times: user 51.8 ms, sys: 3.71 ms, total: 55.5 ms
Wall time: 57.5 ms


### 5.11. RidgeClassifierCV

In [30]:
%%time
# RidgeClassifierCV

clf = RidgeClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_ridge_cv = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.70      0.92      0.80       590
    positive       0.65      0.28      0.39       321

    accuracy                           0.69       911
   macro avg       0.68      0.60      0.59       911
weighted avg       0.68      0.69      0.65       911

CPU times: user 49.4 ms, sys: 4.07 ms, total: 53.4 ms
Wall time: 53.7 ms


### 5.12. SVC

In [31]:
%%time
# SVC

clf = SVC().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_svc = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.66      0.99      0.79       590
    positive       0.74      0.05      0.10       321

    accuracy                           0.66       911
   macro avg       0.70      0.52      0.44       911
weighted avg       0.69      0.66      0.55       911

CPU times: user 831 ms, sys: 11.8 ms, total: 843 ms
Wall time: 865 ms


### 5.13. Gradient Boosting Classifier

In [32]:
%%time
# GradientBoostingClassifier

clf = GradientBoostingClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_gbc = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.72      0.89      0.79       590
    positive       0.64      0.36      0.46       321

    accuracy                           0.70       911
   macro avg       0.68      0.63      0.63       911
weighted avg       0.69      0.70      0.68       911

CPU times: user 2.29 s, sys: 8.94 ms, total: 2.3 s
Wall time: 2.33 s


### 5.14. LinearSVC

In [33]:
%%time
# LinearSVC

clf = LinearSVC(multi_class = "ovr").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_linear_svc2 = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.71      0.90      0.79       590
    positive       0.63      0.33      0.43       321

    accuracy                           0.70       911
   macro avg       0.67      0.61      0.61       911
weighted avg       0.68      0.70      0.67       911

CPU times: user 69 ms, sys: 3.1 ms, total: 72.1 ms
Wall time: 75.4 ms


### 5.15. LogisticRegression

In [34]:
%%time
# LogisticRegression multi_class=”ovr”

clf = LogisticRegression(multi_class="ovr").fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_logistic_reg = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.66      0.98      0.79       590
    positive       0.72      0.09      0.16       321

    accuracy                           0.67       911
   macro avg       0.69      0.54      0.48       911
weighted avg       0.69      0.67      0.57       911

CPU times: user 68.1 ms, sys: 1.97 ms, total: 70.1 ms
Wall time: 73.4 ms


### 5.16. SGDClassifier

In [35]:
%%time
# SGDClassifier

clf = SGDClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_sgd = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.69      0.93      0.79       590
    positive       0.65      0.25      0.36       321

    accuracy                           0.69       911
   macro avg       0.67      0.59      0.57       911
weighted avg       0.68      0.69      0.64       911

CPU times: user 46.1 ms, sys: 2.36 ms, total: 48.5 ms
Wall time: 58.4 ms


### 5.17. Perceptron

In [36]:
%%time
# Perceptron

clf = Perceptron().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_perceptron = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.83      0.40      0.54       590
    positive       0.44      0.85      0.58       321

    accuracy                           0.56       911
   macro avg       0.63      0.63      0.56       911
weighted avg       0.69      0.56      0.55       911

CPU times: user 37.5 ms, sys: 1.98 ms, total: 39.5 ms
Wall time: 49.2 ms


### 5.18. Passive Aggressive Classifier

In [37]:
%%time
# PassiveAggressiveClassifier

clf = PassiveAggressiveClassifier().fit(X_train, y_train)
model_scoring(clf, X_test, y_test)

acc_pac = round(clf.score(X_test, y_test), 2)

              precision    recall  f1-score   support

    negative       0.71      0.92      0.80       590
    positive       0.67      0.31      0.42       321

    accuracy                           0.70       911
   macro avg       0.69      0.61      0.61       911
weighted avg       0.69      0.70      0.67       911

CPU times: user 40.6 ms, sys: 2.08 ms, total: 42.7 ms
Wall time: 46.6 ms


### 5.19. Model evaluation

In [38]:
# evaluation

model_results = pd.DataFrame({
    'Models': [
        'BernoulliNB',
        'Decision Tree',
        'Extra Tree',
        'KNN',
        'Linear SVC',
        'Logistic Regression CV',
        'MLP',
        'Random Forest',
        'Ridge',
        'Ridge CV',
        'SVC',
        'GBC',
        'Linear SVC 2',
        'Logistic Regression',
        'SGDC',
        'Perceptron',
        'PAC'
    ],
    'Scores': [
        acc_bernouli_nb,
        acc_decision_tree,
        acc_extra_tree,
        acc_knn,
        acc_linear_svc,
        acc_logistic_cv,
        acc_mlp,
        acc_random_forest,
        acc_ridge,
        acc_ridge_cv,
        acc_svc,
        acc_gbc,
        acc_linear_svc2,
        acc_logistic_reg,
        acc_sgd,
        acc_perceptron,
        acc_pac
    ]
})
model_results.sort_values(by='Scores', ascending=False)

Unnamed: 0,Models,Scores
5,Logistic Regression CV,0.71
16,PAC,0.7
12,Linear SVC 2,0.7
11,GBC,0.7
6,MLP,0.7
9,Ridge CV,0.69
14,SGDC,0.69
8,Ridge,0.69
7,Random Forest,0.68
2,Extra Tree,0.68
