In [26]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
    border: 1px grey solid !important;
  color: white !important;
}
</style>

In [27]:
import os
import re
import time
import collections
import emoji
import demoji
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# processing of text
import stanza
import spacy
import text_functions as text_f
from stopwords import stopwords
from pyMorfologik import Morfologik
from autocorrect import Speller

#plot
from plot_functions import plot_word_cloud, count_plot_words

# models
import torch
# from lazypredict.Supervised import LazyClassifier
import xgboost
from model_functions import train_validation_test_split
from model_functions import split_train_val_test
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


In [28]:
# setting display
pd.options.mode.chained_assignment = None
pd.options.display.max_colwidth = 300
pd.set_option("display.colheader_justify","left")
%matplotlib inline

# load and download
demoji.download_codes()
spell = Speller('pl')
lemma_spacy = spacy.load('pl_spacy_model')

Downloading emoji data ...
... OK (Got response in 0.56 seconds)
Writing emoji data to C:\Users\patry\.demoji\codes.json ...
... OK


In [29]:
# doc = lemma_spacy("aaaaaaa dlacgego wiedddddiec wiefdieć")
# for token in doc:
#     print(token.text, token.lemma_, token.pos_)

In [30]:
# stanza.download(lang="pl")
# nlp = stanza.Pipeline(lang='pl', processors='tokenize,mwt,pos,lemma')
# doc = nlp('Jak czemu doszlo coś proooooba aaaaaaaaaaaa.')
# print(*[f'word: {word.text+" "}\tlemma: {word.lemma}' for sent in doc.sentences for word in sent.words], sep='\n')

In [171]:
# Load the three labelled corpora. Malformed rows are skipped with a warning
# instead of aborting the read. `on_bad_lines='warn'` is the replacement for
# `error_bad_lines=False` (deprecated in pandas 1.3, removed in pandas 2.0).
data_poleval_raw = pd.read_csv('Labeled_data/converted_label_poleval.txt',
                               on_bad_lines='warn', sep=",")
data_poleval_test_raw = pd.read_csv('Labeled_data/converted_label_poleval_test.txt',
                                    on_bad_lines='warn', sep=",")
data_github_raw = pd.read_csv('Labeled_data/labeled_dataset.txt',
                              on_bad_lines='warn', sep="|")

# Work on deep copies so the *_raw frames stay untouched.
data_poleval = data_poleval_raw.copy(deep=True)
data_poleval_test = data_poleval_test_raw.copy(deep=True)
data_github = data_github_raw.copy(deep=True)
datasets = [data_poleval, data_poleval_test, data_github]

# The GitHub corpus carries two extra columns ('No', 'Yes') that are dropped
# before the three frames are stacked row-wise under a fresh 0..n-1 index.
conected_data = pd.concat(
    [data_poleval,
     data_poleval_test,
     data_github.drop(['No', 'Yes'], axis=1)],
    axis=0,
    ignore_index=True,
)

In [172]:
conected_data = conected_data.head(10)

In [173]:
conected_data

Unnamed: 0,Comment,Kind of offensive language
0,"Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.",0
1,@anonymized_account @anonymized_account Brawo ty Daria kibic ma być na dobre i złe,0
2,"@anonymized_account @anonymized_account Super, polski premier składa kwiaty na grobach kolaborantów. Ale doczekaliśmy czasów.",0
3,@anonymized_account @anonymized_account Musi. Innej drogi nie mamy.,0
4,"Odrzut natychmiastowy, kwaśna mina, mam problem",0
5,"Jaki on był fajny xdd pamiętam, że spóźniłam się na jego pierwsze zajęcia i to sporo i za karę kazał mi usiąść w pierwszej ławce XD",0
6,@anonymized_account No nie ma u nas szczęścia 😉,0
7,@anonymized_account Dawno kogoś tak wrednego nie widziałam xd,0
8,"@anonymized_account @anonymized_account Zaległości były, ale ważne czy były wezwania do zapłaty z których się klub nie wywiązał.",0
9,@anonymized_account @anonymized_account @anonymized_account Gdzie jest @anonymized_account . Brudziński jesteś kłamcą i marnym kutasem @anonymized_account,1


In [114]:
conected_data['Comment'] = conected_data['Comment'].apply(lambda x: [word.lemma_ for word in lemma_spacy(x)] for x) 

SyntaxError: invalid syntax (<ipython-input-114-78e0dc1eb159>, line 1)

In [115]:
conected_data['Comment'] = conected_data['Comment'].apply(lambda x: [token.lemma_ for token in lemma_spacy(x)])

In [170]:
conected_data

Unnamed: 0,Comment,Kind of offensive language
0,"[faworyta, tytuł, cracovia, zobaczyć, typ, sprawdzić]",0
1,"[brawo, daria, kibic, dobry, zły]",0
2,"[super, polski, premiera, składać, kwiat, grób, kolaborant, doczekać, czas]",0
3,"[inny, droga, mama]",0
4,"[odrzut, natychmiastowy, kwaśny, mina, problem]",0
5,"[fajny, xdd, pamiętać, spóźnić, pierwszy, zajęcie, sporo, kara, kazać, usiąść, pierwszy, ławka]",0
6,[szczęście],0
7,"[dawno, ktoś, wredny, widziałam]",0
8,"[zaległość, ważny, wezwanie, zapłata, kluba, wywiązać]",0
9,"[budziński, być, kłamca, marny, kutas]",1


In [190]:
class Preprocessing:
    """Load the labelled comment corpora and normalise the text step by step.

    Every cleaning method overwrites ``self.x_raw``, so the methods are meant
    to be called in the order used in the notebook: ``load_data`` ->
    ``remove_quoting_comments`` -> ``demojize`` -> ``clean_text`` ->
    ``stopwords_remove`` -> ``lemmatize_text`` -> ``correct_typo_words``.

    Relies on notebook-level objects: ``stopwords``, ``lemma_spacy`` and
    ``spell``.
    """

    def __init__(self):
        self.vocabulary = None
        self.x_tokenized = None
        self.x_raw = None
        self.y = None

        self.x_train = None
        self.x_val = None
        self.x_test = None
        self.y_train = None
        self.y_val = None
        self.y_test = None

    def load_data(self, n_rows=10):
        """Read and stack the three labelled files into ``x_raw`` and ``y``.

        Args:
            n_rows: number of leading rows to keep (``None`` keeps all rows).
                The same slice is applied to the comments and to the labels so
                that both stay aligned.
        """
        # `on_bad_lines='warn'` skips malformed rows with a warning; it replaces
        # `error_bad_lines=False`, which was removed in pandas 2.0.
        data_poleval = pd.read_csv(
            'Labeled_data/converted_label_poleval.txt',
            on_bad_lines='warn', sep=",")
        data_poleval_test = pd.read_csv(
            'Labeled_data/converted_label_poleval_test.txt',
            on_bad_lines='warn', sep=",")
        data_github = pd.read_csv('Labeled_data/labeled_dataset.txt',
                                  on_bad_lines='warn', sep="|")

        conected_data = pd.concat(
            [data_poleval,
             data_poleval_test,
             data_github.drop(['No', 'Yes'], axis=1)],
            axis=0,
            ignore_index=True)
        if n_rows is not None:
            conected_data = conected_data.head(n_rows)

        self.x_raw = conected_data['Comment']
        self.y = conected_data['Kind of offensive language']

    def remove_quoting_comments(self):
        """Drop comments that start with ``RT`` and keep ``y`` aligned."""
        keep = ~self.x_raw.str.startswith('RT', na=False)
        self.x_raw = self.x_raw[keep].reset_index(drop=True)
        # Filter the labels with the same mask, otherwise rows and labels
        # drift apart as soon as one comment is removed.
        if self.y is not None and len(self.y) == len(keep):
            self.y = self.y[keep.to_numpy()].reset_index(drop=True)

    def demojize(self):
        """Replace each emoji with its textual name wrapped in ``~~``."""
        self.x_raw = self.x_raw.apply(
            lambda x: emoji.demojize(x, delimiters=("~~", "~~")))

    def clean_text(self, stopwords_remove=False):
        """Strip mentions, links, punctuation and digits, then lowercase.

        Args:
            stopwords_remove: when true, ``self.stopwords_remove()`` is run
                after the cleaning steps.
        """
        substitutions = [
            # remove @name mentions (and the whitespace after them)
            (re.compile(r'@\w+[\s]*'), ''),
            # split emoji names: the "~~" delimiters become spaces
            (re.compile(r"~{2}"), ' '),
            # remove http/https links
            (re.compile(r"https?[:\/\/]+[a-zA-Z0-9.\-\/?=_~:#%]+"), ''),
            # remove punctuation and numbers (keep letters, "_" and whitespace)
            (re.compile(r'[^_ąćęłńóśźżĄĆĘŁŃÓŚŹŻa-zA-Z\s]'), ''),
            # collapse runs of whitespace into a single space
            (re.compile(r'\s+'), ' '),
        ]
        cleaned = self.x_raw
        for pattern, replacement in substitutions:
            # regex=True is required: since pandas 2.0 the default is
            # regex=False, which rejects compiled patterns.
            cleaned = cleaned.str.replace(pattern, replacement, regex=True)

        # remove leading/trailing space and capitalisation
        self.x_raw = cleaned.str.strip().str.lower()

        if stopwords_remove:
            self.stopwords_remove()

    def stopwords_remove(self):
        """Drop stopwords and tokens shorter than three characters."""
        def keep_informative(text):
            return ' '.join(item for item in text.split()
                            if item not in stopwords and len(item) >= 3)

        self.x_raw = self.x_raw.apply(keep_informative)

    def lemmatize_text(self):
        """Turn every comment string into the list of its spaCy lemmas."""
        self.x_raw = self.x_raw.apply(
            lambda x: [token.lemma_ for token in lemma_spacy(x)])

    def correct_typo_words(self, dictionary_path='slowa.txt'):
        """Spell-correct tokens that are missing from the word list.

        Expects ``x_raw`` to hold lists of tokens (i.e. ``lemmatize_text`` has
        run). Builds ``self.dict_words_place`` (token -> list of
        ``(row label, position)``) and overwrites every occurrence of an
        unknown token with the lemma of its spell-corrected form.

        Args:
            dictionary_path: UTF-8 text file with the known words.
        """
        self.dict_words_place = collections.defaultdict(list)
        for row_label, row_list in self.x_raw.items():
            for place_in_row, word in enumerate(row_list):
                self.dict_words_place[word].append((row_label, place_in_row))

        with open(dictionary_path, encoding='utf-8') as file:
            # Whole-word lookup. A plain `word in contents` test is a substring
            # search, so any fragment of a longer entry would count as known.
            # NOTE(review): assumes entries are separated by non-word
            # characters (newlines, spaces, commas) - confirm the file format.
            known_words = set(re.findall(r'\w+', file.read()))

        for word, place_word in self.dict_words_place.items():
            if word in known_words:
                continue
            lemmas = [token.lemma_ for token in lemma_spacy(spell(word))]
            if not lemmas:
                # Nothing to substitute; never reuse the previous correction.
                continue
            correct = lemmas[-1]
            for row_label, place_in_row in place_word:
                self.x_raw.at[row_label][place_in_row] = correct

In [191]:
data = Preprocessing()

In [192]:
data.load_data()

In [193]:
data.load_data()
data.remove_quoting_comments()
data.demojize()
data.clean_text()
data.stopwords_remove()
data.lemmatize_text()
data.correct_typo_words()

In [194]:
data.x_raw

0                                         faworytem tytułu cracovia zobaczymy typ sprawdzi
1                                                              brawo daria kibic dobre złe
2              super polski premier składa kwiaty grobach kolaborantów doczekaliśmy czasów
3                                                                         innej drogi mamy
4                                                odrzut natychmiastowy kwaśna mina problem
5    fajny xdd pamiętam spóźniłam pierwsze zajęcia sporo karę kazał usiąść pierwszej ławce
6                                                                   szczęścia winking_face
7                                                           dawno kogoś wrednego widziałam
8                                          zaległości ważne wezwania zapłaty klub wywiązał
9                                                  brudziński jesteś kłamcą marnym kutasem
Name: Comment, dtype: object

In [195]:
data.lemmatize_text()

In [196]:
data.x_raw

0                                                 [faworyt, tytuł, cracovia, zobaczyć, typ, sprawdzić]
1                                                                    [brawo, daria, kibic, dobry, zły]
2                          [super, polski, premiera, składać, kwiat, grób, kolaborant, doczekać, czas]
3                                                                                  [inny, droga, mama]
4                                                      [odrzut, natychmiastowy, kwaśny, mina, problem]
5    [fajny, xdd, pamiętać, spóźniłam, pierwszy, zajęcie, sporo, kara, kazać, usiąść, pierwszy, ławka]
6                                                                            [szczęście, winking_face]
7                                                                     [dawno, ktoś, wredny, widziałam]
8                                               [zaległość, ważny, wezwanie, zapłata, kluba, wywiązać]
9                                                              [brudzińsk

In [197]:
data.correct_typo_words()

In [198]:
data.x_raw

0                                              [faworyta, tytuł, cracovia, zobaczyć, typ, sprawdzić]
1                                                                  [brawo, daria, kibic, dobry, zły]
2                        [super, polski, premiera, składać, kwiat, grób, kolaborant, doczekać, czas]
3                                                                                [inny, droga, mama]
4                                                    [odrzut, natychmiastowy, kwaśny, mina, problem]
5    [fajny, xdd, pamiętać, spóźnić, pierwszy, zajęcie, sporo, kara, kazać, usiąść, pierwszy, ławka]
6                                                                          [szczęście, winning_face]
7                                                                   [dawno, ktoś, wredny, widziałam]
8                                             [zaległość, ważny, wezwanie, zapłata, kluba, wywiązać]
9                                                             [budziński, być, kłamca, marn

In [6]:
conected_data = text_f.remove_quoting_comments(conected_data)

In [7]:
conected_data['Comment'] = conected_data['Comment'].apply(lambda x: emoji.demojize(x,  delimiters=("~~", "~~")))
# conected_data['Comment'] = text_f.preprocess_text(conected_data['Comment'])

In [120]:
# Time the cleaning + lemmatisation of the 'Comment' column.
# NOTE(review): `preprocess_text` and `lemmatize_text` are called unqualified, but the
# visible imports only bring in `text_functions as text_f` - presumably these should be
# `text_f.preprocess_text` / `text_f.lemmatize_text`; confirm before a fresh run.
start = time.time()
conected_data['Comment'] = preprocess_text(conected_data['Comment'], stopwords_remove=True)
conected_data['Comment'] = lemmatize_text(conected_data['Comment'])
end = time.time()
print(end-start)

0.17422771453857422


In [174]:
# Debug aid: print every element of every 'Comment' entry, one per line.
# When the entries are still raw strings this prints single characters.
dict_words_place = collections.defaultdict(list)
for _, row_list in conected_data['Comment'].items():
    for place_in_row, element in enumerate(row_list):
        print(element)

D
l
a
 
m
n
i
e
 
f
a
w
o
r
y
t
e
m
 
d
o
 
t
y
t
u
ł
u
 
b
ę
d
z
i
e
 
C
r
a
c
o
v
i
a
.
 
Z
o
b
a
c
z
y
m
y
,
 
c
z
y
 
t
y
p
 
s
i
ę
 
s
p
r
a
w
d
z
i
.
@
a
n
o
n
y
m
i
z
e
d
_
a
c
c
o
u
n
t
 
@
a
n
o
n
y
m
i
z
e
d
_
a
c
c
o
u
n
t
 
B
r
a
w
o
 
t
y
 
D
a
r
i
a
 
k
i
b
i
c
 
m
a
 
b
y
ć
 
n
a
 
d
o
b
r
e
 
i
 
z
ł
e
@
a
n
o
n
y
m
i
z
e
d
_
a
c
c
o
u
n
t
 
@
a
n
o
n
y
m
i
z
e
d
_
a
c
c
o
u
n
t
 
S
u
p
e
r
,
 
p
o
l
s
k
i
 
p
r
e
m
i
e
r
 
s
k
ł
a
d
a
 
k
w
i
a
t
y
 
n
a
 
g
r
o
b
a
c
h
 
k
o
l
a
b
o
r
a
n
t
ó
w
.
 
A
l
e
 
d
o
c
z
e
k
a
l
i
ś
m
y
 
c
z
a
s
ó
w
.
@
a
n
o
n
y
m
i
z
e
d
_
a
c
c
o
u
n
t
 
@
a
n
o
n
y
m
i
z
e
d
_
a
c
c
o
u
n
t
 
M
u
s
i
.
 
I
n
n
e
j
 
d
r
o
g
i
 
n
i
e
 
m
a
m
y
.
O
d
r
z
u
t
 
n
a
t
y
c
h
m
i
a
s
t
o
w
y
,
 
k
w
a
ś
n
a
 
m
i
n
a
,
 
m
a
m
 
p
r
o
b
l
e
m
J
a
k
i
 
o
n
 
b
y
ł
 
f
a
j
n
y
 
x
d
d
 
p
a
m
i
ę
t
a
m
,
 
ż
e
 
s
p
ó
ź
n
i
ł
a
m
 
s
i
ę
 
n
a
 
j
e
g
o
 
p
i
e
r
w
s
z
e
 
z
a
j
ę
c
i
a
 
i
 
t
o
 
s
p
o
r
o
 
i
 
z
a
 
k
a
r
ę
 
k
a
z
a
ł
 


In [164]:
# Index every token by where it occurs: token -> [(row label, position in row), ...].
dict_words_place = collections.defaultdict(list)
for row_label, row_list in conected_data['Comment'].items():
    for place_in_row, word in enumerate(row_list):
        dict_words_place[word].append((row_label, place_in_row))

In [165]:
dict_words_place

defaultdict(list,
            {'faworyt': [(0, 0)],
             'tytuł': [(0, 1)],
             'cracovia': [(0, 2)],
             'zobaczyć': [(0, 3)],
             'typ': [(0, 4)],
             'sprawdzić': [(0, 5)],
             'brawo': [(1, 0)],
             'daria': [(1, 1)],
             'kibic': [(1, 2)],
             'dobry': [(1, 3)],
             'zły': [(1, 4)],
             'super': [(2, 0)],
             'polski': [(2, 1)],
             'premiera': [(2, 2)],
             'składać': [(2, 3)],
             'kwiat': [(2, 4)],
             'grób': [(2, 5)],
             'kolaborant': [(2, 6)],
             'doczekać': [(2, 7)],
             'czas': [(2, 8)],
             'inny': [(3, 0)],
             'droga': [(3, 1)],
             'mama': [(3, 2)],
             'odrzut': [(4, 0)],
             'natychmiastowy': [(4, 1)],
             'kwaśny': [(4, 2)],
             'mina': [(4, 3)],
             'problem': [(4, 4)],
             'fajny': [(5, 0)],
             'xdd': [(5,

In [166]:
# Replace tokens that are missing from the word list with the lemma of their
# spell-corrected form, at every position recorded in dict_words_place.
with open('slowa.txt', encoding='utf-8') as file:
    contents = file.read()

# Whole-word lookup. `word in contents` is a substring search over the whole
# file, so any fragment of a longer entry would count as a known word.
# NOTE(review): assumes entries are separated by non-word characters
# (newlines, spaces, commas) - confirm the format of slowa.txt.
known_words = set(re.findall(r'\w+', contents))

for word, place_word in dict_words_place.items():
    if word in known_words:
        continue
    lemmas = [token.lemma_ for token in lemma_spacy(spell(word))]
    if not lemmas:
        # Nothing to substitute; never reuse the previous word's correction.
        continue
    correct = lemmas[-1]
    for row_label, place_in_row in place_word:
        conected_data.at[row_label, 'Comment'][place_in_row] = correct

In [167]:
conected_data

Unnamed: 0,Comment,Kind of offensive language
0,"[faworyta, tytuł, cracovia, zobaczyć, typ, sprawdzić]",0
1,"[brawo, daria, kibic, dobry, zły]",0
2,"[super, polski, premiera, składać, kwiat, grób, kolaborant, doczekać, czas]",0
3,"[inny, droga, mama]",0
4,"[odrzut, natychmiastowy, kwaśny, mina, problem]",0
5,"[fajny, xdd, pamiętać, spóźnić, pierwszy, zajęcie, sporo, kara, kazać, usiąść, pierwszy, ławka]",0
6,[szczęście],0
7,"[dawno, ktoś, wredny, widziałam]",0
8,"[zaległość, ważny, wezwanie, zapłata, kluba, wywiązać]",0
9,"[budziński, być, kłamca, marny, kutas]",1


In [168]:
# Rebuild the token -> [(row label, position), ...] index from the first column.
# `.iloc[:, 0]` selects that column by position explicitly; `row[0]` on a
# label-indexed row relies on a deprecated positional fallback, and
# `iterrows()` builds a throwaway Series per row.
dict_words_place = collections.defaultdict(list)
for row_label, tokens in conected_data.iloc[:, 0].items():
    for place_in_row, word in enumerate(tokens):
        dict_words_place[word].append((row_label, place_in_row))

In [169]:
dict_words_place

defaultdict(list,
            {'faworyta': [(0, 0)],
             'tytuł': [(0, 1)],
             'cracovia': [(0, 2)],
             'zobaczyć': [(0, 3)],
             'typ': [(0, 4)],
             'sprawdzić': [(0, 5)],
             'brawo': [(1, 0)],
             'daria': [(1, 1)],
             'kibic': [(1, 2)],
             'dobry': [(1, 3)],
             'zły': [(1, 4)],
             'super': [(2, 0)],
             'polski': [(2, 1)],
             'premiera': [(2, 2)],
             'składać': [(2, 3)],
             'kwiat': [(2, 4)],
             'grób': [(2, 5)],
             'kolaborant': [(2, 6)],
             'doczekać': [(2, 7)],
             'czas': [(2, 8)],
             'inny': [(3, 0)],
             'droga': [(3, 1)],
             'mama': [(3, 2)],
             'odrzut': [(4, 0)],
             'natychmiastowy': [(4, 1)],
             'kwaśny': [(4, 2)],
             'mina': [(4, 3)],
             'problem': [(4, 4)],
             'fajny': [(5, 0)],
             'xdd': [(5

In [12]:
# Timed run of the typo correction over every token in dict_words_place.
start = time.time()
with open('slowa.txt', encoding='utf-8') as file:
    contents = file.read()

# Whole-word lookup. `word in contents` is a substring search over the whole
# file, so any fragment of a longer entry would count as a known word.
# NOTE(review): assumes entries are separated by non-word characters
# (newlines, spaces, commas) - confirm the format of slowa.txt.
known_words = set(re.findall(r'\w+', contents))

for word, place_word in dict_words_place.items():
    if word in known_words:
        continue
    lemmas = [token.lemma_ for token in lemma_spacy(spell(word))]
    if not lemmas:
        # Nothing to substitute; never reuse the previous word's correction.
        continue
    correct = lemmas[-1]
    for row_label, place_in_row in place_word:
        conected_data.at[row_label, 'Comment'][place_in_row] = correct
end = time.time()
print(end-start)

1558.814565896988


In [13]:
conected_data = conected_data[['Comment', 'Kind of offensive language']]
conected_data.to_csv('data_for_model.csv', index=False)

# Model

In [None]:
data = train_validation_test_split(conected_data)

In [None]:
data

In [None]:
test = data[data['split'] =='train'].iloc[:10][['Comment', 'Kind of offensive language']]
test['Comment'] = preprocess_text(test['Comment'], stopwords_remove=True)
# test['Comment'] = test['Comment'].apply(lambda x: x.split())
test

In [None]:
test['Comment'] = lemmatize_text(test['Comment'])

In [25]:
test

NameError: name 'test' is not defined

In [None]:
# test = data[data['split'] =='train'][['Comment', 'Kind of offensive language']]
# test['Comment'] = test['Comment'].apply(lambda x: x.split())
# test

In [None]:
# test['Comment'] = test['Comment'].apply(lambda x: [token.lemma_ for token in lemma_spacy(x)])

In [19]:
conected_data = split_train_val_test()

In [24]:
conected_data['Comment'] = conected_data['Comment'].apply(lambda x: ' '.join(x))

KeyError: 'Comment'

In [None]:
data[data['split'] =='val'].iloc[:10]['Kind of offensive language']

In [None]:
# First ten rows of the train / validation splits.
train_rows = data[data['split'] == 'train'].iloc[:10]
val_rows = data[data['split'] == 'val'].iloc[:10]

# Select 'Comment' as a Series (single brackets). Iterating a one-column
# DataFrame yields its column names, so the vectorizer would be fitted on the
# single string 'Comment' instead of on the documents.
X_train = train_rows['Comment']
y_train = train_rows['Kind of offensive language']
X_test = val_rows['Comment']
y_test = val_rows['Kind of offensive language']
tfidf_vectorizer = TfidfVectorizer()

# TF-IDF feature matrix
X_train = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# First ten rows of the train / validation splits.
train_rows = data[data['split'] == 'train'].iloc[:10]
val_rows = data[data['split'] == 'val'].iloc[:10]

# Select 'Comment' as a Series (single brackets). Iterating a one-column
# DataFrame yields its column names, so the vectorizer would be fitted on the
# single string 'Comment' instead of on the documents.
X_train = train_rows['Comment']
y_train = train_rows['Kind of offensive language']
X_test = val_rows['Comment']
y_test = val_rows['Kind of offensive language']


tfidf_vectorizer = TfidfVectorizer()

# TF-IDF feature matrix: vocabulary and IDF weights come from the training
# documents only and are then reused for the validation documents.
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)


# print(tfidf_vectorizer.get_feature_names())

In [None]:
# Bag-of-words features: fit the vocabulary on the train split, reuse it for 'val'.
is_train = data['split'] == 'train'
is_val = data['split'] == 'val'

X_train = data.loc[is_train, 'Comment']
y_train = data.loc[is_train, 'Kind of offensive language']
X_test = data.loc[is_val, 'Comment']
y_test = data.loc[is_val, 'Kind of offensive language']

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
# print(vectorizer.get_feature_names())

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
result = ''
the_best_result = ''


class Modeling:
    """Train one classifier, report its metrics and plot its confusion matrices.

    Metrics of every evaluated model are collected column by column in the
    notebook-level ``result`` table via ``add_to_table``.
    """

    def __init__(self, model, X_train, X_test, title,
                 y_train=None, y_test=None):
        """Store the estimator and the data it is trained / evaluated on.

        Args:
            model: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            X_train: training feature matrix.
            X_test: evaluation feature matrix.
            title: label used in printed results, plots and the result table.
            y_train: training labels; when omitted, the notebook-level
                ``y_train`` is used (the original behaviour).
            y_test: evaluation labels; when omitted, the notebook-level
                ``y_test`` is looked up each time it is needed.
        """
        self.model = model
        self.title = title
        self.X_train = X_train
        self.y_train = (self._notebook_global('y_train')
                        if y_train is None else y_train)
        self.X_test = X_test
        self._y_test = y_test
        self.result = ''

    @staticmethod
    def _notebook_global(name):
        """Return a notebook-level variable, raising NameError when missing."""
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    def _true_labels(self):
        """Return the evaluation labels (explicit ones win over the global)."""
        explicit = getattr(self, '_y_test', None)
        if explicit is not None:
            return explicit
        return self._notebook_global('y_test')

    def fit_predict(self):
        """Fit the model, then predict labels and class-1 probabilities."""
        self.classifier = self.model
        self.classifier.fit(self.X_train, self.y_train)
        self.y_pred = self.classifier.predict(self.X_test)
        self.y_pred_proba = self.classifier.predict_proba(self.X_test)[:, 1]

        self.y_train_proba = self.classifier.predict_proba(self.X_train)[:, 1]

    def print_results(self):
        """Compute and print accuracy, weighted F-score and weighted recall."""
        y_true = self._true_labels()
        self.accuracy = round(accuracy_score(y_true, self.y_pred), 4)
        self.f1 = round(f1_score(y_true, self.y_pred, average='weighted'), 4)
        self.recall = round(
            recall_score(y_true, self.y_pred, average='weighted'), 4)

        print(f'Results for {self.title}:')
        print(f'{self.title} accuracy: {self.accuracy}')
        print(f'{self.title} f-score: {self.f1}')
        print(f'{self.title} recall: {self.recall}')

    def add_to_table(self):
        """Append this model's metrics as a new column of the global ``result``.

        Requires ``print_results`` to have run (it sets the metric attributes).
        """
        global result
        # Both branches build the same three-row column. The old "append"
        # branch also read the never-assigned ``self.log_loss`` and so failed
        # for every model after the first one.
        column = pd.DataFrame(
            {self.title: [self.accuracy, self.f1, self.recall]},
            index=['Accuracy', 'F-score', 'Recall'])
        if len(result) == 0:
            result = column
        else:
            result = pd.concat([result, column], axis=1)

    def plot_confusion_matrix(self):
        """Plot the confusion matrix with absolute counts."""
        cm = confusion_matrix(self._true_labels(), self.y_pred)
        plt.figure(figsize=(10, 10), facecolor='w')
        sns.heatmap(cm, annot=True,
                    fmt='.0f',
                    cbar=False,
                    vmax=cm.max(),
                    vmin=0, cmap='Blues')
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.title(f'Confusion matrix for {self.title}')

    def plot_confusion_matrix_percent(self):
        """Plot the confusion matrix normalised per true class (rows sum to 1)."""
        plt.figure(figsize=(10, 10), facecolor='w')
        cm = confusion_matrix(self._true_labels(), self.y_pred)
        cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        df_cm = pd.DataFrame(cm_norm)
        sns.heatmap(df_cm, annot=True, cmap="Blues", cbar=False)
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.title(f'Confusion matrix for {self.title}')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, log_loss

In [None]:
# The estimator is a random forest, so it is reported under that title.
# The variable keeps the name `log_reg` because later cells read
# `log_reg.y_pred` and call its plotting methods.
log_reg = Modeling(RandomForestClassifier(),
                   X_train, X_test,
                   'Random Forest')
log_reg.fit_predict()
log_reg.print_results()
log_reg.add_to_table()

In [None]:
log_reg.plot_confusion_matrix_percent()

In [None]:
result

In [None]:
result = ''
the_best_result = ''


class Modeling:
    """Train one classifier (optionally on resampled data) and report results.

    Metrics of every evaluated model are collected column by column in the
    notebook-level ``result`` table via ``add_to_table``.
    """

    def __init__(self, model, X_train, X_test, title, pca=False,
                 y_train=None, y_test=None):
        """Store the estimator and the data it is trained / evaluated on.

        Args:
            model: estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            X_train: training feature matrix.
            X_test: evaluation feature matrix.
            title: label used in printed results, plots and the result table.
            pca: kept as an attribute for existing callers.
            y_train: training labels; when omitted, the notebook-level
                ``y_train`` is used (the original behaviour).
            y_test: evaluation labels; when omitted, the notebook-level
                ``y_test`` is looked up each time it is needed.
        """
        self.model = model
        self.title = title
        # X_train keeps the matrix as passed in; X_sample / y_sample are what
        # the classifier is fitted on and may be replaced by `sample`.
        self.X_train = X_train
        self.X_sample = X_train
        self.y_sample = (self._notebook_global('y_train')
                         if y_train is None else y_train)
        self.X_test = X_test
        self._y_test = y_test
        self.pca = pca
        self.result = ''

    @staticmethod
    def _notebook_global(name):
        """Return a notebook-level variable, raising NameError when missing."""
        try:
            return globals()[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    def _true_labels(self):
        """Return the evaluation labels (explicit ones win over the global)."""
        explicit = getattr(self, '_y_test', None)
        if explicit is not None:
            return explicit
        return self._notebook_global('y_test')

    def sample(self, sampling):
        """Resample the training data with ``sampling.fit_resample``.

        The sampler is stored as ``self.sampler``. Assigning it to
        ``self.sample`` would overwrite this method on the instance and make a
        second call fail.
        """
        self.sampler = sampling
        self.X_sample, self.y_sample = self.sampler.fit_resample(
            self.X_sample, self.y_sample)

    def fit_predict(self):
        """Fit the model, then predict labels and class-1 probabilities."""
        self.classifier = self.model
        self.classifier.fit(self.X_sample, self.y_sample)
        self.y_pred = self.classifier.predict(self.X_test)
        self.y_pred_proba = self.classifier.predict_proba(self.X_test)[:, 1]
        # Score the training matrix handed to the constructor rather than the
        # notebook globals X_train / X_train_pca. It lives in the feature
        # space the classifier was fitted on, with or without PCA.
        self.y_train_proba = self.classifier.predict_proba(self.X_train)[:, 1]

    def print_results(self):
        """Compute and print accuracy, weighted F-score and weighted recall."""
        y_true = self._true_labels()
        # accuracy_score has no averaging mode; the stray positional
        # 'weighted' landed on its `normalize` parameter.
        self.accuracy = round(accuracy_score(y_true, self.y_pred), 4)
        self.f1 = round(f1_score(y_true, self.y_pred, average='weighted'), 4)
        self.recall = round(
            recall_score(y_true, self.y_pred, average='weighted'), 4)

        print(f'Results for {self.title}:')
        print(f'{self.title} accuracy: {self.accuracy}')
        print(f'{self.title} f-score: {self.f1}')
        print(f'{self.title} recall: {self.recall}')

    def add_to_table(self):
        """Append this model's metrics as a new column of the global ``result``.

        Requires ``print_results`` to have run (it sets the metric attributes).
        """
        global result
        column = pd.DataFrame(
            {self.title: [self.accuracy, self.f1, self.recall]},
            index=['Accuracy', 'F-score', 'Recall'])
        if len(result) == 0:
            result = column
        else:
            result = pd.concat([result, column], axis=1)

    def plot_confusion_matrix(self):
        """Plot the confusion matrix with absolute counts."""
        cm = confusion_matrix(self._true_labels(), self.y_pred)
        plt.figure(figsize=(10, 10), facecolor='w')
        sns.heatmap(cm, annot=True, fmt='.0f',
                    cbar=False, vmax=cm.max(),
                    vmin=0, cmap='Blues')
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.title(f'Confusion matrix for {self.title}')

    def plot_confusion_matrix_percent(self):
        """Plot the confusion matrix normalised per true class (rows sum to 1)."""
        plt.figure(figsize=(10, 10), facecolor='w')
        cm = confusion_matrix(self._true_labels(), self.y_pred)
        cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        df_cm = pd.DataFrame(cm_norm)
        sns.heatmap(df_cm, annot=True, cmap="Blues", cbar=False)
        plt.xlabel('Predicted label')
        plt.ylabel('True label')
        plt.title(f'Confusion matrix for {self.title}')

In [None]:
# The estimator is a random forest, so it is reported under that title.
# The variable keeps the name `log_reg` because a later cell reads
# `log_reg.y_pred`.
log_reg = Modeling(RandomForestClassifier(),
                   X_train, X_test,
                   'Random Forest')
log_reg.fit_predict()
log_reg.print_results()
log_reg.add_to_table()

In [None]:
log

In [None]:
# SMOTE for Logistic Regression

# Logistic regression fitted on SMOTE-resampled training data ('minority' strategy),
# then evaluated, plotted and added to the shared result table.
# NOTE: `SMOTE` is only imported in a later cell (`from imblearn.over_sampling import SMOTE`),
# so on a fresh top-to-bottom run this cell raises NameError.
log_reg_smote = Modeling(LogisticRegression(), 
                         X_train, X_test,
                         'Logistic Regression SMOTE')
log_reg_smote.sample(SMOTE(sampling_strategy='minority'))
log_reg_smote.fit_predict()
log_reg_smote.print_results()
log_reg_smote.plot_confusion_matrix()
log_reg_smote.add_to_table()

In [None]:
! pip install imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE for Logistic Regression

# SVC (with probability estimates) fitted on SMOTE-resampled training data.
# Reported as 'SVC SMOTE': reusing the title 'Logistic Regression SMOTE'
# would create a duplicate, mislabelled column in the result table.
# The variable name is kept as in the original cell.
log_reg_smote = Modeling(SVC(probability=True),
                         X_train, X_test,
                         'SVC SMOTE')
log_reg_smote.sample(SMOTE(sampling_strategy='minority'))
log_reg_smote.fit_predict()
log_reg_smote.print_results()
log_reg_smote.plot_confusion_matrix()
log_reg_smote.add_to_table()

In [None]:
print(classification_report(y_test,log_reg.y_pred))

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
bayes = Modeling(MultinomialNB(), 
                         X_train, X_test,
                         'Bayes')
bayes.sample(SMOTE(sampling_strategy='minority'))
bayes.fit_predict()
bayes.print_results()
bayes.plot_confusion_matrix()
bayes.add_to_table()

In [None]:
from xgboost import XGBClassifier

In [None]:
# Oversample the training data with SMOTE and wrap both splits as XGBoost DMatrix objects.
# Only the training split is resampled; the evaluation split is used as is.
# NOTE: `test` rebinds a name that an earlier cell used for a preview DataFrame.
oversample = SMOTE()
X_train_s, y_train_s = oversample.fit_resample(X_train, y_train)
train = xgboost.DMatrix(X_train_s, label=y_train_s)
test = xgboost.DMatrix(X_test, label=y_test)

In [None]:
# Train a multi-class XGBoost booster on the resampled data and report per-class metrics.
# NOTE(review): 'num_class' is hard-coded to 3 - presumably the number of distinct labels
# in y_train; confirm against the data.
param = {'max_depth':50, 'eta':0.5, 'objective': 'multi:softmax', 'num_class': 3}
epochs=100
# `epochs` is passed as the third positional argument of xgboost.train (boosting rounds).
bst = xgboost.train(param, train, epochs)
# make prediction
preds = bst.predict(test)
print(classification_report(y_test,preds))

In [None]:
from gensim.models import Word2Vec