## Requirements


#### Directory adjustment


In [1]:
from pathlib import Path
import sys
import os
# Make the parent of the current working directory the new working directory
# and expose it on the import path.
# NOTE: every execution of this cell climbs one more level, so run it only
# once per kernel session.
path = f"{Path.cwd().parent}/"
os.chdir(path)
sys.path.append(path)


#### Charts


In [2]:
from IPython.display import SVG, display
import matplotlib.pyplot as plt
from spacy import displacy
import seaborn as sns


#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold


#### Natural language processing


In [4]:
import spacy
import re 
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors


#### Models


In [5]:
# Pipe
from sklearn.pipeline import Pipeline
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


## Set and split train and test data


In [6]:
# Load the corpus CSV; the path is resolved against the current working directory
df = pd.read_csv(filepath_or_buffer='data/augmented_corpus_fortuna.csv')
# Preview the first two rows
df.head(n=2)


Unnamed: 0,text_nonstop,text_lemma,text,length_text_nonstop,length_text_lemma,length_text,label,count_word_text_nonstop,count_word_text_lemma,count_word_text,...,pron,adp,aux,cconj,num,space,intj,sym,punct,part
0,cara vive outro mundo nao mundo real refugiado...,caro viver outro mundo nao mundo real refugiad...,nomeusuario o cara vive em outro mundo nao no ...,85,82,124,1,19,19,20,...,0,0,0,0,0,0,0,0,0,0
1,incompetentes nao cuidam povo brasileiro pouco...,incompetente nao cuidar povo brasileiro pouco ...,nomeusuario estes incompetentes nao cuidam nem...,69,66,108,0,20,20,20,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Column holding the class label and column holding the text fed to the models
target = 'label'
features = 'text_nonstop'

# Separate the text from the labels.
# Missing texts are replaced with an empty string *before* the unicode cast:
# casting NaN directly would produce the literal string 'nan', which the
# vectorizers would then count as a real word.
X = df[features].fillna('').values.astype('U')
y = df[target]

# Split train and test: 20% held out, stratified on the label, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

# Set k-fold criteria: 10 shuffled folds, fixed seed
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


## BOW and TF-IDF


In [8]:
# Fit a case-preserving TF-IDF vocabulary on the full text array.
# NOTE: `X` is rebound here from the raw texts to the fitted TF-IDF matrix;
# the raw texts stay reachable through `corpus`.
vectorizer = TfidfVectorizer(lowercase=False)
corpus = X
X = vectorizer.fit_transform(raw_documents=corpus)

#### Basic structure


In [9]:
# Baseline pipeline: word-level TF-IDF (1- to 3-grams, 100 features, L2 norm)
# feeding a multinomial Naive Bayes classifier
tfidf_step = TfidfVectorizer(analyzer="word",
                             lowercase=False,
                             ngram_range=(1, 3),
                             max_features=100,
                             norm='l2')
clf = Pipeline(steps=[('tfidf', tfidf_step),
                      ('clf', MultinomialNB())])

# Fit on the training split
clf.fit(X_train, y_train)

# Show which estimator was used, then its per-class metrics on the test split
print(clf.named_steps['clf'])
baseline_report = classification_report(
    y_test, clf.predict(X_test), output_dict=True)
pd.DataFrame(baseline_report).T


MultinomialNB()


Unnamed: 0,precision,recall,f1-score,support
0,0.742227,0.952381,0.834273,777.0
1,0.729927,0.280112,0.404858,357.0
accuracy,0.740741,0.740741,0.740741,0.740741
macro avg,0.736077,0.616246,0.619566,1134.0
weighted avg,0.738355,0.740741,0.699087,1134.0


#### Model evaluation

In [19]:

# Candidate estimators, each evaluated behind the same TF-IDF front end.
# LinearSVC and SGDClassifier are seeded with random_state=42, the seed
# already used for the tree-based models, so repeated runs produce the
# same scores.
classifiers = [MultinomialNB(),
               BernoulliNB(),
               LinearSVC(random_state=42),
               LogisticRegression(penalty='l2', max_iter=200, C=1),
               SGDClassifier(loss='hinge', max_iter=200, random_state=42),
               DecisionTreeClassifier(random_state=42, class_weight={0: 1, 1: 1.5}),
               RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 1.5}),
               svm.SVC(kernel='rbf')]

# One classification-report dict per estimator, tagged with the model name
score_list = []

for clf in classifiers:
    # A fresh vectorizer per estimator so nothing fitted is shared between runs
    pipe = Pipeline([('tfidf', TfidfVectorizer(lowercase=False,
                                               analyzer="word",
                                               norm='l2',
                                               ngram_range=(1, 2),
                                               max_features=100)),
                    ('clf', clf)])

    # Train
    pipe.fit(X_train, y_train)

    # Evaluate on the held-out split
    report = classification_report(
        y_test, pipe.predict(X_test),
        output_dict=True)

    # Label with the estimator's class name. Stripping punctuation from the
    # repr would fold constructor arguments into the label
    # (e.g. "SGDClassifiermaxiter200").
    report['model'] = type(clf).__name__
    score_list.append(report)



In [20]:
# Turn each report dict into a frame and stack them with a single concat call.
# Growing a frame by concatenating inside a loop re-copies all earlier rows on
# every iteration, and it starts from an empty frame, which recent pandas
# versions warn about.
score_frames = [pd.DataFrame(score) for score in score_list]
# pd.concat raises on an empty list, so keep the empty-frame result in that case
scores_df = pd.concat(score_frames) if score_frames else pd.DataFrame()

scores_df

Unnamed: 0,0,1,accuracy,macro avg,weighted avg,model
precision,0.742258,0.744361,0.742504,0.743309,0.74292,MultinomialNB
recall,0.956242,0.277311,0.742504,0.616776,0.742504,MultinomialNB
f1-score,0.835771,0.404082,0.742504,0.619926,0.699868,MultinomialNB
support,777.0,357.0,0.742504,1134.0,1134.0,MultinomialNB
precision,0.757158,0.670157,0.742504,0.713658,0.729769,BernoulliNB
recall,0.918919,0.358543,0.742504,0.638731,0.742504,BernoulliNB
f1-score,0.830233,0.467153,0.742504,0.648693,0.71593,BernoulliNB
support,777.0,357.0,0.742504,1134.0,1134.0,BernoulliNB
precision,0.752332,0.698225,0.744268,0.725278,0.735298,LinearSVC
recall,0.934363,0.330532,0.744268,0.632448,0.744268,LinearSVC
