## Requirements


#### Directory adjustment


In [1]:
from pathlib import Path
import sys
import os

# Back to main folder
path = os.path.dirname(os.getcwd()) + "/"
os.chdir(path)
sys.path.append(path)


#### Charts


In [2]:
from IPython.display import SVG, display
import matplotlib.pyplot as plt
import seaborn as sns




#### Data Processing


In [3]:
# ETL
import numpy as np
import pandas as pd
import multiprocessing

# ML preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.feature_selection import chi2


#### Natural language processing


In [4]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import gensim


In [5]:
from src.TextVectorization import MeanEmbeddingVectorizer

# import spacy
import re

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


#### Models


In [6]:
# Pipe
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm


# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


In [7]:
from gensim.models import KeyedVectors
from gensim import models
import gensim


## Set and split train and test data


In [8]:
# Get data
df = pd.read_csv("data/corpus/augmented_corpus_fortuna.csv")
df.head(2)


Unnamed: 0,text_nonstop,text_lemma,text,length_text_nonstop,length_text_lemma,length_text,label,count_word_text_nonstop,count_word_text_lemma,count_word_text,...,pron,adp,aux,cconj,num,space,intj,sym,punct,part
0,cara vive outro mundo nao mundo real refugiado...,caro viver outro mundo nao mundo real refugiad...,nomeusuario o cara vive em outro mundo nao no ...,85,82,124,1,19,19,20,...,0,0,0,0,0,0,0,0,0,0
1,incompetentes nao cuidam povo brasileiro pouco...,incompetente nao cuidar povo brasileiro pouco ...,nomeusuario estes incompetentes nao cuidam nem...,69,66,108,0,20,20,20,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Set target and features
target = "label"
features = "text_nonstop"

# Break apart dataset
X = df[features].values.astype("U")
y = df[target]

# Split train abd test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


## Word2Vec 


### Training your own word2vec embedding model

In [10]:
# Define a corpus
corpus = X
cores = multiprocessing.cpu_count()

# Train a own word2vec model
own_model = gensim.models.Word2Vec(
    corpus,
    vector_size=50,
    window=4,
    min_count=10,
    sg=1,
    workers=cores - 1,
    batch_words=10000,
    alpha=0.1,
    min_alpha=0.0001,
    negative=20,
)

# Make embedding dictionary {token:vector}
own_w2v = dict(zip(own_model.wv.index_to_key, own_model.wv.vectors))


In [11]:
# Basic pipeline
clf_own = Pipeline(
    [
        ("vectorizer", MeanEmbeddingVectorizer(own_w2v)),
        (
            "classifier",
            DecisionTreeClassifier(random_state=42, class_weight={0: 1, 1: 1.5}),
        ),
    ]
)

# Train
clf_own.fit(X_train, y_train)

pd.DataFrame(classification_report(y_test, clf_own.predict(X_test), output_dict=True)).T


Unnamed: 0,precision,recall,f1-score,support
0,0.729585,0.701416,0.715223,777.0
1,0.400517,0.434174,0.416667,357.0
accuracy,0.617284,0.617284,0.617284,0.617284
macro avg,0.565051,0.567795,0.565945,1134.0
weighted avg,0.625989,0.617284,0.621233,1134.0


### Using a pre-trained model


In [12]:
# Load a pre-trained model
pretrained_model = models.KeyedVectors.load_word2vec_format(
    "data/pretrained-skipgram/skip_s50.txt", binary=False
)
pretrained_w2v = dict(zip(pretrained_model.index_to_key, pretrained_model.vectors))


In [13]:
# Basic pipeline
clf_pretrained = Pipeline(
    [
        ("word2vec vectorizer", MeanEmbeddingVectorizer(pretrained_w2v)),
        (
            "extra trees",
            DecisionTreeClassifier(random_state=42, class_weight={0: 1, 1: 1.5}),
        ),
    ]
)

# Train
clf_pretrained.fit(X_train, y_train)

pd.DataFrame(
    classification_report(y_test, clf_pretrained.predict(X_test), output_dict=True)
).T



Unnamed: 0,precision,recall,f1-score,support
0,0.725032,0.719434,0.722222,777.0
1,0.399449,0.406162,0.402778,357.0
accuracy,0.620811,0.620811,0.620811,0.620811
macro avg,0.562241,0.562798,0.5625,1134.0
weighted avg,0.622534,0.620811,0.621656,1134.0


In [14]:
# Load a pre-trained model
pretrained_model = models.KeyedVectors.load_word2vec_format(
    "data/pretrained-glove/glove_s300.txt", binary=False
)
pretrained_glove = dict(zip(pretrained_model.index_to_key, pretrained_model.vectors))


In [15]:
# Basic pipeline
clf_pretrained = Pipeline(
    [
        ("word2vec vectorizer", MeanEmbeddingVectorizer(pretrained_glove)),
        (
            "extra trees",
            DecisionTreeClassifier(random_state=42, class_weight={0: 1, 1: 1.5}),
        ),
    ]
)

# Train
clf_pretrained.fit(X_train, y_train)

pd.DataFrame(
    classification_report(y_test, clf_pretrained.predict(X_test), output_dict=True)
).T


Unnamed: 0,precision,recall,f1-score,support
0,0.715776,0.706564,0.71114,777.0
1,0.378747,0.389356,0.383978,357.0
accuracy,0.606702,0.606702,0.606702,0.606702
macro avg,0.547261,0.54796,0.547559,1134.0
weighted avg,0.609674,0.606702,0.608144,1134.0
