In [3]:
import pandas as pd
import spacy
import math
import nltk
import string
import numpy as np
import seaborn as sns
#from wordcloud import WordCloud
import matplotlib.pyplot as plt

from spacy.lang.en import English
from collections import Counter
from copy import deepcopy

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

# Portuguese spaCy pipeline shared by the tokenizer defined in this notebook.
nlp = spacy.load('pt_core_news_md')

# Read the stop-word set from the loaded pipeline's language defaults. The
# original reached into `spacy.lang.pt`, which this notebook never imports
# explicitly, so the lookup only worked as a side effect of `spacy.load`.
stop_words = nlp.Defaults.stop_words
# Kept as a plain string of ASCII punctuation characters; spacy_tokenizer
# tests membership against it with `in`.
punctuations = string.punctuation

In [4]:
def spacy_tokenizer(sentence, removeTopWords, lemmatize):
    """Tokenize ``sentence`` with the module-level spaCy pipeline and clean the result.

    Parameters
    ----------
    sentence : object
        Text to tokenize; it is converted with ``str()`` before being parsed.
    removeTopWords : bool
        When true, tokens found in the module-level ``stop_words`` collection
        are dropped (compared in lower case).
    lemmatize : bool
        When true, every token is replaced by its lower-cased, stripped lemma;
        otherwise the token's own text is kept unchanged.

    Returns
    -------
    list of str
        The remaining tokens. Tokens that are empty, whitespace-only, or made
        up exclusively of characters from ``punctuations`` are never returned.
    """
    # Parse once; every later step works on plain strings so that membership
    # tests against ``stop_words`` (a collection of strings) behave the same
    # whether or not lemmatization is enabled.
    doc = nlp(str(sentence))

    if lemmatize:
        # "-PRON-" is the pronoun placeholder lemma emitted by older spaCy
        # releases; fall back to the lower-cased surface form in that case.
        words = [
            tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
            for tok in doc
        ]
    else:
        words = [tok.text for tok in doc]

    # Removing stop words
    if removeTopWords:
        words = [w for w in words if w.lower() not in stop_words]

    # Removing punctuation: keep a token only if it contains at least one
    # character that is neither whitespace nor punctuation. A plain substring
    # test against ``punctuations`` would let tokens such as "..." or "!!" through.
    return [w for w in words if any(ch not in punctuations for ch in w.strip())]

In [5]:
def removeChars(dataFrame, hashtags):
    """Clean the ``comment`` column of ``dataFrame`` in place.

    The text has @-mentions and URLs removed, hashtags removed as well when
    ``hashtags`` is truthy, runs of spaces collapsed to a single space,
    leading/trailing whitespace trimmed, and is finally lower-cased.

    Nothing is returned; the cleaned Series is written back to
    ``dataFrame.comment``.
    """
    text = dataFrame.comment

    # Patterns are blanked out in this order: mentions, URLs, then (optionally) hashtags.
    removals = [r'(@\w*)', r"http\S+"]
    if hashtags:
        removals.append(r'#\w+')
    for pattern in removals:
        text = text.str.replace(pattern, '', regex=True)

    # Collapse repeated spaces left behind by the removals, trim, and lower-case.
    text = text.str.replace(r'( +)', " ", regex=True).str.strip().str.lower()

    dataFrame.comment = text

In [None]:
# Run configuration: flags forwarded to removeChars / spacy_tokenizer below.
removeTopWords = True
hashtags = False
lemmatize = True

# Forward slashes avoid the invalid "\D" / "\d" escape sequences of a
# backslash-separated literal and resolve on Windows as well as POSIX systems.
df = pd.read_excel("../Dataset/dataset.xlsx")

# Seeded generator so the (roughly) 80/20 row split is identical on every run.
split_rng = np.random.default_rng(42)
msk = split_rng.random(len(df)) < 0.8

anger_train = df[msk].copy()
anger_test = df[~msk].copy()

print("*****************************************************************************************")
print("Run sendo executada com o dataSet 'Hate Speech Twitter annotations' que contem " + str(len(df)) + " texts")
print("Dos quais, " + str(len(anger_train)) + " São para treinamento e " + str(len(anger_test)) + " são para teste")
print("Os Argumentos dessa run são, RemoverTopWords: " + str(removeTopWords) + " Remover HashTags: " + str(hashtags) + " Lematizar: " + str(lemmatize))
print("*****************************************************************************************")

# Flag each row with the split it belongs to so both parts can be cleaned
# together and separated again later.
anger_train['is_test'] = 0
anger_test['is_test'] = 1

# Stack both splits with a fresh 0..n-1 index, keep only the columns used
# downstream, and rename them. `rename` returns a new frame, so later column
# assignments on `comp_df` do not write into a slice of another frame.
comp_df = (
    pd.concat([anger_train, anger_test], ignore_index=True)
    [['txt', 'has_anger', 'is_test']]
    .rename(columns={'txt': 'comment', 'has_anger': 'label'})
)

# Encode the target as integers: 'S' becomes 1 and missing values become 0.
# The explicit cast keeps the column from depending on pandas' dtype inference
# after `replace` (it may otherwise remain `object`), and raises immediately
# if the column holds any value other than 'S', NaN or a number.
comp_df['label'] = comp_df['label'].replace('S', 1).fillna(0).astype(int)

# Clean the raw text in place (mentions, URLs, optional hashtags, spacing, case).
removeChars(comp_df, hashtags)

# One list of tokens per comment.
comp_df['corpus'] = [spacy_tokenizer(comment, removeTopWords, lemmatize) for comment in comp_df.comment]

# Token frequencies over the whole corpus. `update` adds to the existing
# counter instead of building and merging a temporary Counter per row.
count = Counter()
for cp in comp_df.corpus:
    count.update(cp)

print("As 5 palavras mais comuns do BOW são:")
print(count.most_common(5))

# Same statistic, computed separately for each label value.
print(comp_df.label.unique())
for label in sorted(comp_df.label.unique()):
    corpusInLabel = comp_df.corpus[comp_df.label == label]

    count = Counter()
    for cp in corpusInLabel:
        count.update(cp)

    print("As 5 palavras mais comuns do Label " + str(label) + " são:")
    print(count.most_common(5))

print("As frequencias dos Labels são as seguintes:")
print(comp_df.label.value_counts())

# Turn each token list back into a single space-separated string, the input
# format expected by the vectorizer.
comp_df['corpus'] = comp_df['corpus'].map(" ".join)
print(comp_df.head())

# Split the cleaned corpus back into the train / test parts flagged earlier.
is_train = comp_df.is_test == 0
is_held_out = comp_df.is_test == 1

train_text = comp_df.corpus[is_train]
y_train = comp_df.label[is_train]
test_text = comp_df.corpus[is_held_out]
y_test = comp_df.label[is_held_out]

print("shape do treinamento")
print(train_text.shape)
print("shape do teste")
print(test_text.shape)

# Learn the unigram/bigram vocabulary from the training rows only. Fitting on
# the whole corpus would let test-set text decide which features exist
# (through `min_df`), leaking information into the evaluation.
freq_vector = CountVectorizer(min_df=2, ngram_range=(1, 2)).fit(train_text)

x_train = freq_vector.transform(train_text)
x_test = freq_vector.transform(test_text)

classifier = LogisticRegression(max_iter=500)

classifier.fit(x_train, y_train)

y_pred_train = classifier.predict(x_train)
print("FScore do classificador em cima do treinamento")
print(precision_recall_fscore_support(y_train, y_pred_train, average='macro', zero_division=1))

y_pred_test = classifier.predict(x_test)
print("FScore do classificador em cima do Teste")
print(precision_recall_fscore_support(y_test, y_pred_test, average='macro', zero_division=1))

cm = confusion_matrix(y_test, y_pred_test)
print("Matriz de confusão")
# NOTE(review): `plot_cm` is not defined in the cells visible here — confirm it
# is defined elsewhere before this cell runs on a fresh kernel.
plot_cm(cm)

# Cross-validate with the same iteration budget as the classifier reported
# above, so the score describes the same model configuration.
print("Resultado do Cross Validation")
print(cross_val_score(LogisticRegression(max_iter=500, random_state=42), x_train, y_train, cv=5, verbose=0, n_jobs=-1).mean())

# Visual separator between runs
print("--------------------------------------FIM DA EXECUCAO--------------------------------------")
for _ in range(5):
    print("")