imports

In [22]:
import pandas as pd
from sklearn.calibration import cross_val_predict, LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

preparando dados

In [14]:
filePath = 'docs/base_de_treino_pre_processada.json'
test_size = 0.3

# Load the pre-processed JSON file into a DataFrame.
data = pd.read_json(filePath)

# Split the data into training and test sets (seeded so the split is repeatable).
x_train, x_test, y_train, y_test = train_test_split(
    data['TweetContent'],
    data['IsRelated'],
    test_size=test_size,
    random_state=42,
)

# Join each tweet's items back into one space-separated string, since the
# vectorizer below is fed plain strings.
# NOTE(review): presumably each 'TweetContent' entry is a list of tokens - confirm.
x_train, x_test = (
    list(map(' '.join, tweets)) for tweets in (x_train, x_test)
)

CountVectorizer para contar a frequência das palavras

In [15]:
# Build the document-term count matrix from the training tweets.
vectorize = CountVectorizer()
word_count_matrix = vectorize.fit_transform(x_train)

# Vocabulary terms and their total count summed over all training documents.
word_list = vectorize.get_feature_names_out()
count_list = word_count_matrix.toarray().sum(axis=0)

# Show the 30 most frequent terms.
word_freq = pd.DataFrame({'Freq': count_list}, index=word_list)
word_freq.sort_values(by='Freq', ascending=False).head(30)

Unnamed: 0,Freq
data,587
leak,502
facebook,484
user,209
million,84
notify,75
affected,65
number,64
phone,61
plan,59


TF-IDF

In [16]:
# Learn the IDF weights from the training counts, then re-weight those same counts.
tfidf_transformer = TfidfTransformer().fit(word_count_matrix)
x_train_tfidf = tfidf_transformer.transform(word_count_matrix)

# Dense preview of the TF-IDF matrix (NumPy elides the middle of large arrays).
print(x_train_tfidf.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


funções de validação cruzada

In [17]:
def execute_naive_bayes(x_train, y_train, cv=10):
    """Return cross-validated Multinomial Naive Bayes predictions.

    Args:
        x_train: Feature matrix handed to ``cross_val_predict``.
        y_train: Target labels aligned with ``x_train``.
        cv: Cross-validation strategy forwarded to ``cross_val_predict``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The predictions returned by ``cross_val_predict`` for ``x_train``.
    """
    # No explicit fit() on the full data: cross_val_predict clones the
    # estimator and fits a fresh copy on each fold, so a prior fit is never
    # used and only costs time.
    nb_classifier = MultinomialNB()
    return cross_val_predict(nb_classifier, x_train, y_train, cv=cv)

def execute_logistic_regression(x_train, y_train, cv=10):
    """Return cross-validated Logistic Regression predictions.

    Args:
        x_train: Feature matrix handed to ``cross_val_predict``.
        y_train: Target labels aligned with ``x_train``.
        cv: Cross-validation strategy forwarded to ``cross_val_predict``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The predictions returned by ``cross_val_predict`` for ``x_train``.
    """
    # NOTE(review): `multi_class` appears to be deprecated in recent
    # scikit-learn releases (1.5+) - confirm against the installed version
    # before removing it, because dropping it may change the fitted model.
    lr_classifier = LogisticRegression(
        solver='lbfgs',
        max_iter=200,
        random_state=42,
        multi_class='multinomial',
    )
    # No explicit fit() on the full data: cross_val_predict clones the
    # estimator and fits a fresh copy on each fold, so a prior fit is never
    # used and only costs time.
    return cross_val_predict(lr_classifier, x_train, y_train, cv=cv)

def execute_SVM(x_train, y_train, cv=10):
    """Return cross-validated linear SVM predictions.

    Args:
        x_train: Feature matrix handed to ``cross_val_predict``.
        y_train: Target labels aligned with ``x_train``.
        cv: Cross-validation strategy forwarded to ``cross_val_predict``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The predictions returned by ``cross_val_predict`` for ``x_train``.
    """
    # Seeded like the other classifiers in this notebook so repeated runs give
    # the same predictions; LinearSVC's solver can shuffle the data randomly.
    svm_classifier = LinearSVC(random_state=42)
    # No explicit fit() on the full data: cross_val_predict clones the
    # estimator and fits a fresh copy on each fold, so a prior fit is never
    # used and only costs time.
    return cross_val_predict(svm_classifier, x_train, y_train, cv=cv)

def execute_random_forest(x_train, y_train, cv=10):
    """Return cross-validated Random Forest predictions.

    Args:
        x_train: Feature matrix handed to ``cross_val_predict``.
        y_train: Target labels aligned with ``x_train``.
        cv: Cross-validation strategy forwarded to ``cross_val_predict``.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The predictions returned by ``cross_val_predict`` for ``x_train``.
    """
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    # No explicit fit() on the full data: cross_val_predict clones the
    # estimator and fits a fresh copy on each fold, so a prior fit of the
    # whole forest is never used and only costs time.
    return cross_val_predict(rf_classifier, x_train, y_train, cv=cv)

In [18]:
# Run the four cross-validated classifiers, in order, on the TF-IDF training matrix.
# NOTE(review): x_train_tfidf was fitted on the whole training set before the
# folds are created, so the IDF weights have seen every held-out fold; consider
# wrapping vectorizer + classifier in a Pipeline - confirm whether this matters here.
results_nb, results_lr, results_svm, results_rf = (
    run_model(x_train_tfidf, y_train)
    for run_model in (
        execute_naive_bayes,
        execute_logistic_regression,
        execute_SVM,
        execute_random_forest,
    )
)



métricas

In [26]:
# Collect the metrics [accuracy, precision, recall] for each model, comparing
# the training labels against that model's cross-validated predictions.
metrics_nb, metrics_lr, metrics_svm, metrics_rf = (
    [metric(y_train, predicted) for metric in (accuracy_score, precision_score, recall_score)]
    for predicted in (results_nb, results_lr, results_svm, results_rf)
)

# Print the metrics. The leading '\n' in every heading after the first keeps
# the blank line between model sections in the printed report.
for heading, (accuracy, precision, recall) in (
    ('Naive Bayes', metrics_nb),
    ('\nLogistic Regression', metrics_lr),
    ('\nSVM', metrics_svm),
    ('\nRandom Forest', metrics_rf),
):
    print(heading)
    print('Acuracia: ', accuracy)
    print('Precisao: ', precision)
    print('Revocacao: ', recall)

Naive Bayes
Acuracia:  0.7816326530612245
Precisao:  0.7824267782426778
Revocacao:  0.9920424403183024

Logistic Regression
Acuracia:  0.8204081632653061
Precisao:  0.8134490238611713
Revocacao:  0.9946949602122016

SVM
Acuracia:  0.8469387755102041
Precisao:  0.8544600938967136
Revocacao:  0.9655172413793104

Random Forest
Acuracia:  0.8326530612244898
Precisao:  0.8270509977827051
Revocacao:  0.9893899204244032


matriz de confusão

In [20]:
# Print one confusion matrix per model: actual labels ('Real') in the rows,
# cross-validated predictions ('Predito') in the columns, with row and column
# totals. The leading '\n' in every heading after the first separates sections.
for heading, predicted in (
    ('Naive Bayes', results_nb),
    ('\nLogistic Regression', results_lr),
    ('\nSVM', results_svm),
    ('\nRandom Forest', results_rf),
):
    print(heading)
    print(pd.crosstab(y_train, predicted, rownames=['Real'], colnames=['Predito'], margins=True))

Naive Bayes
Predito   0    1  All
Real                 
0         9  104  113
1         3  374  377
All      12  478  490

Logistic Regression
Predito   0    1  All
Real                 
0        27   86  113
1         2  375  377
All      29  461  490

SVM
Predito   0    1  All
Real                 
0        51   62  113
1        13  364  377
All      64  426  490

Random Forest
Predito   0    1  All
Real                 
0        35   78  113
1         4  373  377
All      39  451  490
