In [1]:
%run tools
from tools import *
from sklearn.model_selection import train_test_split
import pandas as pd
from time import time
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
df = load_database()
bag_of_words_data, bag_of_words = bag_of_words(df['content'].values)
tf_idf_data, tf_idf = tf_idf(df['content'].values)

bag_of_words_data = bag_of_words_data.toarray()
tf_idf_data = tf_idf_data.toarray()

In [31]:
def run_naive_bayes(f_train, f_test, l_train, l_test):
    start = time()
    c = GaussianNB()
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Naive Bayes", end - start] + scores

In [32]:
def run_regression_logistic(f_train, f_test, l_train, l_test):
    start = time()
    c = LogisticRegression(solver='lbfgs')
    c.fit(f_train, l_train)
    end = time()
    scores = c.score(f_test, l_test)
    print(classification_report(l_test, c.predict(f_test)))
    return ["Logistic Regression", end - start, scores, "", "", ""]

In [33]:
def run_random_forest(f_train, f_test, l_train, l_test):
    start = time()
    c = RandomForestClassifier(n_estimators=1000)
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Random Forest", end - start] + scores

In [34]:
def run_knn(f_train, f_test, l_train, l_test):
    start = time()
    c = KNeighborsClassifier(n_neighbors=30)
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["KNN", end - start] + scores

In [35]:
def run_svm(f_train, f_test, l_train, l_test):
    start = time()
    c = SVC(gamma='auto')
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["SVM", end - start] + scores

In [36]:
def run_ada_boost(f_train, f_test, l_train, l_test):
    start = time()
    c = AdaBoostClassifier(n_estimators=1000, learning_rate=0.3)
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Ada Boost", end - start] + scores

In [37]:
def run_mlp(f_train, f_test, l_train, l_test):
    start = time()
    c = MLPClassifier(hidden_layer_sizes=(256, 64, 32))
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["MLP", end - start] + scores

In [38]:
def run_decision_tree(f_train, f_test, l_train, l_test):
    start = time()
    c = DecisionTreeClassifier()
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Decision Tree", end - start] + scores

In [39]:
def run_compare():
    
    header = ["Model", "Time", "Accuracy", "Precision", "Recall", "F1-Measure"]
    Y = df.label.values
    
    print("Testando Bag of Words")
    
    results = []
    X_train, X_test, Y_train, Y_test = train_test_split(bag_of_words_data, Y, test_size=0.33, random_state=42)
#     Rodar os classificadores
    results.append(run_naive_bayes(X_train, X_test, Y_train, Y_test))
    results.append(run_regression_logistic(X_train, X_test, Y_train, Y_test))
    results.append(run_random_forest(X_train, X_test, Y_train, Y_test))
    results.append(run_knn(X_train, X_test, Y_train, Y_test))
    results.append(run_svm(X_train, X_test, Y_train, Y_test))
    results.append(run_ada_boost(X_train, X_test, Y_train, Y_test))
    results.append(run_mlp(X_train, X_test, Y_train, Y_test))
    results.append(run_decision_tree(X_train, X_test, Y_train, Y_test))
    
    pd.DataFrame(results, columns=header).to_csv(path_or_buf=f"./results/result_bag_of_words.csv", index=False)
    
    print("Testando TF-IDF")
    
    results = []
    X_train, X_test, Y_train, Y_test = train_test_split(tf_idf_data, Y, test_size=0.33, random_state=42)
    # Rodar os classificadores
    results.append(run_naive_bayes(X_train, X_test, Y_train, Y_test))
    results.append(run_regression_logistic(X_train, X_test, Y_train, Y_test))
    results.append(run_random_forest(X_train, X_test, Y_train, Y_test))
    results.append(run_knn(X_train, X_test, Y_train, Y_test))
    results.append(run_svm(X_train, X_test, Y_train, Y_test))
    results.append(run_ada_boost(X_train, X_test, Y_train, Y_test))
    results.append(run_mlp(X_train, X_test, Y_train, Y_test))
    results.append(run_decision_tree(X_train, X_test, Y_train, Y_test))
    
    pd.DataFrame(results, columns=header).to_csv(path_or_buf=f"./results/result_tf_idf.csv", index=False)
run_compare()

Testando Bag of Words
              precision    recall  f1-score   support

           0       0.82      0.86      0.84        37
           1       0.84      0.79      0.82        34

    accuracy                           0.83        71
   macro avg       0.83      0.83      0.83        71
weighted avg       0.83      0.83      0.83        71

Testando TF-IDF
              precision    recall  f1-score   support

           0       0.56      0.54      0.55        37
           1       0.51      0.53      0.52        34

    accuracy                           0.54        71
   macro avg       0.53      0.53      0.53        71
weighted avg       0.54      0.54      0.54        71



In [40]:
pd.read_csv('./results/result_bag_of_words.csv')

Unnamed: 0,Model,Time,Accuracy,Precision,Recall,F1-Measure
0,Naive Bayes,0.017659,0.788732,0.731707,0.882353,0.8
1,Logistic Regression,0.087766,0.830986,,,
2,Random Forest,1.496737,0.746479,0.673913,0.911765,0.775
3,KNN,0.006797,0.43662,0.431818,0.558824,0.487179
4,SVM,0.120642,0.478873,0.478873,1.0,0.647619
5,Ada Boost,10.510035,0.816901,0.862069,0.735294,0.793651
6,MLP,2.558299,0.746479,0.735294,0.735294,0.735294
7,Decision Tree,0.019681,0.704225,0.709677,0.647059,0.676923


In [41]:
pd.read_csv('./results/result_tf_idf.csv')

Unnamed: 0,Model,Time,Accuracy,Precision,Recall,F1-Measure
0,Naive Bayes,0.011871,0.746479,0.710526,0.794118,0.75
1,Logistic Regression,0.018484,0.535211,,,
2,Random Forest,3.687467,0.647887,0.609756,0.735294,0.666667
3,KNN,0.006927,0.549296,0.541667,0.382353,0.448276
4,SVM,0.116733,0.478873,0.478873,1.0,0.647619
5,Ada Boost,9.413323,0.788732,0.771429,0.794118,0.782609
6,MLP,2.486457,0.788732,0.756757,0.823529,0.788732
7,Decision Tree,0.025587,0.71831,0.75,0.617647,0.677419
