In [5]:
%run tools
from tools import *
from sklearn.model_selection import train_test_split
import pandas as pd
from time import time
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [6]:
df = load_database()
bag_of_words_data, bag_of_words = bag_of_words(df['content'].values)
tf_idf_data, tf_idf = tf_idf(df['content'].values)

bag_of_words_data = bag_of_words_data.toarray()
tf_idf_data = tf_idf_data.toarray()

In [7]:
def run_naive_bayes(f_train, f_test, l_train, l_test):
    start = time()
    c = GaussianNB()
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Naive Bayes", end - start] + scores

In [8]:
def run_regression(f_train, f_test, l_train, l_test):
    start = time()
    c = LogisticRegression(solver='lbfgs')
#     c = LinearRegression()
    c.fit(f_train, l_train)
    end = time()
    scores = c.score(f_test, l_test)
    print(classification_report(l_test, c.predict(f_test)))
    return ["Linear Regression", end - start, scores, "", "", ""]

In [9]:
def run_random_forest(f_train, f_test, l_train, l_test):
    start = time()
    c = RandomForestClassifier(n_estimators=1000)
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Random Forest", end - start] + scores

In [10]:
def run_knn(f_train, f_test, l_train, l_test):
    start = time()
    c = KNeighborsClassifier(n_neighbors=30)
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["KNN", end - start] + scores

In [11]:
def run_svm(f_train, f_test, l_train, l_test):
    start = time()
    c = SVC(gamma='auto')
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["SVM", end - start] + scores

In [12]:
def run_ada_boost(f_train, f_test, l_train, l_test):
    start = time()
    c = AdaBoostClassifier(n_estimators=1000, learning_rate=0.3)
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Ada Boost", end - start] + scores

In [13]:
def run_mlp(f_train, f_test, l_train, l_test):
    start = time()
    c = MLPClassifier(hidden_layer_sizes=(256, 64, 32))
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["MLP", end - start] + scores

In [19]:
def run_decision_tree(f_train, f_test, l_train, l_test):
    start = time()
    c = DecisionTreeClassifier()
    c.fit(f_train, l_train)
    end = time()
    scores = evaluate(l_test, c.predict(f_test))
    return ["Decision Tree", end - start] + scores

In [15]:
def run_compare():
    
    header = ["Model", "Time", "Accuracy", "Precision", "Recall", "F1-Measure"]
    Y = df.label.values
    
    print("Testando Bag of Words")
    
    results = []
    X_train, X_test, Y_train, Y_test = train_test_split(bag_of_words_data, Y, test_size=0.33, random_state=42)
#     Rodar os classificadores
    results.append(run_naive_bayes(X_train, X_test, Y_train, Y_test))
    results.append(run_regression(X_train, X_test, Y_train, Y_test))
    results.append(run_random_forest(X_train, X_test, Y_train, Y_test))
    results.append(run_knn(X_train, X_test, Y_train, Y_test))
    results.append(run_svm(X_train, X_test, Y_train, Y_test))
    results.append(run_ada_boost(X_train, X_test, Y_train, Y_test))
    results.append(run_mlp(X_train, X_test, Y_train, Y_test))
    results.append(run_decision_tree(X_train, X_test, Y_train, Y_test))
    
    pd.DataFrame(results, columns=header).to_csv(path_or_buf=f"./results/result_bag_of_words.csv", index=False)
    
    print("Testando TF-IDF")
    
    results = []
    X_train, X_test, Y_train, Y_test = train_test_split(tf_idf_data, Y, test_size=0.33, random_state=42)
    # Rodar os classificadores
    results.append(run_naive_bayes(X_train, X_test, Y_train, Y_test))
    results.append(run_regression(X_train, X_test, Y_train, Y_test))
    results.append(run_random_forest(X_train, X_test, Y_train, Y_test))
    results.append(run_knn(X_train, X_test, Y_train, Y_test))
    results.append(run_svm(X_train, X_test, Y_train, Y_test))
    results.append(run_ada_boost(X_train, X_test, Y_train, Y_test))
    results.append(run_mlp(X_train, X_test, Y_train, Y_test))
    results.append(run_decision_tree(X_train, X_test, Y_train, Y_test))
    
    pd.DataFrame(results, columns=header).to_csv(path_or_buf=f"./results/result_tf_idf.csv", index=False)

In [20]:
run_compare()

Testando Bag of Words
              precision    recall  f1-score   support

           0       0.81      0.69      0.74        42
           1       0.71      0.82      0.76        39

    accuracy                           0.75        81
   macro avg       0.76      0.76      0.75        81
weighted avg       0.76      0.75      0.75        81





Testando TF-IDF
              precision    recall  f1-score   support

           0       0.67      0.48      0.56        42
           1       0.57      0.74      0.64        39

    accuracy                           0.60        81
   macro avg       0.62      0.61      0.60        81
weighted avg       0.62      0.60      0.60        81



In [21]:
pd.read_csv('./results/result_bag_of_words.csv')

Unnamed: 0,Model,Time,Accuracy,Precision,Recall,F1-Measure
0,Naive Bayes,0.018991,0.753086,0.711111,0.820513,0.761905
1,Linear Regression,0.112701,0.753086,,,
2,Random Forest,1.729755,0.814815,0.8,0.820513,0.810127
3,KNN,0.006499,0.506173,0.487179,0.487179,0.487179
4,SVM,0.164042,0.481481,0.481481,1.0,0.65
5,Ada Boost,8.99237,0.703704,0.653061,0.820513,0.727273
6,MLP,2.615867,0.777778,0.744186,0.820513,0.780488
7,Decision Tree,0.02746,0.802469,0.767442,0.846154,0.804878


In [22]:
pd.read_csv('./results/result_tf_idf.csv')

Unnamed: 0,Model,Time,Accuracy,Precision,Recall,F1-Measure
0,Naive Bayes,0.011468,0.777778,0.744186,0.820513,0.780488
1,Linear Regression,0.016986,0.604938,,,
2,Random Forest,1.995227,0.617284,0.590909,0.666667,0.626506
3,KNN,0.011142,0.506173,0.487805,0.512821,0.5
4,SVM,0.180463,0.481481,0.481481,1.0,0.65
5,Ada Boost,15.612709,0.703704,0.682927,0.717949,0.7
6,MLP,6.759238,0.777778,0.744186,0.820513,0.780488
7,Decision Tree,0.077739,0.740741,0.714286,0.769231,0.740741
