In [2]:
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [2]:
# Directory holding the CSV datasets; the relative path is resolved against
# the kernel's current working directory.
datasets_folder = "../datasets"
# Name of the CSV file (inside `datasets_folder`) that this notebook loads.
file_name = "tupi_binary.csv"

In [3]:
def read_csv_file(file_name, folder=None):
    """Load a CSV file into a DataFrame.

    Args:
        file_name: Name of the CSV file, relative to the base directory.
        folder: Optional base directory. When omitted, the module-level
            ``datasets_folder`` is used, which is the original behaviour.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when no regular file
        exists at the resolved path. Callers must handle the ``None`` case.
    """
    # Resolve the global lazily so an explicit `folder` never depends on it.
    base_dir = datasets_folder if folder is None else folder
    file_path = os.path.join(base_dir, file_name)
    if os.path.isfile(file_path):
        return pd.read_csv(file_path)
    return None

df = read_csv_file(file_name)
# Fail fast: read_csv_file returns None when the file is missing, which would
# otherwise only surface later as an opaque AttributeError on `df.head()`.
if df is None:
    raise FileNotFoundError(
        f"Dataset not found: {os.path.join(datasets_folder, file_name)}"
    )

In [4]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,source,id,text,researcher,year,aggressive,hate
0,twitter,1.65848623693028e+18,@user @user @user quanto vc pagava na época da...,oliveira et al,2023,1,1
1,twitter,1.65848623777333e+18,@user os árabes já vão lhes chutar do país ??,oliveira et al,2023,1,1
2,twitter,1.65848960585394e+18,@user @user @user @user @user tem que desenhar...,oliveira et al,2023,1,1
3,twitter,1.65849012716374e+18,@user @user chola mais gado. e se não quiser p...,oliveira et al,2023,1,1
4,twitter,1.65849018793945e+18,michele micheque nao tinha cartao do bolsonaro...,oliveira et al,2023,1,1


In [5]:
# Split the DataFrame into training and testing sets (20% held out for
# testing; fixed seed so the split is reproducible).
# NOTE(review): the split is not stratified on the label. If the classes are
# imbalanced, consider `stratify=df["hate"]` — confirm before changing, since
# it alters which rows land in each set and therefore every metric below.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Set target and features
target = "hate"
features = "text"

# Set train and test.
# Cast the text column to unicode strings (missing values become the literal
# string "nan") so the vectorizer only ever receives str input. `assign`
# builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning and keeps
# this cell safe to re-run.
df_train = df_train.assign(**{features: df_train[features].values.astype("U")})
df_test = df_test.assign(**{features: df_test[features].values.astype("U")})

X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]


# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


# Class weights: "balanced" weighting, n_samples / (2 * n_class_samples),
# with the positive class boosted by a further 15%.
# Counts are taken from `y_train` (i.e. the `target` column) rather than a
# hard-coded column name, so changing `target` keeps the weights consistent.
pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
if pos == 0 or neg == 0:
    # Without both classes the weights are undefined (division by zero).
    raise ValueError(
        f"Training split must contain both classes of '{target}' "
        f"(positives={pos}, negatives={neg})."
    )
weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
# NOTE(review): the 1.15 factor is not explained in this notebook —
# presumably hand-tuned to favour recall on the positive class; confirm.
weight_for_1 = (1 / pos) * (len(df_train) / 2.0) * 1.15
relative_weight = {0: weight_for_0, 1: weight_for_1}

In [7]:
# TF-IDF text vectorizer over word unigrams and bigrams.
vectorizer = TfidfVectorizer(
    # Tokenisation
    analyzer="word",
    ngram_range=(1, 2),
    lowercase=False,  # keep the text's casing as-is
    # Vocabulary pruning
    min_df=2,  # drop terms seen in fewer than 2 documents
    max_features=1500,  # cap the vocabulary size
    # Weighting
    sublinear_tf=True,  # use 1 + log(tf) instead of raw term frequency
    norm="l2",
)

In [13]:
# Random forest using the hand-computed class weights; out-of-bag scoring is
# enabled and the seed is fixed for reproducibility.
classifier = RandomForestClassifier(
    class_weight=relative_weight,
    min_samples_split=2,
    oob_score=True,
    random_state=42,
)

In [14]:
# Chain the vectorizer and the classifier so they are fitted and applied as a
# single estimator.
ml_pipe = Pipeline(
    steps=[
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
)

In [15]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split.
ml_pipe.fit(X_train, y_train)

### END OF TRAINING

In [26]:
# Predict on the training split itself, to measure how well the model fits
# data it has already seen.
y_predict = ml_pipe.predict(X_train)

In [27]:
def evaluate(y_test, y_predict, y_score=None):
    """Compute binary-classification metrics for a set of predictions.

    Args:
        y_test: True labels.
        y_predict: Predicted hard class labels.
        y_score: Optional continuous scores for the positive class, e.g.
            ``model.predict_proba(X)[:, 1]`` or ``decision_function`` output.
            When given, ROC AUC is computed from these scores. When omitted,
            AUC falls back to ``y_predict`` (the original behaviour); with
            hard labels the ROC curve has a single threshold, so the value
            reduces to balanced accuracy rather than a true ranking metric.

    Returns:
        Tuple ``(precision, accuracy, recall, roc_auc, f1)``.
    """
    pre = precision_score(y_test, y_predict)
    acc = accuracy_score(y_test, y_predict)
    rec = recall_score(y_test, y_predict)
    # Prefer continuous scores for AUC when the caller supplies them.
    auc = roc_auc_score(y_test, y_predict if y_score is None else y_score)
    f1 = f1_score(y_test, y_predict)
    return (pre, acc, rec, auc, f1)

In [28]:
# Training-set metrics, returned as (precision, accuracy, recall, roc_auc, f1).
evaluate(y_train, y_predict)

(0.9407270201435517,
 0.9898093547833057,
 0.9759788613980303,
 0.983829669885262,
 0.958028766800283)

In [30]:
# Predict on the held-out test split.
# NOTE: this rebinds `y_predict`, replacing the training-set predictions.
y_predict = ml_pipe.predict(X_test)

In [31]:
# Test-set metrics, returned as (precision, accuracy, recall, roc_auc, f1).
# NOTE(review): in the recorded run, recall is ~0.17 here versus ~0.98 on the
# training split — a large generalisation gap that suggests overfitting.
evaluate(y_test, y_predict)

(0.5654761904761905,
 0.8802381497595604,
 0.1743119266055046,
 0.5776059894670642,
 0.2664796633941094)