In [1]:
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [2]:
# Directory (relative path) that read_csv_file joins with a file name to locate a dataset.
datasets_folder = "../datasets"
# CSV file loaded into `df`; later cells read its "text" and "hate" columns.
file_name = "tupi_binary.csv"

In [3]:
def read_csv_file(file_name, folder=None):
    """Load a CSV file from a datasets folder into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, relative to the folder.
    folder : str or path-like, optional
        Directory to look in. When omitted, the notebook-level
        ``datasets_folder`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` when the resolved path is not an
        existing regular file.
    """
    # The global is only looked up when no folder is passed, so the function
    # can be reused (and tested) without relying on notebook state.
    base_folder = datasets_folder if folder is None else folder
    file_path = os.path.join(base_folder, file_name)
    if not os.path.isfile(file_path):
        return None
    return pd.read_csv(file_path)

df = read_csv_file(file_name)
# read_csv_file returns None when the file is missing; stop here with a clear
# message instead of failing later with an unrelated error when `df` is indexed.
if df is None:
    raise FileNotFoundError(
        f"Dataset not found: {os.path.join(datasets_folder, file_name)}"
    )

### Split train and test

In [4]:
# Split the DataFrame into training (80%) and testing (20%) sets,
# seeded for reproducibility and stratified on the "hate" label.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["hate"],
    random_state=42,
)

In [5]:
# Set target and features
target = "hate"
features = "text"

# Set train and test.
# Cast the text column to str (NaN becomes the string "nan") on *new* frames:
# assigning into the frames returned by train_test_split can trigger pandas'
# SettingWithCopyWarning, and rebinding keeps this cell safe to re-run.
df_train = df_train.assign(**{features: df_train[features].values.astype("U")})
df_test = df_test.assign(**{features: df_test[features].values.astype("U")})

X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]


# Set k-fold criteria
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)


# Class weights: n_samples / (2 * class_count), so both classes carry the same
# total weight. Counts come from `target` instead of a hard-coded column name.
pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
if pos == 0 or neg == 0:
    # Without this guard an empty class surfaces as a bare ZeroDivisionError.
    raise ValueError(
        f"Both classes must be present in the training split (pos={pos}, neg={neg})"
    )
weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
weight_for_1 = (1 / pos) * (len(df_train) / 2.0)
relative_weight = {0: weight_for_0, 1: weight_for_1}

### Vectorize

In [6]:
# Text vectorizer: TF-IDF over word unigrams and bigrams
vectorizer = TfidfVectorizer(
    # tokenisation
    analyzer="word",
    ngram_range=(1, 2),
    lowercase=False,
    # vocabulary limits
    min_df=2,
    max_features=1500,
    # weighting
    sublinear_tf=True,
    norm="l2",
)

### Model

In [8]:
# Linear SVM; per-class weights are taken from relative_weight
classifier = LinearSVC(class_weight=relative_weight, random_state=42)

In [9]:
# Chain the text vectorizer and the classifier into a single estimator
ml_pipe = Pipeline(
    steps=[
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
)

In [10]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split only
ml_pipe.fit(X_train, y_train)



### End of training

In [11]:
# Predictions on the training split itself (the same rows the pipeline was fitted on)
y_predict = ml_pipe.predict(X_train)

In [12]:
def evaluate(y_test, y_predict, y_score=None):
    """Compute binary classification metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels (of whichever split is being scored).
    y_predict : array-like
        Predicted hard labels.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g. the output of
        ``decision_function`` or positive-class probabilities. When given,
        ROC AUC is computed from these scores. When omitted, ROC AUC is
        computed from ``y_predict`` exactly as before; on hard 0/1 labels
        that value reduces to balanced accuracy rather than a ranking-based
        ROC AUC.

    Returns
    -------
    tuple of float
        ``(precision, accuracy, recall, roc_auc, f1)``, in that order.
    """
    # Fall back to the hard labels so existing two-argument calls are unchanged.
    auc_input = y_predict if y_score is None else y_score

    pre = precision_score(y_test, y_predict)
    acc = accuracy_score(y_test, y_predict)
    rec = recall_score(y_test, y_predict)
    auc = roc_auc_score(y_test, auc_input)
    f1 = f1_score(y_test, y_predict)
    return (pre, acc, rec, auc, f1)

In [13]:
# Training-split metrics, returned as (precision, accuracy, recall, roc_auc, f1)
evaluate(y_train, y_predict)

(0.34228886806206393,
 0.7930096753878743,
 0.7822465492622561,
 0.7883639358311803,
 0.47620427381383557)

In [14]:
# Overwrites y_predict with predictions for the held-out test split
y_predict = ml_pipe.predict(X_test)

In [15]:
# Test-split metrics, returned as (precision, accuracy, recall, roc_auc, f1)
evaluate(y_test, y_predict)

(0.29784537389100124,
 0.7700938859629036,
 0.6707897240723121,
 0.7272339873778194,
 0.4125219426565242)