In [64]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import os
import re

In [65]:
def read_csv_file(file_name):
    file_path = os.path.join(datasets_folder, file_name)
    if os.path.isfile(file_path):
        return pd.read_csv(file_path)
    return None

In [66]:
datasets_folder = "../datasets"
file_name = "tupi_binary.csv"
df_tupi = read_csv_file(file_name)

In [67]:
df_tupi.head()

Unnamed: 0,source,id,text,researcher,year,aggressive,hate
0,twitter,1.65848623693028e+18,@user @user @user quanto vc pagava na época da...,oliveira et al,2023,1,1
1,twitter,1.65848623777333e+18,@user os árabes já vão lhes chutar do país ??,oliveira et al,2023,1,1
2,twitter,1.65848960585394e+18,@user @user @user @user @user tem que desenhar...,oliveira et al,2023,1,1
3,twitter,1.65849012716374e+18,@user @user chola mais gado. e se não quiser p...,oliveira et al,2023,1,1
4,twitter,1.65849018793945e+18,michele micheque nao tinha cartao do bolsonaro...,oliveira et al,2023,1,1


### Pre-processing

In [68]:
# Assuming 'text_column' is the column containing the text in your dataset
text_column = 'text'

# List of terms to exclude
exclude_terms = {
    'desse', 'aí', 'n', 'https', '@', 'user', 'link', '#', '??', '!!', '_:', '.:', '!:', '? ?', '! !', '_ :', '! :', '? :', 'rt',
    'ta', 'tá', 'q', 'pq', 'ter', 'pra', 'vcs', 'todos', 'aí', 'nunca', 'fala', 'ver', 'coisa', 'desse', 'todo', 'quer', 'agora', 'faz',
    'n', 'fazer', 'ainda', 'dia', 'pode', 'tudo', 'nao', 'nada', 'vc', 'vai', 'pq', 'por que', 'porque', 'eh', 'ne', 'né', 'é', 'p',
    'la', 'lá', 'ai', 'aí', 'to', 'tô','sobre','fez','pois','onde','aqui','pro','dar','ficar','fica','d','[]'
}

import re

# Function to preprocess text by excluding terms
def preprocess_text(text):
    # Split the text into words using regular expression
    words = re.findall(r'\b\w+\b', text)

    # Exclude terms in a case-insensitive manner
    filtered_words = [word for word in words if word.lower() not in exclude_terms]

    # Join the filtered words back into a string
    filtered_text = ' '.join(filtered_words)

    return filtered_text

# Apply the preprocessing function to the 'text_column' in your DataFrame
df_tupi['text'] = df_tupi['text'].apply(preprocess_text)

### Split data

In [69]:
# Assuming your DataFrame is named 'df'
# Assuming 'aggressive' is your target variable

# Stratified sampling
train_df, test_df = train_test_split(df_tupi, test_size=0.2, stratify=df_tupi['hate'], random_state=42)

# Display the sizes of the resulting DataFrames
print(f"Original DataFrame size: {len(df_tupi)}")
print(f"Train DataFrame size: {len(train_df)}")
print(f"Test DataFrame size: {len(test_df)}")

Original DataFrame size: 43668
Train DataFrame size: 34934
Test DataFrame size: 8734


In [70]:
# Class weights
pos = len(train_df.query("hate==1"))
neg = len(train_df.query("hate==0"))
weight_for_0 = (1 / neg) * (len(train_df) / 2.0)
weight_for_1 = (1 / pos) * (len(train_df) / 2.0) * 1.15
relative_weight = {0: weight_for_0, 1: weight_for_1}

### TF-IDF

In [71]:
# Initialize the TfidfVectorizer with optional parameters
tfidf_vectorizer = TfidfVectorizer(
    lowercase=False,
    analyzer="word",
    norm="l2",
    sublinear_tf=True,
    min_df=2,
    max_features=1500, 
    ngram_range=(1, 2),)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# Optional: Convert the TF-IDF matrices to Pandas DataFrames for better understanding
tfidf_train_df = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_test_df = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Optional: Display the TF-IDF DataFrames
print("TF-IDF Training Data:")
print(tfidf_train_df.head())

print("TF-IDF Test Data:")
print(tfidf_test_df.head())

TF-IDF Training Data:
    00   03   08   10  100   11   12   13   14   15  ...  vôlei   às  água  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0  0.0   0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0  0.0   0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0  0.0   0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0  0.0   0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0  0.0   0.0   

   época  ódio  ótimo  última  último  única  único  
0    0.0   0.0    0.0     0.0     0.0    0.0    0.0  
1    0.0   0.0    0.0     0.0     0.0    0.0    0.0  
2    0.0   0.0    0.0     0.0     0.0    0.0    0.0  
3    0.0   0.0    0.0     0.0     0.0    0.0    0.0  
4    0.0   0.0    0.0     0.0     0.0    0.0    0.0  

[5 rows x 1500 columns]
TF-IDF Test Data:
    00   03   08   10  100   11   12   13   14   15  ...  vôlei   às  água  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0 

### Multi model

In [72]:
# Assuming 'aggressive' is your target variable
y_train = train_df['hate']
y_test = test_df['hate']

In [73]:
# Decision Tree
dt_model = DecisionTreeClassifier(
    random_state=42,
    class_weight=relative_weight,
    min_samples_split=5,  # Adjusted
    max_depth=15,  # Adjusted
)

dt_model.fit(X_train_tfidf, y_train)
dt_pred = dt_model.predict(X_test_tfidf)

In [74]:
# Random Forest
rf_model = RandomForestClassifier(
    random_state=42,
    class_weight=relative_weight,
    min_samples_split=2,
    n_estimators=150,  # Adjusted
    max_depth=20,  # Adjusted
    oob_score=True,
    )

rf_model.fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_test_tfidf)

In [75]:
# Linear SVC
svc_model = LinearSVC(
    penalty="l2",
    loss="squared_hinge",
    dual=True,
    tol=1e-4,
    C=0.5,  # Adjusted
    multi_class="crammer_singer",
    fit_intercept=True,
    intercept_scaling=0.8,  # Adjusted
    class_weight=relative_weight,
    random_state=42,
    max_iter=1500,  # Adjusted
)
svc_model.fit(X_train_tfidf, y_train)
svc_pred = svc_model.predict(X_test_tfidf)



In [76]:
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, y_train)
dt_pred = dt_model.predict(X_test_tfidf)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_test_tfidf)

# Linear SVC
svc_model = LinearSVC(random_state=42)
svc_model.fit(X_train_tfidf, y_train)
svc_pred = svc_model.predict(X_test_tfidf)



In [77]:
# Evaluate the models
def evaluate_model(model_name, y_true, y_pred, y_prob=None):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    
    print(f"\n{model_name} Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    if roc_auc is not None:
        print(f"ROC AUC Score: {roc_auc:.4f}")

# Decision Tree
evaluate_model("Decision Tree", y_test, dt_pred)

# Random Forest
evaluate_model("Random Forest", y_test, rf_pred)

# Linear SVC
svc_prob = svc_model.decision_function(X_test_tfidf)  # Linear SVC does not have predict_proba, but decision_function can be used for ROC AUC
evaluate_model("Linear SVC", y_test, svc_pred, svc_prob)


Decision Tree Evaluation:
Accuracy: 0.8595
Precision: 0.3972
Recall: 0.3235
F1 Score: 0.3566

Random Forest Evaluation:
Accuracy: 0.8904
Precision: 0.6451
Recall: 0.1989
F1 Score: 0.3040

Linear SVC Evaluation:
Accuracy: 0.8910
Precision: 0.6486
Recall: 0.2055
F1 Score: 0.3121
ROC AUC Score: 0.8025
