# Importing libraries

In [19]:
# Basic libraries
import pandas as pd
import time

# Classification models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Utilities and metrics
from itertools import product
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, precision_score, recall_score

# Preprocessing
import nltk
import re

# Download nltk resources
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rafae\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Setting seeds

In [20]:
s1 = 418380670049
s2 = 5522499
s3 = 29114468376207731

seeds = [s1, s2, s3]

# Importing datasets

In [21]:
path = r'../data/binary/hate/'

train = pd.read_parquet(path + 'train-00000-of-00001.parquet')
test = pd.read_parquet(path + 'test-00000-of-00001.parquet')
val = pd.read_parquet(path + 'validation-00000-of-00001.parquet')

# Dataset preprocessing

In [22]:
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'[^\w\s]', '', text)
    
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

train['text'] = train['text'].apply(preprocess_text)

# Text vectorization

In [23]:
tfidf = TfidfVectorizer()
count = CountVectorizer()

vectorizers = [
    TfidfVectorizer(),
    CountVectorizer()
]

# GridSearch implementation

In [24]:
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 150, 200],
            'max_depth': [None, 10, 20]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'MultinomialNB': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.01, 1.0]
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'C': [1, 10]
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7]
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [None, 10, 20]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [100, 150, 200],
            'max_depth': [3, 5, 7]
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(),
        'params': {
            'n_estimators': [50, 100, 150]
        }
    },
    'SGD': {
        'model': SGDClassifier(),
        'params': {
            'alpha': [0.0001, 0.001]
        }
    }
}

In [42]:
results = {}

for seed in seeds:
    print(f"Processing seed: {seed}")
    for vectorizer in vectorizers:
        print(f"Processing vectorizer: {vectorizer}")
        for name, info in models.items():
            print(f"Processing model: {name}")
            model = info['model']
            param_grid = info['params']
            param_combinations = product(*param_grid.values())
            
            for combination in param_combinations:
                params = dict(zip(param_grid.keys(), combination))
                model.set_params(**params)

                print(f"Training {name} with params {params} and vectorizer {vectorizer}")
                pipeline = Pipeline([
                    ('vectorizer', vectorizer),
                    ('model', model)
                ])

                start = time.time()
                pipeline.fit(train['text'], train['label'])
                total_time = time.time() - start
                print(f"Training time: {total_time}")

                y_pred = pipeline.predict(val['text'])
                report = classification_report(val['label'], y_pred, output_dict=True)
                
                result_key = (seed, vectorizer, name, tuple(params.items()))
                results[result_key] = {
                    'params': params,
                    'training_time': total_time,
                    'classification_report': report
                }









Processing seed: 418380670049
Processing vectorizer: TfidfVectorizer()
Processing model: RandomForest
Training RandomForest with params {'n_estimators': 100, 'max_depth': None} and vectorizer TfidfVectorizer()
Training time: 16.534723043441772
Training RandomForest with params {'n_estimators': 100, 'max_depth': 10} and vectorizer TfidfVectorizer()
Training time: 0.626702070236206
Training RandomForest with params {'n_estimators': 100, 'max_depth': 20} and vectorizer TfidfVectorizer()
Training time: 1.3966021537780762
Training RandomForest with params {'n_estimators': 150, 'max_depth': None} and vectorizer TfidfVectorizer()
Training time: 23.289532899856567
Training RandomForest with params {'n_estimators': 150, 'max_depth': 10} and vectorizer TfidfVectorizer()
Training time: 0.775524377822876
Training RandomForest with params {'n_estimators': 150, 'max_depth': 20} and vectorizer TfidfVectorizer()
Training time: 1.8080103397369385
Training RandomForest with params {'n_estimators': 200, 



Training time: 1.7479774951934814
Training AdaBoost with params {'n_estimators': 100} and vectorizer TfidfVectorizer()




Training time: 2.8582088947296143
Training AdaBoost with params {'n_estimators': 150} and vectorizer TfidfVectorizer()




Training time: 4.271351099014282
Processing model: SGD
Training SGD with params {'alpha': 0.0001} and vectorizer TfidfVectorizer()
Training time: 0.17654061317443848
Training SGD with params {'alpha': 0.001} and vectorizer TfidfVectorizer()
Training time: 0.15640568733215332
Processing vectorizer: CountVectorizer()
Processing model: RandomForest
Training RandomForest with params {'n_estimators': 100, 'max_depth': None} and vectorizer CountVectorizer()
Training time: 18.72861671447754
Training RandomForest with params {'n_estimators': 100, 'max_depth': 10} and vectorizer CountVectorizer()
Training time: 0.5713338851928711
Training RandomForest with params {'n_estimators': 100, 'max_depth': 20} and vectorizer CountVectorizer()
Training time: 1.2483413219451904
Training RandomForest with params {'n_estimators': 150, 'max_depth': None} and vectorizer CountVectorizer()
Training time: 27.840571641921997
Training RandomForest with params {'n_estimators': 150, 'max_depth': 10} and vectorizer C



Training time: 0.9764914512634277
Training AdaBoost with params {'n_estimators': 100} and vectorizer CountVectorizer()




Training time: 1.800144910812378
Training AdaBoost with params {'n_estimators': 150} and vectorizer CountVectorizer()




Training time: 2.5899338722229004
Processing model: SGD
Training SGD with params {'alpha': 0.0001} and vectorizer CountVectorizer()
Training time: 0.20014691352844238
Training SGD with params {'alpha': 0.001} and vectorizer CountVectorizer()
Training time: 0.1633446216583252
Processing seed: 5522499
Processing vectorizer: TfidfVectorizer()
Processing model: RandomForest
Training RandomForest with params {'n_estimators': 100, 'max_depth': None} and vectorizer TfidfVectorizer()
Training time: 15.47559142112732
Training RandomForest with params {'n_estimators': 100, 'max_depth': 10} and vectorizer TfidfVectorizer()
Training time: 0.6708724498748779
Training RandomForest with params {'n_estimators': 100, 'max_depth': 20} and vectorizer TfidfVectorizer()
Training time: 1.391667366027832
Training RandomForest with params {'n_estimators': 150, 'max_depth': None} and vectorizer TfidfVectorizer()
Training time: 23.12375569343567
Training RandomForest with params {'n_estimators': 150, 'max_depth



Training time: 0.985710620880127
Training AdaBoost with params {'n_estimators': 100} and vectorizer TfidfVectorizer()




Training time: 2.5080273151397705
Training AdaBoost with params {'n_estimators': 150} and vectorizer TfidfVectorizer()




Training time: 4.884032487869263
Processing model: SGD
Training SGD with params {'alpha': 0.0001} and vectorizer TfidfVectorizer()
Training time: 0.1859750747680664
Training SGD with params {'alpha': 0.001} and vectorizer TfidfVectorizer()
Training time: 0.1650841236114502
Processing vectorizer: CountVectorizer()
Processing model: RandomForest
Training RandomForest with params {'n_estimators': 100, 'max_depth': None} and vectorizer CountVectorizer()
Training time: 18.931056261062622
Training RandomForest with params {'n_estimators': 100, 'max_depth': 10} and vectorizer CountVectorizer()
Training time: 0.5125863552093506
Training RandomForest with params {'n_estimators': 100, 'max_depth': 20} and vectorizer CountVectorizer()
Training time: 1.0999157428741455
Training RandomForest with params {'n_estimators': 150, 'max_depth': None} and vectorizer CountVectorizer()
Training time: 27.64098834991455
Training RandomForest with params {'n_estimators': 150, 'max_depth': 10} and vectorizer Cou



Training time: 1.1094961166381836
Training AdaBoost with params {'n_estimators': 100} and vectorizer CountVectorizer()




Training time: 1.8664708137512207
Training AdaBoost with params {'n_estimators': 150} and vectorizer CountVectorizer()




Training time: 2.9118354320526123
Processing model: SGD
Training SGD with params {'alpha': 0.0001} and vectorizer CountVectorizer()
Training time: 0.2039778232574463
Training SGD with params {'alpha': 0.001} and vectorizer CountVectorizer()
Training time: 0.16692042350769043
Processing seed: 29114468376207731
Processing vectorizer: TfidfVectorizer()
Processing model: RandomForest
Training RandomForest with params {'n_estimators': 100, 'max_depth': None} and vectorizer TfidfVectorizer()
Training time: 15.74964165687561
Training RandomForest with params {'n_estimators': 100, 'max_depth': 10} and vectorizer TfidfVectorizer()
Training time: 0.680387020111084
Training RandomForest with params {'n_estimators': 100, 'max_depth': 20} and vectorizer TfidfVectorizer()
Training time: 1.2430059909820557
Training RandomForest with params {'n_estimators': 150, 'max_depth': None} and vectorizer TfidfVectorizer()
Training time: 22.361841201782227
Training RandomForest with params {'n_estimators': 150,



Training time: 1.5221259593963623
Training AdaBoost with params {'n_estimators': 100} and vectorizer TfidfVectorizer()




Training time: 3.2236428260803223
Training AdaBoost with params {'n_estimators': 150} and vectorizer TfidfVectorizer()




Training time: 4.700650453567505
Processing model: SGD
Training SGD with params {'alpha': 0.0001} and vectorizer TfidfVectorizer()
Training time: 0.15793418884277344
Training SGD with params {'alpha': 0.001} and vectorizer TfidfVectorizer()
Training time: 0.1625232696533203
Processing vectorizer: CountVectorizer()
Processing model: RandomForest
Training RandomForest with params {'n_estimators': 100, 'max_depth': None} and vectorizer CountVectorizer()
Training time: 18.012313842773438
Training RandomForest with params {'n_estimators': 100, 'max_depth': 10} and vectorizer CountVectorizer()
Training time: 0.5814740657806396
Training RandomForest with params {'n_estimators': 100, 'max_depth': 20} and vectorizer CountVectorizer()
Training time: 1.2891592979431152
Training RandomForest with params {'n_estimators': 150, 'max_depth': None} and vectorizer CountVectorizer()
Training time: 27.93504023551941
Training RandomForest with params {'n_estimators': 150, 'max_depth': 10} and vectorizer Co



Training time: 0.964245080947876
Training AdaBoost with params {'n_estimators': 100} and vectorizer CountVectorizer()




Training time: 1.8461363315582275
Training AdaBoost with params {'n_estimators': 150} and vectorizer CountVectorizer()




Training time: 2.8862392902374268
Processing model: SGD
Training SGD with params {'alpha': 0.0001} and vectorizer CountVectorizer()
Training time: 0.20819878578186035
Training SGD with params {'alpha': 0.001} and vectorizer CountVectorizer()
Training time: 0.21209049224853516


In [47]:
# convert results to df
results

{(418380670049,
  TfidfVectorizer(),
  'RandomForest',
  (('n_estimators', 100),
   ('max_depth', None))): {'params': {'n_estimators': 100, 'max_depth': None},
  'training_time': 16.534723043441772,
  'classification_report': {'0': {'precision': 0.7223880597014926,
    'recall': 0.8446771378708552,
    'f1-score': 0.7787610619469026,
    'support': 573.0},
   '1': {'precision': 0.7303030303030303,
    'recall': 0.5644028103044496,
    'f1-score': 0.6367239101717305,
    'support': 427.0},
   'accuracy': 0.725,
   'macro avg': {'precision': 0.7263455450022615,
    'recall': 0.7045399740876523,
    'f1-score': 0.7077424860593166,
    'support': 1000.0},
   'weighted avg': {'precision': 0.7257677521483492,
    'recall': 0.725,
    'f1-score': 0.7181111981389041,
    'support': 1000.0}}},
 (418380670049,
  TfidfVectorizer(),
  'RandomForest',
  (('n_estimators', 100), ('max_depth', 10))): {'params': {'n_estimators': 100,
   'max_depth': 10},
  'training_time': 0.626702070236206,
  'classif

# Process results

In [34]:
results.items()

dict_items([((418380670049, TfidfVectorizer(), 'RandomForest', (('n_estimators', 100), ('max_depth', None))), {'params': {'n_estimators': 100, 'max_depth': None}, 'training_time': 17.593130826950073, 'classification_report': {'0': {'precision': 0.7185185185185186, 'recall': 0.8464223385689355, 'f1-score': 0.7772435897435898, 'support': 573.0}, '1': {'precision': 0.7292307692307692, 'recall': 0.5550351288056206, 'f1-score': 0.6303191489361702, 'support': 427.0}, 'accuracy': 0.722, 'macro avg': {'precision': 0.723874643874644, 'recall': 0.7007287336872781, 'f1-score': 0.70378136933988, 'support': 1000.0}, 'weighted avg': {'precision': 0.7230926495726496, 'recall': 0.722, 'f1-score': 0.7145068535188216, 'support': 1000.0}}}), ((418380670049, TfidfVectorizer(), 'RandomForest', (('n_estimators', 100), ('max_depth', 10))), {'params': {'n_estimators': 100, 'max_depth': 10}, 'training_time': 0.7226264476776123, 'classification_report': {'0': {'precision': 0.5916666666666667, 'recall': 0.991273