In [60]:
# Basic libraries
import pandas as pd

# Classification models
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Utilities and metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Preprocessing
import nltk
import re

# Download nltk resources
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rafael\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Exploração do Dataset

In [61]:
path = "../data/multilabel/"
df = pd.read_csv(path + 'mLabel_tweets.csv')
df.head()

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,@cath__kath AstraZeneca is made with the kidne...,ingredients
1,1336808189677940736t,It begins. Please find safe alternatives to th...,side-effect
2,1329488407307956231t,"@PaolaQP1231 Well, I mean congratulations Covi...",side-effect
3,1364194604459900934t,@BorisJohnson for those of us that do not wish...,mandatory
4,1375938799247765515t,She has been trying to speak out: writing lett...,side-effect rushed


## Preprocessamento do Dataset

In [62]:
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'[^\w\s]', '', text)
    
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

df['tweet'] = df['tweet'].apply(preprocess_text)
df

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,cath__kath astrazeneca made kidney cell little...,ingredients
1,1336808189677940736t,begin please find safe alternative vaccine uk ...,side-effect
2,1329488407307956231t,paolaqp1231 well mean congratulation covid19 f...,side-effect
3,1364194604459900934t,borisjohnson u wish vaccine given vaccine pass...,mandatory
4,1375938799247765515t,trying speak writing letter government speakin...,side-effect rushed
...,...,...,...
9916,1388469392866938880t,former pfizer chief scientific officer experim...,side-effect
9917,1352957607393300485t,garygilligan manufacturer saying manufacturer ...,pharma
9918,1357484621542268928t,thats complete oxfordastrazeneca vaccine swiss...,none
9919,1371121610057388037t,opinion vaccine side effect possible penicilli...,side-effect


## Divisão do Dataset em treino e teste

In [63]:
features = df['tweet']

mlb = MultiLabelBinarizer()
target = mlb.fit_transform(df['labels'].apply(lambda x: x.split(' ')))

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=421)

## Definição dos vetorizadores, modelos e parâmetros para o GridSearch

In [64]:
vectorizers = [
    TfidfVectorizer(),
    CountVectorizer()
]

param_grids = [
    {
        'vectorizer': vectorizers,
        'classifier': [OneVsRestClassifier(RandomForestClassifier())],
        'classifier__estimator__n_estimators': [100, 150, 200],
        'classifier__estimator__max_depth': [None, 10, 20]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [OneVsRestClassifier(SVC())],
        'classifier__estimator__C': [1, 10],
        'classifier__estimator__kernel': ['linear', 'rbf']
    },
    {
        'vectorizer': vectorizers,
        'classifier': [OneVsRestClassifier(MultinomialNB())],
        'classifier__estimator__alpha': [0.01, 1.0]
    },
    {
        'vectorizer': vectorizers,
        'classifier': [OneVsRestClassifier(LogisticRegression())],
        'classifier__estimator__C': [1, 10],
    },
    {
        'vectorizer': vectorizers,
        'classifier': [OneVsRestClassifier(SGDClassifier())],
        'classifier__estimator__alpha': [0.0001, 0.001]
    }
]

pipeline = Pipeline([
    ('vectorizer', 'passthrough'),
    ('classifier', 'passthrough')
])

## Execução do GridSearch

In [65]:
grid_search = GridSearchCV(pipeline, param_grids, cv=5, n_jobs=6, verbose=3, scoring='accuracy')

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 38 candidates, totalling 190 fits


## Resultados finais e matriz de confusão

In [66]:
cv_results = grid_search.cv_results_

results_df = pd.DataFrame(cv_results)

results_df.to_csv('results/sklearn_multilabel1_results.csv', sep=';', index=False)

In [67]:
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nTest set classification report with best model:")
print(classification_report(y_test, y_pred, digits=4))

Best parameters found: {'classifier': OneVsRestClassifier(estimator=SGDClassifier()), 'classifier__estimator__alpha': 0.0001, 'vectorizer': TfidfVectorizer()}
Best cross-validation score: 0.42615909491317205

Test set classification report with best model:
              precision    recall  f1-score   support

           0     0.8824    0.1546    0.2632        97
           1     0.6190    0.2826    0.3881        46
           2     0.7569    0.4970    0.6000       332
           3     0.9020    0.4894    0.6345        94
           4     0.8500    0.5215    0.6464       163
           5     1.0000    0.0278    0.0541       108
           6     0.7161    0.3978    0.5115       279
           7     0.7667    0.1756    0.2857       131
           8     1.0000    0.5882    0.7407        17
           9     0.8020    0.5418    0.6467       299
          10     0.8260    0.7213    0.7701       757
          11     0.7143    0.2290    0.3468       131

   micro avg     0.8007    0.4927    0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [68]:
import pickle
# Save the model with pickle
filename = 'models/sklearn_multilabel1_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(best_model, file)