In [1]:
import pandas as pd
import numpy as np
import gspread
import os
import json

In [2]:
import re

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
from imblearn.under_sampling import RandomUnderSampler

In [4]:
gc = gspread.service_account()

In [5]:
sh = gc.open("supervised_learning")
worksheet = sh.get_worksheet(0)
data = worksheet.get_all_values()[1:]
df = pd.DataFrame(data)

In [6]:
# define columns
df.columns = ['date', 'username','text', 'label_ant', 'label_pro']

In [7]:
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = text.replace('&', 'and')
    return text

In [8]:
df['text'] = df['text'].apply(preprocess_text)

undersample

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_ant'], test_size=0.3, random_state=42)

In [50]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vect = vectorizer.fit_transform(X_train)

In [51]:
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train_vect, y_train)

In [52]:
model = MultinomialNB()
model.fit(X_res, y_res)

In [53]:
X_test_vect = vectorizer.transform(X_test)
predictions = model.predict(X_test_vect)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.66      0.79        73
           1       0.56      0.97      0.71        33

    accuracy                           0.75       106
   macro avg       0.77      0.81      0.75       106
weighted avg       0.85      0.75      0.76       106



oversample

In [54]:
from imblearn.over_sampling import SMOTE

In [55]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_ant'], test_size=0.3, random_state=42)
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vect = vectorizer.fit_transform(X_train)

In [56]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_vect, y_train)

In [57]:
model = MultinomialNB()
model.fit(X_res, y_res)

In [58]:
X_test_vect = vectorizer.transform(X_test)

In [59]:
predictions = model.predict(X_test_vect)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.77      0.85        73
           1       0.64      0.91      0.75        33

    accuracy                           0.81       106
   macro avg       0.79      0.84      0.80       106
weighted avg       0.85      0.81      0.82       106



standard

In [60]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_ant'], test_size=0.3, random_state=42)

In [61]:
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB()),
])

In [62]:
pipeline.fit(X_train, y_train)

In [63]:
predictions = pipeline.predict(X_test)

In [64]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89        73
           1       0.74      0.79      0.76        33

    accuracy                           0.85       106
   macro avg       0.82      0.83      0.83       106
weighted avg       0.85      0.85      0.85       106



Try BERT

In [9]:
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import Trainer, TrainingArguments

In [29]:
from transformers import AutoModelForSequenceClassification

In [14]:
from datasets import Dataset

In [63]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [30]:
bertweet = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2) 
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
df_reduced = df[['text', 'label_ant']]

In [45]:
df_reduced['label_ant'] = df_reduced['label_ant'].astype(int) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['label_ant'] = df_reduced['label_ant'].astype(int)


In [46]:
dataset = Dataset.from_pandas(df_reduced)

In [47]:
train_test_split = dataset.train_test_split(test_size=0.2)  
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [54]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

In [55]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

In [56]:
tokenized_train_dataset = tokenized_train_dataset.rename_column("label_ant", "label")
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_test_dataset = tokenized_test_dataset.rename_column("label_ant", "label")
tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [57]:
training_args = TrainingArguments(
    output_dir="./bertweet-outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=bertweet,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [58]:
trainer.train()

  0%|          | 0/54 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.5558142066001892, 'eval_runtime': 26.5139, 'eval_samples_per_second': 2.678, 'eval_steps_per_second': 0.189, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.6158881187438965, 'eval_runtime': 25.6342, 'eval_samples_per_second': 2.77, 'eval_steps_per_second': 0.195, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.5507413148880005, 'eval_runtime': 28.8166, 'eval_samples_per_second': 2.464, 'eval_steps_per_second': 0.174, 'epoch': 3.0}
{'train_runtime': 1225.6752, 'train_samples_per_second': 0.685, 'train_steps_per_second': 0.044, 'train_loss': 0.3731341538605867, 'epoch': 3.0}


TrainOutput(global_step=54, training_loss=0.3731341538605867, metrics={'train_runtime': 1225.6752, 'train_samples_per_second': 0.685, 'train_steps_per_second': 0.044, 'train_loss': 0.3731341538605867, 'epoch': 3.0})

In [67]:
predictions = trainer.evaluate(tokenized_test_dataset)

  0%|          | 0/5 [00:00<?, ?it/s]

In [61]:
eval_results = trainer.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

In [72]:
predict_output = trainer.predict(tokenized_test_dataset)
logits = predict_output.predictions

  0%|          | 0/5 [00:00<?, ?it/s]

In [73]:
pred_labels = np.argmax(logits, axis=1)
true_labels = predict_output.label_ids
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='binary') 

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.7464788732394366
Precision: 0.6666666666666666
Recall: 0.5
F1 Score: 0.5714285714285715
