In [1]:
from sklearn.ensemble import VotingClassifier
from tqdm.notebook import tqdm
import torch
import pandas as pd
import re
from transformers import BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset = pd.read_csv('data/train.csv')
dataset = dataset.drop(columns=['keyword', 'location'])
dataset.columns = ['id','text', 'target']

#cleaning the data for upper case, special characters, and links
def clean_text(text):
    text = text.lower()
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
    return text

X = dataset['text']
y = dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
BertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
BertModel = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)
BertModel.load_state_dict(torch.load('Trained models/BertModel.pt', map_location=torch.device(device), weights_only = True))
BertModel.to(device)
BertModel.eval()

BerTweetTokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
BerTweetModel = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels = 2)
BerTweetModel.load_state_dict(torch.load('Trained models/BerTweetModel.pt', map_location=torch.device(device), weights_only = True))
BerTweetModel.to(device)
BerTweetModel.eval()

GPT2Tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
GPT2Tokenizer.pad_token = GPT2Tokenizer.eos_token
GPT2Tokenizer.padding_side = 'left'
GPT2Model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
GPT2Model.load_state_dict(torch.load('Trained models/gpt.pt', map_location=torch.device(device), weights_only = True))
GPT2Model.to(device)
GPT2Model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use i

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [3]:
# Creating sklearn wrapper. With predict() and predict_proba(), for hard and soft voting.
class TransformersClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model, tokenizer, clean_text = True):
        self.model = model
        self.tokenizer = tokenizer
        self.clean_text = clean_text

    def fit(self, X, y):
        return self

    def predict(self, X, batch_size=1):
        predictions = []
        for i in tqdm(range(0, len(X), batch_size), desc="Predicting"):
            batch = X[i : i + batch_size]
            if self.clean_text:
                batch = batch.apply(clean_text)
            batch = batch.tolist()
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs).logits
                batch_predictions = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(batch_predictions)
        return np.array(predictions)

    def predict_proba(self, X, batch_size=1):
        probabilities = []
        for i in tqdm(range(0, len(X), batch_size), desc="Predicting Probabilities"):
            batch = X[i : i + batch_size]
            if self.clean_text:
                batch = batch.apply(clean_text)
            batch = batch.tolist()
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs).logits
                batch_probabilities = torch.softmax(outputs, dim=1).cpu().numpy()
            probabilities.extend(batch_probabilities)
        return np.array(probabilities)


# Create individual classifiers
bertweet_clf = TransformersClassifier(BerTweetModel, BerTweetTokenizer, clean_text = False)
bert_clf = TransformersClassifier(BertModel, BertTokenizer, clean_text = True)
gpt_clf = TransformersClassifier(GPT2Model, GPT2Tokenizer, clean_text = True)

# Create a VotingClassifier
ensemble_clf = VotingClassifier(
    estimators=[('bertweet', bertweet_clf), ('bert', bert_clf), ('gpt', gpt_clf)],
    voting='soft', # soft means the voting is probability-based (Using predict_proba from our TransformersClassifier)
    weights = [1, 1, 1] # Weights associated with each estimator.
)

print(f"Bertweet clean_text: {bertweet_clf.clean_text}")  # Expected: False
print(f"BERT clean_text: {bert_clf.clean_text}")  # Expected: True
print(f"GPT clean_text: {gpt_clf.clean_text}")  # Expected: True


Bertweet clean_text: False
BERT clean_text: True
GPT clean_text: True


In [4]:
# fit
bertweet_clf = bertweet_clf.fit(X_train, y_train)
bert_clf = bert_clf.fit(X_train, y_train)
gpt_clf = gpt_clf.fit(X_train, y_train)
ensemble_clf = ensemble_clf.fit(X_train, y_train)

In [None]:
# evaluating and calculate accuracy on real test data

test_data = pd.read_csv('data/test.csv')
test_true_labels = pd.read_csv('data/test_answers.csv')

# merge the two by id
test_data = test_data.merge(test_true_labels, on='id')

# smaller sample (3263 is actual size)
test_data = test_data.sample(n=3263, random_state=42)

# fit the ensemble model to the text data
bertweet_y_pred = bertweet_clf.predict(test_data['text'])
bertweet_accuracy = accuracy_score(test_data['target'], bertweet_y_pred)

bert_y_pred = bert_clf.predict(test_data['text'])
bert_accuracy = accuracy_score(test_data['target'], bert_y_pred)

gpt_y_pred = gpt_clf.predict(test_data['text'])
gpt_accuracy = accuracy_score(test_data['target'], gpt_y_pred)

ensemble_y_pred = ensemble_clf.predict(test_data['text'])
ensemble_accuracy = accuracy_score(test_data['target'], ensemble_y_pred)

print(f"BERT accuracy: {bert_accuracy}")
print(f"Bertweet Accuracy: {bertweet_accuracy}")
print(f"GPT Accuracy: {gpt_accuracy}")
print(f"Ensemble Accuracy: {ensemble_accuracy}")
print(f"Classification report: {classification_report(test_data['target'], ensemble_y_pred)}")



Predicting:   0%|          | 0/3263 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predicting:   0%|          | 0/3263 [00:00<?, ?it/s]

Predicting:   0%|          | 0/3263 [00:00<?, ?it/s]

Predicting Probabilities:   0%|          | 0/3263 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predicting Probabilities:   0%|          | 0/3263 [00:00<?, ?it/s]

Predicting Probabilities:   0%|          | 0/3263 [00:00<?, ?it/s]

BERT accuracy: 0.8332822555930126
Bertweet Accuracy: 0.8446215139442231
GPT Accuracy: 0.8158136684033098
Ensemble Accuracy: 0.8458473797119216


ValueError: Found input variables with inconsistent numbers of samples: [1523, 3263]

In [7]:
print(f"Classification report: {classification_report(test_data['target'], ensemble_y_pred)}")

Classification report:               precision    recall  f1-score   support

           0       0.83      0.93      0.87      1861
           1       0.88      0.74      0.80      1402

    accuracy                           0.85      3263
   macro avg       0.85      0.83      0.84      3263
weighted avg       0.85      0.85      0.84      3263

