In [1]:
import gzip
import pickle

import numpy as np
import pandas as pd
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from transformers import Pipeline

In [2]:
%load_ext autoreload
%autoreload 2

#### Custom transformers and pipelines

##### This is necessary for gzip models to be loaded successfully

In [3]:
class PipelineTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipeline: Pipeline,
            fitted: bool = True
    ):
        self.pipeline = pipeline
        self.fitted = fitted
        self.output_n_ = ['']

    def fit(self, X: pd.DataFrame = None, y=None):
        if not self.fitted:
            self.pipeline.fit(X, y)
        return self

    def transform(self, X):
        try:
            prediction = self.pipeline.predict_proba(X)
            if self.pipeline[-1].classes_.shape[0] >= 3:
                prediction = prediction
                self.output_n_ = [f'_{i}' for i in range(self.pipeline[-1].classes_.shape[0])]
            else:
                prediction = prediction[:, 0]
        except AttributeError:
            prediction = self.pipeline.predict(X)
        return prediction

    def __sklearn_is_fitted__(self):
        return True

    def get_feature_names_out(self, input_features=None):
        return [type(self.pipeline[-1]).__name__ + f for f in
                self.output_n_]  #+ str(i) for i in self.pipeline[-1].classes_


class PipelinePredictor(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipelines,
            model_names: list = None,
    ):
        self.pipelines = pipelines
        if model_names is None:
            self.model_names = [type(pipe[-1]).__name__ for pipe in pipelines]
        else:
            self.model_names = model_names

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame or np.array:
        predictions = pd.DataFrame()
        for pipeline, name in zip(self.pipelines, self.model_names):
            predictions[name] = pipeline.predict(X)

        return predictions


class CustomColumnTransformer(ColumnTransformer):
    def fit(self, X=None, y=None):
        return self

    def fit_transform(self, X=None, y=None):
        return self.transform(X)

    def transform(self, X):
        output = []
        self.feature_names_out = []

        for name, transformer, _ in self.transformers:
            transformer_output = transformer.transform(X)
            output.append(pd.DataFrame(transformer_output))

            try:
                feature_names = transformer.get_feature_names_out()
            except AttributeError:
                feature_names = transformer_output.columns

            self.feature_names_out += (name + '_' + feat for feat in feature_names)

        return np.concatenate(output, axis=1)

    def get_feature_names_out(self, input_features=None):
        return self.feature_names_out


In [4]:
class BertClassifier():
    def __init__(self):
        state_dict = torch.load(
            'bert_model/best_model_state.bin'
            , map_location=torch.device('cpu')  # ran out of my limit on colab
        )

        # Initialize the PyTorch model
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=4,
            output_attentions=False,
            output_hidden_states=False,
        )
        model.load_state_dict(state_dict)

        self.model = model
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def fit(self, df: pd.DataFrame, y=None, **fit_params):
        # No training required, just for compatibility with Pipeline
        return self

    def transform(self, X: pd.DataFrame, y=None, **transform_params):
        labels = []
        for tweet in X.tolist():
            encoding = self.tokenizer.encode_plus(
                tweet,
                add_special_tokens=True,
                max_length=12,
                truncation=True,
                padding='max_length',
                return_token_type_ids=False,
                return_attention_mask=True,
                return_tensors='pt',
            )

            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']

            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

            predicted_label = torch.argmax(logits, dim=1).tolist()
            labels.append(predicted_label)
        return pd.DataFrame(labels, columns=['label'])

#### Load model

In [5]:
def read_gzip_model(file):
    with gzip.open(file, mode="rb") as f:
        return pickle.load(f)


model = read_gzip_model('stacking_pipeline_with_bert_Pipeline_2023-05-25.bin.gz')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
stacking_model = model.pipeline

In [7]:
stacking_model

#### Test on hate speech

In [49]:
# contains HATE SPEECH and OFFENSIVE language
# read with caution
tweets = [
    "Kids get set the wrong example when their mum goes out to work",
    "Get women out of the workplace",
    "Women shouldn't even pretend to like sport",
    "Only gay men cry",
    "my meal was fucking disgusting",
    "racism is fucking disgusting",
    "her personality is filthy lol, typical woman",
    "We had enough of refugees",
    "any normal person with a brain would agree that whatever comes out of your mouth is complete bullshi...",
    "refugees must leave our country",
    "it is our country not theirs",
    "biden needs to shoot himself",
    "All LGBT Community should die",
    "No for men in my life which has been miserable since a man enters it",
    "women are sexual objects",
    "Over 50 should stay at home as they take public transport and make us feel uncomfortable",
    "I'm sick of this fucking government"
]

data_sample = pd.DataFrame({'tweet': tweets})

In [45]:
data_sample['prediction'] = stacking_model.predict(data_sample)

In [48]:
data_sample

Unnamed: 0,tweet,prediction
0,Kids get set the wrong example when their mum ...,0
1,Get women out of the workplace,0
2,Women shouldn't even pretend to like sport,0
3,Only gay men cry,2
4,my meal was fucking disgusting,0
5,racism is fucking disgusting,2
6,"her personality is filthy lol, typical woman",0
7,We had enough of refugees,0
8,any normal person with a brain would agree tha...,2
9,refugees must leave our country,0


#### Load preprocessed data

In [12]:
X_test = pd.read_csv('data/data5_processed_with_bert_test.csv', index_col=0)
y_test = pd.read_csv('data/y5_processed_test.csv', index_col=0)

In [13]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,20
0,0.859649,0.9,0.567415,1.66146,0.077533,0.803768,0.118698,0.101152,0.575513,0.323335,0.940367,0.0,1.0,-0.5038,49.0,0.0,0.0,0.0,0.0,0
1,0.628969,0.26,0.584839,2.824825,0.134313,0.716365,0.149322,0.101616,0.878575,0.019809,0.898439,0.0,1.0,-0.8493,34.0,0.0,0.0,0.0,0.0,0
2,0.176637,0.31,1.047882,0.558034,0.727103,0.217169,0.055727,0.351215,0.413401,0.235384,0.898149,0.0,0.0,-0.25,9.0,0.0,0.0,0.0,0.0,0
3,0.045129,0.39,1.525594,1.066207,0.464283,0.489319,0.046398,0.683845,0.296248,0.019907,0.693979,0.0,1.0,-0.8169,27.0,0.0,0.0,0.0,0.0,0
4,0.679891,0.39,1.526451,1.17488,0.209937,0.516017,0.274046,0.433859,0.373073,0.193068,0.363039,0.0,1.0,-0.7845,51.0,0.0,0.0,0.0,0.0,0


#### Evaluate the model

In [14]:
print(classification_report(stacking_model[-1].predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.66      0.59      0.63      1828
           1       0.60      0.63      0.61      1190
           2       0.40      0.45      0.42      1012

    accuracy                           0.57      4030
   macro avg       0.55      0.56      0.55      4030
weighted avg       0.58      0.57      0.57      4030





In [33]:
# LR
print(classification_report(stacking_model[-1].estimators_[0].predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.60      0.54      0.57      1821
           1       0.56      0.52      0.54      1342
           2       0.34      0.45      0.38       867

    accuracy                           0.51      4030
   macro avg       0.50      0.50      0.50      4030
weighted avg       0.53      0.51      0.52      4030





In [34]:
# RFC
print(classification_report(stacking_model[-1].estimators_[1].predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.67      0.59      0.63      1851
           1       0.62      0.62      0.62      1242
           2       0.38      0.47      0.42       937

    accuracy                           0.57      4030
   macro avg       0.56      0.56      0.55      4030
weighted avg       0.59      0.57      0.58      4030





In [35]:
# XGB
print(classification_report(stacking_model[-1].estimators_[2].predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.65      0.59      0.62      1801
           1       0.61      0.62      0.61      1242
           2       0.39      0.46      0.42       987

    accuracy                           0.57      4030
   macro avg       0.55      0.56      0.55      4030
weighted avg       0.58      0.57      0.57      4030



In [36]:
X_train = pd.read_csv('data/data5_processed_with_bert_train.csv', index_col=0)
y_train = pd.read_csv('data/y5_processed_train.csv', index_col=0)

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
layer2_lr,0.511,0.26,0.511,0.516,0.508,0.255,0.508,0.514
layer2_rfc,0.753,0.623,0.757,0.754,0.565,0.342,0.564,0.57
layer2_xgb,0.823,0.731,0.824,0.823,0.563,0.337,0.561,0.567
layer3_lr,0.833,0.746,0.835,0.833,0.564,0.338,0.563,0.567
