In [1]:
import gzip
import pickle
import time
import warnings

import eli5
import numpy as np
import pandas as pd
import torch
from imblearn.over_sampling import RandomOverSampler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from xgboost import XGBClassifier

from notebooks.Dzim.web_mining.bert_processor import TextCleaner, SeriesConverter # choose your path
from notebooks.Dzim.web_mining.feature_generation import FeatureGenerator # choose your path

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
filenames = [
    # data 1
    "l1_data1_lr_Pipeline_2023-05-16.bin.gz",
    "l1_data1_rfc_Pipeline_2023-05-16.bin.gz",

    # data 2 regression
    "l1_data2_ctb_reg_hate_speech_2023-05-17.bin.gz",
    "l1_data2_lgbm_reg_offensive_language_2023-05-17.bin.gz",

    # data 2 regression
    "l1_data2_lr_clf_Pipeline_2023-05-23.bin.gz",
    "l1_data2_ctb_clf_Pipeline_2023-05-23.bin.gz",

    # data 3
    "l1_data3_ctb_Pipeline_2023-05-17.bin.gz",
]

In [5]:
def read_gzip_model(file):
    with gzip.open(file, mode="rb") as f:
        return pickle.load(f)

In [7]:
models = [read_gzip_model('models/' + filename).pipeline for filename in filenames]

In [7]:
# models

In [8]:
# models[0]

In [9]:
# models[-1].predict(data_sample)

In [10]:
class PipelineTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipeline: Pipeline,
            fitted: bool = True
    ):
        self.pipeline = pipeline
        self.fitted = fitted
        self.output_n_ = ['']

    def fit(self, X: pd.DataFrame = None, y=None):
        if not self.fitted:
            self.pipeline.fit(X, y)
        return self

    def transform(self, X):
        try:
            prediction = self.pipeline.predict_proba(X)
            if self.pipeline[-1].classes_.shape[0] >= 3:
                prediction = prediction
                self.output_n_ = [f'_{i}' for i in range(self.pipeline[-1].classes_.shape[0])]
            else:
                prediction = prediction[:, 0]
        except AttributeError:
            prediction = self.pipeline.predict(X)
        return prediction

    def __sklearn_is_fitted__(self):
        return True

    def get_feature_names_out(self, input_features=None):
        return [type(self.pipeline[-1]).__name__ + f for f in
                self.output_n_]  #+ str(i) for i in self.pipeline[-1].classes_


class PipelinePredictor(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipelines,
            model_names: list = None,
    ):
        self.pipelines = pipelines
        if model_names is None:
            self.model_names = [type(pipe[-1]).__name__ for pipe in pipelines]
        else:
            self.model_names = model_names

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame or np.array:
        predictions = pd.DataFrame()
        for pipeline, name in zip(self.pipelines, self.model_names):
            predictions[name] = pipeline.predict(X)

        return predictions


class CustomColumnTransformer(ColumnTransformer):
    def fit(self, X=None, y=None):
        return self

    def fit_transform(self, X=None, y=None):
        return self.transform(X)

    def transform(self, X):
        output = []
        self.feature_names_out = []

        for name, transformer, _ in self.transformers:
            transformer_output = transformer.transform(X)
            output.append(pd.DataFrame(transformer_output))

            try:
                feature_names = transformer.get_feature_names_out()
            except AttributeError:
                feature_names = transformer_output.columns

            self.feature_names_out += (name + '_' + feat for feat in feature_names)

        return np.concatenate(output, axis=1)

    def get_feature_names_out(self, input_features=None):
        return self.feature_names_out


In [11]:


class BertClassifier():
    def __init__(self):
        state_dict = torch.load(
            'bert_model/best_model_state.bin'
            , map_location=torch.device('cpu')  # ran out of my limit on colab
        )

        # Initialize the PyTorch model
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=4,
            output_attentions=False,
            output_hidden_states=False,
        )
        model.load_state_dict(state_dict)

        self.model = model
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def fit(self, df: pd.DataFrame, y=None, **fit_params):
        # No training required, just for compatibility with Pipeline
        return self

    def transform(self, X: pd.DataFrame, y=None, **transform_params):
        labels = []
        for tweet in X.tolist():
            encoding = self.tokenizer.encode_plus(
                tweet,
                add_special_tokens=True,
                max_length=12,
                truncation=True,
                padding='max_length',
                return_token_type_ids=False,
                return_attention_mask=True,
                return_tensors='pt',
            )

            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']

            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

            predicted_label = torch.argmax(logits, dim=1).tolist()
            labels.append(predicted_label)
        return pd.DataFrame(labels, columns=['label'])

In [12]:
bert_pipeline = Pipeline([
    ("cleaning", TextCleaner()),
    ("series_converter", SeriesConverter()),
    ('bert_classifier', BertClassifier()),
])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
bert_pipeline

In [43]:
d1_lr = PipelineTransformer(models[0])
d1_rfc = PipelineTransformer(models[1])

d2_ctb_reg = PipelineTransformer(models[2])
d2_lgbm_reg = PipelineTransformer(models[3])

d2_lr_clf = PipelineTransformer(models[4])
d2_ctb_clf = PipelineTransformer(models[5])

d3_ctb = PipelineTransformer(models[6])

fg = FeatureGenerator()

# pp = PipelinePredictor(models)

In [44]:
l1 = CustomColumnTransformer(
    transformers=[
        ('d1_lr', d1_lr, ['tweet']),
        ('d1_rfc', d1_rfc, ['tweet']),

        ('d2_ctb_reg', d2_ctb_reg, ['tweet']),
        ('d2_lgbm_reg', d2_lgbm_reg, ['tweet']),

        ('d2_lr_clf', d2_lr_clf, ['tweet']),
        ('d2_ctb_clf', d2_ctb_clf, ['tweet']),

        ('d3_ctb', d3_ctb, ['tweet']),

        ('feat_gen', fg, ['tweet']),

        ('bert_clf', bert_pipeline, ['tweet']),
    ],
    remainder='drop'
)

#### Test data pipeline

In [None]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha didnt fulfill her task',
        'you are a good person',
    ],
    'label': [1, 1, 0, 0, 1, 1, 0]
})

In [45]:
l1.transform(data_sample)

array([[ 9.44993456e-01,  9.80000000e-01,  1.50488140e-01,
         9.28453823e-01,  5.50323874e-02,  4.47867006e-01,
         4.97100607e-01,  5.66561594e-02,  3.89024833e-01,
         5.54319008e-01,  9.54738592e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  5.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.29857342e-01,  0.00000000e+00,
         0.00000000e+00,  7.35100000e-01,  6.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.2

In [46]:
l1.fit_transform(data_sample)

array([[ 9.44993456e-01,  9.80000000e-01,  1.50488140e-01,
         9.28453823e-01,  5.50323874e-02,  4.47867006e-01,
         4.97100607e-01,  5.66561594e-02,  3.89024833e-01,
         5.54319008e-01,  9.54738592e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  5.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.29857342e-01,  0.00000000e+00,
         0.00000000e+00,  7.35100000e-01,  6.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.2

In [47]:
l1

In [48]:
l1.get_feature_names_out()

['d1_lr_LogisticRegression',
 'd1_rfc_RandomForestClassifier',
 'd2_ctb_reg_CatBoostRegressor',
 'd2_lgbm_reg_LGBMRegressor',
 'd2_lr_clf_LogisticRegression_0',
 'd2_lr_clf_LogisticRegression_1',
 'd2_lr_clf_LogisticRegression_2',
 'd2_ctb_clf_CatBoostClassifier_0',
 'd2_ctb_clf_CatBoostClassifier_1',
 'd2_ctb_clf_CatBoostClassifier_2',
 'd3_ctb_CatBoostClassifier',
 'feat_gen_rt_flag',
 'feat_gen_contains_profanity_words_flag',
 'feat_gen_sentiment_score_of_row',
 'feat_gen_number_of_words_in_row',
 'feat_gen_number_of_commas',
 'feat_gen_number_of_excl_points',
 'feat_gen_number_of_question_marks',
 'feat_gen_number_of_full_stops',
 'bert_clf_label']

#### Assemble 2nd and 3d layers

In [199]:
meta_classifier = LogisticRegression()

In [200]:
stacking_model = StackingClassifier(
    estimators=[
        ('logistic_regression',
         LogisticRegression(C=0.001, fit_intercept=True, max_iter=100, penalty='l2', solver='newton-cg')),
        ('random_forest',
         RandomForestClassifier(max_depth=12, min_samples_leaf=4, min_samples_split=12, n_estimators=128)),
        ('xgboost',
         XGBClassifier(learning_rate=0.1, max_depth=8, min_child_weight=8, n_estimators=128, tree_method='gpu_hist'))
    ],
    final_estimator=meta_classifier,
    cv=None  # Set your desired cross-validation strategy here
)

In [201]:
stacking_model

#### Load data

In [22]:
data = pd.read_csv('data/data5.csv', index_col=0)

In [23]:
data.head()

Unnamed: 0_level_0,Unnamed: 0,label,tokens,text
Unnamed: 0.1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1179055004553900032_twitter,0,"['i', 'dont', 'think', 'im', 'getting', 'my', ...",i dont think im getting my baby them white 9 h...
1,1179063826874032128_twitter,0,"['we', 'cannot', 'continue', 'calling', 'ourse...",we cannot continue calling ourselves feminists...
2,1178793830532956161_twitter,0,"['nawt', 'yall', 'niggers', 'ignoring', 'me']",nawt yall niggers ignoring me
3,1179088797964763136_twitter,1,"['<user>', 'i', 'am', 'bit', 'confused', 'coz'...",<user> i am bit confused coz chinese ppl can n...
4,1179085312976445440_twitter,1,"['this', 'bitch', 'in', 'whataburger', 'eating...",this bitch in whataburger eating a burger with...


In [26]:
target = 'label'

In [27]:
data.columns = ['id', 'label', 'tokens', 'tweet']

In [32]:
data_train, data_test, y_train, y_test = train_test_split(data[['tweet']], data[target], test_size=0.2, random_state=42,
                                                          stratify=data[target])

In [35]:
data_train.shape

(16118, 1)

In [None]:
X_train = l1.transform(data_train)

In [40]:
X_test = l1.transform(data_test)

In [86]:
X_train.shape

(16118, 20)

In [87]:
y_train.shape

(16118, 1)

##### Oversampling

In [89]:
oversampler = RandomOverSampler()
X_train_sampled, y_train_sampled = oversampler.fit_resample(X_train.values, y_train)

In [90]:
X_train_sampled.shape

(19566, 20)

In [92]:
y_train_sampled.shape

(16118, 1)

(19566, 1)

In [57]:
X_train.shape, X_test.shape

((16118, 20), (4030, 20))

In [43]:
y_train.value_counts()

0    6522
1    4987
2    4609
Name: label, dtype: int64

In [44]:
y_test.value_counts()

0    1631
1    1247
2    1152
Name: label, dtype: int64

#### Fit 2nd and 3d layers

In [202]:
stacking_model.fit(X_train_sampled, y_train_sampled)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [203]:
# test
prediction = stacking_model.predict(X_test)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(prediction, y_test))
    print(confusion_matrix(prediction, y_test))

              precision    recall  f1-score   support

           0       0.66      0.59      0.63      1828
           1       0.60      0.63      0.61      1190
           2       0.40      0.45      0.42      1012

    accuracy                           0.57      4030
   macro avg       0.55      0.56      0.55      4030
weighted avg       0.58      0.57      0.57      4030

[[1082  293  453]
 [ 202  747  241]
 [ 347  207  458]]




In [204]:
# train
train_prediction = stacking_model.predict(X_train)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(train_prediction, y_train))
    print(confusion_matrix(train_prediction, y_train))



              precision    recall  f1-score   support

           0       0.88      0.81      0.84      7071
           1       0.82      0.86      0.84      4753
           2       0.78      0.84      0.81      4294

    accuracy                           0.83     16118
   macro avg       0.83      0.84      0.83     16118
weighted avg       0.84      0.83      0.83     16118

[[5737  613  721]
 [ 395 4083  275]
 [ 390  291 3613]]


In [205]:
# train sampled
train_prediction = stacking_model.predict(X_train_sampled)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(train_prediction, y_train_sampled))
    print(confusion_matrix(train_prediction, y_train_sampled))

              precision    recall  f1-score   support

           0       0.88      0.79      0.83      7272
           1       0.85      0.89      0.87      6233
           2       0.82      0.88      0.85      6061

    accuracy                           0.85     19566
   macro avg       0.85      0.85      0.85     19566
weighted avg       0.85      0.85      0.85     19566

[[5737  687  848]
 [ 395 5518  320]
 [ 390  317 5354]]


In [206]:
l1.transform(data_train[:1])
eli5.explain_weights_df(stacking_model.estimators_[0], feature_names=l1.get_feature_names_out())

Unnamed: 0,target,feature,weight
0,0,<BIAS>,0.452943
1,0,bert_clf_label,0.187097
2,0,d3_ctb_CatBoostClassifier,0.085836
3,0,feat_gen_sentiment_score_of_row,0.076939
4,0,d2_ctb_clf_CatBoostClassifier_2,0.057535
5,0,d2_lr_clf_LogisticRegression_1,0.039857
6,0,d1_rfc_RandomForestClassifier,0.02734
7,0,d2_ctb_clf_CatBoostClassifier_1,0.002276
8,0,feat_gen_number_of_question_marks,-0.000374
9,0,feat_gen_number_of_words_in_row,-0.00121


In [207]:
l1.transform(data_train[:1])
eli5.explain_weights_df(stacking_model.estimators_[1], feature_names=l1.get_feature_names_out())

Unnamed: 0,feature,weight,std
0,d2_ctb_reg_CatBoostRegressor,0.136985,0.062282
1,d3_ctb_CatBoostClassifier,0.105414,0.013528
2,d2_ctb_clf_CatBoostClassifier_0,0.087115,0.049314
3,d2_lgbm_reg_LGBMRegressor,0.076957,0.020322
4,d2_ctb_clf_CatBoostClassifier_2,0.074613,0.024408
5,d2_ctb_clf_CatBoostClassifier_1,0.07101,0.023439
6,d1_rfc_RandomForestClassifier,0.068904,0.025091
7,d2_lr_clf_LogisticRegression_0,0.06174,0.025801
8,d1_lr_LogisticRegression,0.056085,0.010253
9,d2_lr_clf_LogisticRegression_2,0.054591,0.013773


In [208]:
l1.transform(data_train[:1])
eli5.explain_weights_df(stacking_model.estimators_[2], feature_names=l1.get_feature_names_out())

TypeError: 'NoneType' object is not iterable

In [209]:
eli5.explain_weights_df(stacking_model.final_estimator_, feature_names=stacking_model.get_feature_names_out())

Unnamed: 0,target,feature,weight
0,0,stackingclassifier_xgboost0,1.486672
1,0,stackingclassifier_random_forest0,1.028154
2,0,stackingclassifier_logistic_regression2,0.766307
3,0,stackingclassifier_logistic_regression1,0.123963
4,0,<BIAS>,0.030705
5,0,stackingclassifier_random_forest2,-0.467143
6,0,stackingclassifier_random_forest1,-0.538316
7,0,stackingclassifier_xgboost2,-0.583588
8,0,stackingclassifier_logistic_regression0,-0.867576
9,0,stackingclassifier_xgboost1,-0.880389


#### Unite 1st layer with 2nd and 3d layers

In [211]:
stacking_pipeline = Pipeline([
    ('l1', l1),
    ('regressor', stacking_model)
])

In [212]:
stacking_pipeline

In [215]:
dummy_median = make_pipeline(
    DummyClassifier(strategy='stratified'),
)

In [216]:
dummy_median.fit(X_train, y_train)

In [217]:
prediction = dummy_median.predict(X_test)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(prediction, y_test))
    print(confusion_matrix(prediction, y_test))

              precision    recall  f1-score   support

           0       0.40      0.40      0.40      1636
           1       0.29      0.31      0.30      1165
           2       0.31      0.29      0.30      1229

    accuracy                           0.34      4030
   macro avg       0.33      0.33      0.33      4030
weighted avg       0.34      0.34      0.34      4030

[[654 525 457]
 [468 362 335]
 [509 360 360]]


In [220]:
data_sample

Unnamed: 0,tweet,label
0,Sasha goes to Mannheim University,1
1,Danylo will create a great application,1
2,Danylo will not create a great application,0
3,Dasha didnt fulfill her task,0
4,"You suck a dick, nigga",1
5,fuck jews idk,1
6,you are a good person,0


In [221]:
stacking_pipeline.predict_proba(data_sample)

array([[0.55934579, 0.30049021, 0.140164  ],
       [0.61291008, 0.17353395, 0.21355597],
       [0.61829964, 0.20173184, 0.17996852],
       [0.55330361, 0.28830191, 0.15839448],
       [0.63120178, 0.25194996, 0.11684826],
       [0.05304254, 0.8806862 , 0.06627126],
       [0.32551952, 0.33887558, 0.3356049 ]])

In [222]:
stacking_pipeline.predict(data_sample)

array([0, 0, 0, 0, 0, 1, 1], dtype=int64)

#### Hyperparameter optimization

In [None]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [128, 256],
    'max_depth': [2, 4, 8, 12],
    'min_samples_split': [12, 16, 20],
    'min_samples_leaf': [4],
}

# Create a RandomForestClassifier object
model = RandomForestClassifier()

param_grid = {
    'max_depth': [2, 4, 8],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [128, 256],
    'min_child_weight': [1, 2, 4, 8],
    'tree_method': ['gpu_hist'],
}

model = XGBClassifier()

param_grid = {'penalty': ['l2', 'elasticnet'],
              'C': [0.001, 0.01, 0.1, 1, 100],
              'fit_intercept': [True, False],
              'solver': ['newton-cg', 'lbfgs', 'liblinear'],
              'max_iter': [100, 1000, 5000]}

model = LogisticRegression()

scorer = make_scorer(f1_score, average='weighted')

# Create a GridSearchCV object and fit the data
start = time.time()
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, n_jobs=-1, scoring=scorer)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
end = time.time()
print(f"Computation tmie: {end - start}")