In [1]:
import eli5
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from notebooks.Dzim.web_mining.feature_generation import FeatureGenerator
from src.common.prediction_model.persistence import FsModelPersistence

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
%load_ext autoreload
%autoreload 2

In [66]:
filenames = [
    # data 1
    "l1_data1_lr_Pipeline_2023-05-16.bin.gz",
    "l1_data1_rfc_Pipeline_2023-05-16.bin.gz",

    # data 2 regression
    "l1_data2_ctb_reg_hate_speech_2023-05-17.bin.gz",
    "l1_data2_lgbm_reg_offensive_language_2023-05-17.bin.gz",

    # data 2 regression
    "l1_data2_lr_clf_Pipeline_2023-05-23.bin.gz",
    "l1_data2_ctb_clf_Pipeline_2023-05-23.bin.gz",

    # data 3
    "l1_data3_ctb_Pipeline_2023-05-17.bin.gz",
]

In [67]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha didnt fulfill her task',
        'You suck a dick, nigga',
        'fuck jews idk',
        'you are a good person',
    ],
    'label': [1, 1, 0, 0, 1, 1, 0]
})

In [68]:
persistor = FsModelPersistence()

In [69]:
models = [persistor.read(filename).pipeline for filename in filenames]

In [70]:
# models

In [71]:
# models[0]

In [72]:
# models[-1].predict(data_sample)

In [73]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class PipelineTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipeline: Pipeline,
            fitted: bool = True
    ):
        self.pipeline = pipeline
        self.fitted = fitted
        self.output_n_ = ['']

    def fit(self, X: pd.DataFrame = None, y=None):
        if not self.fitted:
            self.pipeline.fit(X, y)
        return self

    def transform(self, X):
        try:
            prediction = self.pipeline.predict_proba(X)
            if self.pipeline[-1].classes_.shape[0] >= 3:
                prediction = prediction
                self.output_n_ = [f'_{i}' for i in range(self.pipeline[-1].classes_.shape[0])]
            else:
                prediction = prediction[:, 0]
        except AttributeError:
            prediction = self.pipeline.predict(X)
        return prediction

    def __sklearn_is_fitted__(self):
        return True

    def get_feature_names_out(self, input_features=None):
        return [type(self.pipeline[-1]).__name__ + f for f in self.output_n_]  #+ str(i) for i in self.pipeline[-1].classes_


class PipelinePredictor(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipelines,
            model_names: list = None,
    ):
        self.pipelines = pipelines
        if model_names is None:
            self.model_names = [type(pipe[-1]).__name__ for pipe in pipelines]
        else:
            self.model_names = model_names

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame or np.array:
        predictions = pd.DataFrame()
        for pipeline, name in zip(self.pipelines, self.model_names):
            predictions[name] = pipeline.predict(X)

        return predictions


class CustomColumnTransformer(ColumnTransformer):
    def fit(self, X=None, y=None):
        return self

    def fit_transform(self, X=None, y=None):
        return self.transform(X)

    def transform(self, X):
        output = []
        self.feature_names_out = []

        for name, transformer, _ in self.transformers:
            transformer_output = transformer.transform(X)
            output.append(pd.DataFrame(transformer_output))

            try:
                feature_names = transformer.get_feature_names_out()
            except AttributeError:
                feature_names = transformer_output.columns

            self.feature_names_out += (name + '_' + feat for feat in feature_names)

        return np.concatenate(output, axis=1)

    def get_feature_names_out(self, input_features=None):
        return self.feature_names_out


In [74]:
d1_lr = PipelineTransformer(models[0])
d1_rfc = PipelineTransformer(models[1])

d2_ctb_reg = PipelineTransformer(models[2])
d2_lgbm_reg = PipelineTransformer(models[3])

d2_lr_clf = PipelineTransformer(models[4])
d2_ctb_clf = PipelineTransformer(models[5])

d3_ctb = PipelineTransformer(models[6])

fg = FeatureGenerator()

# pp = PipelinePredictor(models)

In [75]:
l1 = CustomColumnTransformer(
    transformers=[
        ('d1_lr', d1_lr, ['tweet']),
        ('d1_rfc', d1_rfc, ['tweet']),

        ('d2_ctb_reg', d2_ctb_reg, ['tweet']),
        ('d2_lgbm_reg', d2_lgbm_reg, ['tweet']),

        ('d2_lr_clf', d2_lr_clf, ['tweet']),
        ('d2_ctb_clf', d2_ctb_clf, ['tweet']),

        ('d3_ctb', d3_ctb, ['tweet']),

        ('feat_gen', fg, ['tweet']),
    ],
    remainder='drop'
)

In [13]:
l1.transform(data_sample)

array([[ 9.44993456e-01,  9.80000000e-01,  1.50488140e-01,
         9.28453823e-01,  5.50323874e-02,  4.47867006e-01,
         4.97100607e-01,  5.66561594e-02,  3.89024833e-01,
         5.54319008e-01,  9.54738592e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  5.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.29857342e-01,  0.00000000e+00,
         0.00000000e+00,  7.35100000e-01,  6.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.29857342e-01,  0.00000000e+00,
    

In [14]:
l1.fit_transform(data_sample)

array([[ 9.44993456e-01,  9.80000000e-01,  1.50488140e-01,
         9.28453823e-01,  5.50323874e-02,  4.47867006e-01,
         4.97100607e-01,  5.66561594e-02,  3.89024833e-01,
         5.54319008e-01,  9.54738592e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  5.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.29857342e-01,  0.00000000e+00,
         0.00000000e+00,  7.35100000e-01,  6.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 9.30165996e-01,  1.00000000e+00,  1.66477290e-01,
         8.74582564e-01,  2.87726475e-02,  2.55274268e-01,
         7.15953084e-01,  5.29495117e-02,  3.61351648e-01,
         5.85698840e-01,  9.29857342e-01,  0.00000000e+00,
    

In [15]:
l1

In [16]:
l1.get_feature_names_out()

['d1_lr_LogisticRegression',
 'd1_rfc_RandomForestClassifier',
 'd2_ctb_reg_CatBoostRegressor',
 'd2_lgbm_reg_LGBMRegressor',
 'd2_lr_clf_LogisticRegression_0',
 'd2_lr_clf_LogisticRegression_1',
 'd2_lr_clf_LogisticRegression_2',
 'd2_ctb_clf_CatBoostClassifier_0',
 'd2_ctb_clf_CatBoostClassifier_1',
 'd2_ctb_clf_CatBoostClassifier_2',
 'd3_ctb_CatBoostClassifier',
 'feat_gen_rt_flag',
 'feat_gen_contains_profanity_words_flag',
 'feat_gen_sentiment_score_of_row',
 'feat_gen_number_of_words_in_row',
 'feat_gen_number_of_commas',
 'feat_gen_number_of_excl_points',
 'feat_gen_number_of_question_marks',
 'feat_gen_number_of_full_stops']

In [66]:
# l1 = ColumnTransformer(
#     transformers=[
#         ('l1_p1', pp, ['tweet']),
#         # ('l1_p2', p2, ['tweet']),
#         # ('l1_p3', p3, ['tweet']),
#         # ('l1_p4', p4, ['tweet']),
#         ('l1_g1', FeatureGenerator(), ['tweet']),
#     ],
#     remainder='drop'
# )

In [17]:
meta_classifier = DecisionTreeClassifier()

In [18]:
stacking_model = StackingClassifier(
    estimators=[
        ('logistic_regression', LogisticRegression()),
        ('random_forest', RandomForestClassifier()),
        ('xgboost', XGBClassifier(tree_method='gpu_hist'))
    ],
    final_estimator=meta_classifier,
    cv=None  # Set your desired cross-validation strategy here
)

In [19]:
stacking_model

#### Training

In [22]:
data = pd.read_csv('data5.csv', index_col=0)

In [23]:
data.head()

Unnamed: 0_level_0,Unnamed: 0,label,tokens,text
Unnamed: 0.1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1179055004553900032_twitter,0,"['i', 'dont', 'think', 'im', 'getting', 'my', ...",i dont think im getting my baby them white 9 h...
1,1179063826874032128_twitter,0,"['we', 'cannot', 'continue', 'calling', 'ourse...",we cannot continue calling ourselves feminists...
2,1178793830532956161_twitter,0,"['nawt', 'yall', 'niggers', 'ignoring', 'me']",nawt yall niggers ignoring me
3,1179088797964763136_twitter,1,"['<user>', 'i', 'am', 'bit', 'confused', 'coz'...",<user> i am bit confused coz chinese ppl can n...
4,1179085312976445440_twitter,1,"['this', 'bitch', 'in', 'whataburger', 'eating...",this bitch in whataburger eating a burger with...


In [26]:
target = 'label'

In [27]:
# data.columns = ['tweet', 'label', 'label_text']
data.columns = ['id', 'label', 'tokens', 'tweet']

In [32]:
data_train, data_test, y_train, y_test = train_test_split(data[['tweet']], data[target], test_size=0.2, random_state=42, stratify=data[target])

In [35]:
data_train.shape

(16118, 1)

In [36]:
import time

In [38]:
# Fit-transform the training data
start = time.time()
X_train = l1.transform(data_train)  #.toarray()
end = time.time()
print(f"Computation time: {end - start}")

Computation time: 1201.6027507781982


In [39]:
pd.DataFrame(X_train).to_csv('data5_processed_train.csv')

In [82]:
pd.DataFrame(y_train).to_csv('y5_processed_train.csv')

In [40]:
# Transform the test data
X_test = l1.transform(data_test)

In [41]:
pd.DataFrame(X_test).to_csv('data5_processed_test.csv')

In [84]:
pd.DataFrame(y_test).to_csv('y5_processed_test.csv')

In [42]:
X_train.shape, X_test.shape

((16118, 19), (4030, 19))

In [43]:
y_train.value_counts()

0    6522
1    4987
2    4609
Name: label, dtype: int64

In [44]:
y_test.value_counts()

0    1631
1    1247
2    1152
Name: label, dtype: int64

In [45]:
stacking_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [46]:
prediction = stacking_model.predict(X_test)
import warnings
from sklearn.metrics import classification_report, confusion_matrix

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(prediction, y_test))
    print(confusion_matrix(prediction, y_test))

              precision    recall  f1-score   support

           0       0.48      0.50      0.49      1540
           1       0.49      0.48      0.48      1283
           2       0.37      0.36      0.36      1207

    accuracy                           0.45      4030
   macro avg       0.45      0.45      0.45      4030
weighted avg       0.45      0.45      0.45      4030

[[776 348 416]
 [365 611 307]
 [490 288 429]]


In [47]:
train_prediction = stacking_model.predict(X_train)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(train_prediction, y_train))
    print(confusion_matrix(train_prediction, y_train))

              precision    recall  f1-score   support

           0       0.60      0.57      0.59      6840
           1       0.61      0.64      0.62      4786
           2       0.50      0.51      0.50      4492

    accuracy                           0.58     16118
   macro avg       0.57      0.57      0.57     16118
weighted avg       0.58      0.58      0.58     16118

[[3930 1291 1619]
 [1045 3045  696]
 [1547  651 2294]]


In [48]:
l1.transform(data_train[:1])
eli5.explain_weights_df(stacking_model.estimators_[0], feature_names=l1.get_feature_names_out())

Unnamed: 0,target,feature,weight
0,0,d3_ctb_CatBoostClassifier,0.824944
1,0,d2_lr_clf_LogisticRegression_0,0.503284
2,0,d2_lr_clf_LogisticRegression_1,0.484385
3,0,<BIAS>,0.467463
4,0,d2_ctb_clf_CatBoostClassifier_1,0.243733
5,0,feat_gen_sentiment_score_of_row,0.163357
6,0,d2_ctb_clf_CatBoostClassifier_2,0.150421
7,0,d1_rfc_RandomForestClassifier,0.105718
8,0,d2_ctb_clf_CatBoostClassifier_0,0.070542
9,0,feat_gen_number_of_words_in_row,-0.001142


In [49]:
l1.transform(data_train[:1])
eli5.explain_weights_df(stacking_model.estimators_[1], feature_names=l1.get_feature_names_out())

Unnamed: 0,feature,weight,std
0,d2_ctb_reg_CatBoostRegressor,0.101955,0.022935
1,d3_ctb_CatBoostClassifier,0.095022,0.005776
2,d2_lgbm_reg_LGBMRegressor,0.078517,0.010335
3,d2_ctb_clf_CatBoostClassifier_0,0.075163,0.017128
4,d1_lr_LogisticRegression,0.074844,0.004811
5,d1_rfc_RandomForestClassifier,0.071972,0.009527
6,d2_ctb_clf_CatBoostClassifier_2,0.071823,0.013186
7,d2_lr_clf_LogisticRegression_0,0.071465,0.009053
8,d2_ctb_clf_CatBoostClassifier_1,0.071368,0.010077
9,d2_lr_clf_LogisticRegression_2,0.068695,0.005379


In [50]:
eli5.explain_weights_df(stacking_model.final_estimator_, feature_names=stacking_model.get_feature_names_out())

Unnamed: 0,feature,weight
0,stackingclassifier_random_forest1,0.203756
1,stackingclassifier_xgboost1,0.114035
2,stackingclassifier_logistic_regression2,0.111675
3,stackingclassifier_xgboost0,0.10728
4,stackingclassifier_logistic_regression0,0.102618
5,stackingclassifier_xgboost2,0.101834
6,stackingclassifier_logistic_regression1,0.101624
7,stackingclassifier_random_forest0,0.08905
8,stackingclassifier_random_forest2,0.068126


In [51]:
stacking_model.final_estimator_

In [55]:
stacking_pipeline = Pipeline([
    ('l1', l1),
    ('regressor', stacking_model)
])

In [56]:
stacking_pipeline

In [64]:
from src.common.prediction_model.persistence import FsModelPersistence
from src.common.prediction_model.prediction_model import ModelContainer, ModelMetadata

container = ModelContainer.create(
    pipeline_name='stacking_pipeline',
    pipeline=stacking_pipeline,
    feature_names=[],
    metadata=ModelMetadata(
        model_name=str(stacking_pipeline.__class__.__name__),
    )
)
# %%

In [65]:
persistor = FsModelPersistence()
name = persistor.save(container)

In [78]:
m = persistor.read('stacking_pipeline_Pipeline_2023-05-23.bin.gz')

In [81]:
m.pipeline.predict(data_sample)

array([2, 2, 2, 1, 1, 1, 0], dtype=int64)

In [52]:
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import make_pipeline

dummy_median = make_pipeline(
    DummyClassifier(strategy='stratified'),
)

In [53]:
dummy_median.fit(X_train, y_train)

In [54]:
prediction = dummy_median.predict(X_test)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(prediction, y_test))
    print(confusion_matrix(prediction, y_test))

              precision    recall  f1-score   support

           0       0.38      0.40      0.39      1577
           1       0.32      0.30      0.31      1304
           2       0.28      0.29      0.29      1149

    accuracy                           0.33      4030
   macro avg       0.33      0.33      0.33      4030
weighted avg       0.33      0.33      0.33      4030

[[624 493 460]
 [545 395 364]
 [462 359 328]]


In [114]:
import pprint

In [116]:
pipeline_description = pprint.pformat(stacking_pipeline)

with open('pipeline_description.txt', 'w') as file:
    file.write(pipeline_description)

In [112]:
data_sample

Unnamed: 0,tweet,label
0,Sasha goes to Mannheim University,1
1,Danylo will create a great application,1
2,Danylo will not create a great application,0
3,Dasha didnt fulfill her task,0
4,"You suck a dick, nigga",1
5,fuck jews idk,1
6,you are a good person,0


In [109]:
stacking_pipeline.predict_proba(data_sample)

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [60]:
l2_pipeline_1 = Pipeline([
    ('l1', l1),
    ('regressor', RandomForestClassifier())
])

l2_pipeline_2 = Pipeline([
    ('l1', l1),
    ('regressor', LogisticRegression())
])

In [61]:
l2_pipeline_1.fit(data_sample[['tweet']], data_sample['label'])
l2_pipeline_2.fit(data_sample[['tweet']], data_sample['label'])

In [142]:
l2_pipeline_1

In [65]:
eli5.explain_weights_df(l2_pipeline_1[-1], feature_names=l2_pipeline_2[:-1].get_feature_names_out())

Unnamed: 0,feature,weight,std
0,l1_g1_sentiment_score_of_row,0.664683,0.45259
1,l1_g1_number_of_words_in_row,0.335317,0.400713
2,l1_g1_number_of_full_stops,0.0,0.0
3,l1_g1_number_of_question_marks,0.0,0.0
4,l1_g1_number_of_excl_points,0.0,0.0
5,l1_g1_number_of_commas,0.0,0.0
6,l1_g1_contains_profanity_words_flag,0.0,0.0
7,l1_g1_rt_flag,0.0,0.0
8,l1_p4_RandomForestClassifier,0.0,0.0
9,l1_p3_CatBoostClassifier,0.0,0.0


In [138]:
meta_transformer = CustomColumnTransformer([
    ('l2_p1', PipelineTransformer(l2_pipeline_1), []),
    ('l2_p1', PipelineTransformer(l2_pipeline_2), []),
])

In [143]:
meta_transformer

In [139]:
meta_model = Pipeline(
    [
        ('l3_transformer', meta_transformer),
        ('l3_model', RandomForestClassifier()),
    ]
)

In [140]:
meta_model

In [132]:
l2_pipeline_1.predict(data_sample)

array([1, 1, 0, 0], dtype=int64)