In [5]:
import numpy as np
from sklearn.compose import ColumnTransformer
from transformers import Pipeline
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class PipelineTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipeline: Pipeline,
            fitted: bool = True
    ):
        self.pipeline = pipeline
        self.fitted = fitted
        self.output_n_ = ['']

    def fit(self, X: pd.DataFrame = None, y=None):
        if not self.fitted:
            self.pipeline.fit(X, y)
        return self

    def transform(self, X):
        try:
            prediction = self.pipeline.predict_proba(X)
            if self.pipeline[-1].classes_.shape[0] >= 3:
                prediction = prediction
                self.output_n_ = [f'_{i}' for i in range(self.pipeline[-1].classes_.shape[0])]
            else:
                prediction = prediction[:, 0]
        except AttributeError:
            prediction = self.pipeline.predict(X)
        return prediction

    def __sklearn_is_fitted__(self):
        return True

    def get_feature_names_out(self, input_features=None):
        return [type(self.pipeline[-1]).__name__ + f for f in self.output_n_]  #+ str(i) for i in self.pipeline[-1].classes_


class PipelinePredictor(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            pipelines,
            model_names: list = None,
    ):
        self.pipelines = pipelines
        if model_names is None:
            self.model_names = [type(pipe[-1]).__name__ for pipe in pipelines]
        else:
            self.model_names = model_names

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame or np.array:
        predictions = pd.DataFrame()
        for pipeline, name in zip(self.pipelines, self.model_names):
            predictions[name] = pipeline.predict(X)

        return predictions


class CustomColumnTransformer(ColumnTransformer):
    def fit(self, X=None, y=None):
        return self

    def fit_transform(self, X=None, y=None):
        return self.transform(X)

    def transform(self, X):
        output = []
        self.feature_names_out = []

        for name, transformer, _ in self.transformers:
            transformer_output = transformer.transform(X)
            output.append(pd.DataFrame(transformer_output))

            try:
                feature_names = transformer.get_feature_names_out()
            except AttributeError:
                feature_names = transformer_output.columns

            self.feature_names_out += (name + '_' + feat for feat in feature_names)

        return np.concatenate(output, axis=1)

    def get_feature_names_out(self, input_features=None):
        return self.feature_names_out


In [9]:
import pickle
import gzip


def read_gzip_model(file):
    with gzip.open(file, mode="rb") as f:
        return pickle.load(f)

m = read_gzip_model('stacking_pipeline_Pipeline_2023-05-23.bin.gz')

In [17]:
stacking_model = m.pipeline

In [18]:
stacking_model

In [37]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha fulfilled her task',
        'You suck a dick, nigga',
        'fuck jews idk',
        'you are a good person',
    ],
})

In [38]:
stacking_model.predict(data_sample)

array([2, 2, 2, 0, 1, 1, 0], dtype=int64)

#### Predict on processed data

In [27]:
X_test = pd.read_csv('data5_processed_test.csv', index_col=0)
y_test = pd.read_csv('y5_processed_test.csv', index_col=0)

In [14]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.859649,0.900000,0.567415,1.661460,0.077533,0.803768,0.118698,0.101152,0.575513,0.323335,0.940367,0.0,1.0,-0.5038,49.0,0.0,0.0,0.0,0.0
1,0.628969,0.260000,0.584839,2.824825,0.134313,0.716365,0.149322,0.101616,0.878575,0.019809,0.898439,0.0,1.0,-0.8493,34.0,0.0,0.0,0.0,0.0
2,0.176637,0.310000,1.047882,0.558034,0.727103,0.217169,0.055727,0.351215,0.413401,0.235384,0.898149,0.0,0.0,-0.2500,9.0,0.0,0.0,0.0,0.0
3,0.045129,0.390000,1.525594,1.066207,0.464283,0.489319,0.046398,0.683845,0.296248,0.019907,0.693979,0.0,1.0,-0.8169,27.0,0.0,0.0,0.0,0.0
4,0.679891,0.390000,1.526451,1.174880,0.209937,0.516017,0.274046,0.433859,0.373073,0.193068,0.363039,0.0,1.0,-0.7845,51.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4025,0.473810,0.510000,0.616623,0.874583,0.220002,0.358298,0.421701,0.178223,0.318875,0.502902,0.874339,0.0,1.0,-0.9416,44.0,0.0,0.0,0.0,0.0
4026,0.972618,0.970000,0.120703,2.874699,0.034653,0.898249,0.067099,0.015709,0.966276,0.018014,0.941315,0.0,1.0,0.4215,28.0,0.0,0.0,0.0,0.0
4027,0.572760,0.637593,0.782072,0.874583,0.418352,0.462580,0.119068,0.127773,0.407215,0.465012,0.892215,0.0,0.0,-0.8176,13.0,0.0,0.0,0.0,0.0
4028,0.920090,0.980000,1.624888,1.682285,0.675868,0.310882,0.013251,0.661903,0.331909,0.006188,0.918714,0.0,1.0,-0.9612,47.0,0.0,0.0,0.0,0.0


In [19]:
stacking_model[-1].predict(X_test)



array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

In [29]:
y_test.columns

Index(['label'], dtype='object')

In [30]:
type(y_test['label'])

pandas.core.series.Series

In [31]:
y_test['label']

Unnamed: 0.1
2879     1
18440    2
14023    1
9668     1
17813    1
        ..
12164    1
17108    2
14976    2
2209     1
16380    0
Name: label, Length: 4030, dtype: int64

In [34]:
from sklearn.metrics import classification_report

print(classification_report(stacking_model[-1].predict(X_test), y_test))

              precision    recall  f1-score   support

           0       0.48      0.50      0.49      1540
           1       0.49      0.48      0.48      1283
           2       0.37      0.36      0.36      1207

    accuracy                           0.45      4030
   macro avg       0.45      0.45      0.45      4030
weighted avg       0.45      0.45      0.45      4030



