In [9]:
from imblearn import FunctionSampler
from imblearn.pipeline import Pipeline
# from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, balanced_accuracy_score, matthews_corrcoef
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, chi2, f_classif
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.svm import LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.ensemble import RandomForestClassifier
# from pxtextmining.helpers.decode_emojis import text_preprocessor
from pxtextmining.helpers.sentiment_scores import sentiment_scores
from pxtextmining.helpers.text_length import text_length
from pxtextmining.helpers.tokenization import LemmaTokenizer
from pxtextmining.helpers.word_vectorization import EmbeddingsTransformer
from pxtextmining.helpers.oversampling import random_over_sampler_data_generator
from pxtextmining.helpers.metrics import class_balance_accuracy_score
from pxtextmining.helpers.estimator_switcher import ClfSwitcher
from pxtextmining.helpers.ordinal_classification import OrdinalClassifier
from pxtextmining.helpers.scaler_switcher import ScalerSwitcher
from pxtextmining.helpers.feature_selection_switcher import FeatureSelectionSwitcher
from pxtextmining.helpers.text_transformer_switcher import TextTransformerSwitcher
from pxtextmining.helpers.theme_binarization import ThemeBinarizer

In [2]:
import pandas as pd

from pxtextmining.factories.factory_data_load_and_split import factory_data_load_and_split


In [4]:
# Load '../datasets/text_data.csv' and split it into training and test sets.
# The helper's six return values are unpacked below; test_size=0.33 is
# presumably the fraction of rows held out for testing (confirm in the helper).
(
    x_train,
    x_test,
    y_train,
    y_test,
    index_training_data,
    index_test_data,
) = factory_data_load_and_split(
    filename='../datasets/text_data.csv',
    target="criticality",
    predictor="feedback",
    test_size=0.33,
    reduce_criticality=True,
    theme="label",
)

Loading dataset...
Shape of dataset before cleaning is (10334, 3)
Shape of dataset after cleaning and processing is (10298, 10)
Preparing training and test sets...
Done


In [5]:
# Inspect the training predictors returned by factory_data_load_and_split.
x_train

Unnamed: 0,predictor,theme,text_length,text_blob_polarity,text_blob_subjectivity,vader_neg,vader_neu,vader_pos,vader_compound
8310,excellent care,Care received,2,1.000000,1.000000,0.0,0.000,1.000,0.7845
4947,the administrators have shown such kindness to...,Staff,8,0.000000,0.500000,0.0,0.700,0.300,0.4588
8378,friendly professional as you would except exce...,Staff,8,0.491667,0.533333,0.0,0.451,0.549,0.8074
1292,fantastic support care by all staff,Care received,7,0.400000,0.900000,0.0,0.240,0.760,0.8591
7509,thank you to all the wonderful staff who cared...,Staff,11,1.000000,1.000000,0.0,0.471,0.529,0.8402
...,...,...,...,...,...,...,...,...,...
3748,I was very pleased with the services provided ...,Care received,11,0.650000,1.000000,0.0,0.758,0.242,0.4927
9317,nothing,Couldn't be improved,1,0.000000,0.000000,0.0,1.000,0.000,0.0000
1965,make sure that I knew what the vaccine helped ...,Communication,10,0.500000,0.888889,0.0,0.796,0.204,0.3182
4966,the care has been first class everything was d...,Care received,15,0.312500,0.333333,0.0,0.700,0.300,0.7177


In [13]:
# Ask scikit-learn to show estimators as diagrams when they are displayed.
from sklearn import set_config

set_config(display='diagram')

# Column of the predictor frame that holds the feedback text.
features_text = 'predictor'
# Transformers used by the pipeline #
# Text length: apply the text_length helper, then pass the result to a
# switchable scaler.
transformer_text_length = Pipeline(
    steps=[
        ('length', FunctionTransformer(text_length)),
        ('scaler', ScalerSwitcher()),
    ]
)

# Sentiment: apply the sentiment_scores helper (e.g. TextBlob, VADER
# indicators), then pass the result to a switchable scaler.
transformer_sentiment = Pipeline(
    steps=[
        ('sentiment', FunctionTransformer(sentiment_scores)),
        ('scaler', ScalerSwitcher()),
    ]
)

# Text representation: a single switchable step that turns the text into
# Bag-of-Words or embeddings.
transformer_text = Pipeline(
    steps=[
        ('text', TextTransformerSwitcher()),
    ]
)

# Gather the transformers; all three are applied to the same text column.
preprocessor = ColumnTransformer(
    transformers=[
        ('sentimenttr', transformer_sentiment, features_text),
        ('lengthtr', transformer_text_length, features_text),
        ('texttr', transformer_text, features_text),
    ]
)

# Up-sampling step #
# Wrap the project's random over-sampling generator as a sampler object,
# passing it fixed keyword arguments and skipping input validation.
oversampler = FunctionSampler(
    func=random_over_sampler_data_generator,
    kw_args={
        'threshold': 200,
        'up_balancing_counts': 300,
        'random_state': 0,
    },
    validate=False,
)

In [14]:
# Display the text-length pipeline (shown as a diagram, per set_config(display='diagram')).
transformer_text_length

In [15]:
# Display the sentiment pipeline (shown as a diagram, per set_config(display='diagram')).
transformer_sentiment

In [16]:
# Display the text-representation pipeline (shown as a diagram, per set_config(display='diagram')).
transformer_text

In [17]:
# Display the combined ColumnTransformer (shown as a diagram, per set_config(display='diagram')).
preprocessor

In [None]:
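# NOTE(review): unused scratch example kept for reference. It uses columns
# ('age', 'bmi', 'smoker', 'region') that do not appear in the x_train frame shown above, and
# make_pipeline / SimpleImputer / make_column_transformer / make_union are not
# imported in the cells visible here, so it will not run if uncommented as-is.
# num_transformer = make_pipeline(SimpleImputer(), StandardScaler())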
# cat_transformer = OneHotEncoder()

# preproc_basic = make_column_transformer(
#     (num_transformer, ['age', 'bmi']),
#     (cat_transformer, ['smoker', 'region']),
#     remainder='passthrough'
# )

# preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

# preproc_full