In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.pipeline import make_pipeline

from notebooks.Dzim.data_mining.validation.training import estimate_multiple_models, estimate_model

In [2]:
plt.rcParams['figure.facecolor'] = 'white'

In [3]:
%load_ext autoreload
%autoreload 2

#### Loading data

In [4]:
data = pd.read_csv('data1.csv', index_col=0)

In [5]:
# data.head(100)

In [6]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha didnt fulfill her task'
    ],
    'class': [1, 1, 0, 0]
})

#### Preprocessing

In [7]:
from sklearn.pipeline import Pipeline
from notebooks.Dzim.web_mining.bert_processor import TextCleaner, SeriesConverter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from notebooks.Dzim.web_mining.feature_generation import FeatureGenerator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
target = 'label'

In [9]:
# Split the data into train and test sets
data_train, data_test, y_train, y_test = train_test_split(data[['tweet']], data[target], test_size=0.2, random_state=42)

In [10]:
data_train.shape, data_test.shape

((25569, 1), (6393, 1))

In [11]:
# Define the pipeline
nlp = Pipeline([
    ("cleaning", TextCleaner()),
    ("series_converter", SeriesConverter()),
    ("vectorization", TfidfVectorizer(max_df=0.7, min_df=3)),
])

In [12]:
# Fit-transform the training data
X_train = nlp.fit_transform(data_train)  #.toarray()

In [13]:
# Transform the test data
X_test = nlp.transform(data_test)

In [14]:
# fg = FeatureGenerator()

In [15]:
# from sklearn.compose import ColumnTransformer
#
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("nlp", nlp, ['tweet']),
#         ("feat_gen", FeatureGenerator(), ['tweet']),
#     ],
#     remainder='drop'
# )

In [16]:
# fg.transform(data_sample)

#### Features

#### Models

In [17]:
from sklearn.dummy import DummyClassifier

dummy_mean = make_pipeline(
    DummyClassifier(strategy='most_frequent'),
)
dummy_median = make_pipeline(
    DummyClassifier(strategy='stratified'),
)

In [18]:
from sklearn.linear_model import LogisticRegression

lr = make_pipeline(
    # SimpleImputer(strategy='median', fill_value=0),
    LogisticRegression(),
)

In [19]:
from catboost import CatBoostClassifier

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from sklearn.ensemble import RandomForestClassifier

ctb = make_pipeline(
    CatBoostClassifier()
)
xgb = make_pipeline(
    XGBClassifier()
)
lgbm = make_pipeline(
    LGBMClassifier()
)

rfc_parameters = {
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100
}
rfc = make_pipeline(
    # SimpleImputer(missing_values=np.nan, strategy='mean'),
    RandomForestClassifier(**rfc_parameters),
)

In [310]:
model = rfc

In [305]:
lr.fit(X_train, y_train)

In [311]:
model.fit(X_train, y_train)

In [312]:
prediction = model.predict(X_test)

In [313]:
import warnings
from sklearn.metrics import classification_report, confusion_matrix

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(prediction, y_test))
    print(confusion_matrix(prediction, y_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      6118
           1       0.52      0.86      0.65       275

    accuracy                           0.96      6393
   macro avg       0.76      0.91      0.81      6393
weighted avg       0.97      0.96      0.96      6393

[[5899  219]
 [  38  237]]


In [314]:
train_prediction = model.predict(X_train)

In [315]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(train_prediction, y_train))
    print(confusion_matrix(train_prediction, y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23792
           1       0.99      1.00      1.00      1777

    accuracy                           1.00     25569
   macro avg       1.00      1.00      1.00     25569
weighted avg       1.00      1.00      1.00     25569

[[23782    10]
 [    1  1776]]


In [20]:
pipelines = [
    ("Dummy_mean", dummy_mean),
    ("Dummy_median", dummy_median),
    # ("LinReg", lr),
    # ("XGB", xgb),
    ("LGBM", lgbm),
    ("RFC", rfc),
    ("CatBoost", ctb),
]

#### Models

In [21]:
#### training
model_names, model_pipelines = zip(*pipelines)
metrics, weights = estimate_multiple_models(
    model_pipelines,
    lambda pipeline, _: estimate_model(
        pipeline,
        (X_train, X_test, y_train, y_test),
    ),
    model_names
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Learning rate set to 0.041118
0:	learn: 0.6476334	total: 243ms	remaining: 4m 2s
1:	learn: 0.6072432	total: 348ms	remaining: 2m 53s
2:	learn: 0.5712074	total: 444ms	remaining: 2m 27s
3:	learn: 0.5370233	total: 542ms	remaining: 2m 15s
4:	learn: 0.5076169	total: 635ms	remaining: 2m 6s
5:	learn: 0.4808841	total: 718ms	remaining: 1m 59s
6:	learn: 0.4559781	total: 782ms	remaining: 1m 50s
7:	learn: 0.4341940	total: 853ms	remaining: 1m 45s
8:	learn: 0.4134218	total: 924ms	remaining: 1m 41s
9:	learn: 0.3941060	total: 988ms	remaining: 1m 37s
10:	learn: 0.3767878	total: 1.05s	remaining: 1m 34s
11:	learn: 0.3614723	total: 1.13s	remaining: 1m 32s
12:	learn: 0.3479418	total: 1.19s	remaining: 1m 30s
13:	learn: 0.3353971	total: 1.26s	remaining: 1m 28s
14:	learn: 0.3246373	total: 1.33s	remaining: 1m 27s
15:	learn: 0.3136440	total: 1.39s	remaining: 1m 25s
16:	learn: 0.3047822	total: 1.46s	remaining: 1m 24s
17:	learn: 0.2966454	total: 1.53s	remaining: 1m 23s
18:	learn: 0.2894830	total: 1.59s	remaining: 1

In [22]:
metrics

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Dummy_mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dummy_median,0.072,0.001,0.071,0.072,0.066,-0.004,0.068,0.064
LGBM,0.662,0.644,0.941,0.51,0.56,0.536,0.775,0.439
RFC,0.997,0.996,0.999,0.994,0.647,0.628,0.889,0.509
CatBoost,0.68,0.663,0.958,0.527,0.589,0.568,0.861,0.447


In [322]:
for name, _ in pipelines:
    try:
        print("#" * 80)
        print(name)
        print(weights.loc[name])
    except KeyError:
        print('Not Supported')

################################################################################
Dummy_mean
Not Supported
################################################################################
Dummy_median
Not Supported
################################################################################
LinReg
         weight
feature        
x155      6.861
x6788     6.015
x4997     5.476
x4998     5.027
x6853     4.392
...         ...
x4445    -2.665
x627     -3.027
x2765    -3.027
x3679    -3.296
x1523    -4.132

[6993 rows x 1 columns]
################################################################################
XGB
Not Supported
################################################################################
LGBM
             weight
feature            
Column_6788   0.056
Column_155    0.053
Column_6382   0.051
Column_4998   0.039
Column_4997   0.030
...             ...
Column_4579   0.000
Column_4580   0.000
Column_4581   0.000
Column_4582   0.000
Column_0      0.000

[6992 rows x 1 colu

In [27]:
final_pipeline_1 = Pipeline([
    ('NLP', nlp),
    ('regressor', rfc[-1])
])

final_pipeline_2 = Pipeline([
    ('NLP', nlp),
    ('regressor', lgbm[-1])
])

final_pipeline_3 = Pipeline([
    ('NLP', nlp),
    ('regressor', ctb[-1])
])

In [28]:
final_pipeline_1

In [29]:
final_pipeline_2.predict_proba(pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        '@Danylo will create a great application',
        '@You bitch suck a dick fuck you shit in your ass',
        'He will break it',
    ]}))

array([[0.96401541, 0.03598459],
       [0.91287952, 0.08712048],
       [0.77486095, 0.22513905],
       [0.98496323, 0.01503677]])

In [37]:
from src.common.prediction_model.persistence import FsModelPersistence
from src.common.validation.metrics import aggregate_metrics
from src.common.prediction_model.prediction_model import ModelContainer, ModelMetadata

container = ModelContainer.create(
    pipeline_name='l1_data2_ctb',
    pipeline=final_pipeline_3,
    feature_names=[],
    metadata=ModelMetadata(
        model_name=str(final_pipeline_3.__class__.__name__),
        **aggregate_metrics(metrics)['test'].loc["CatBoost"]
    )
)

In [38]:
# %%
persistor = FsModelPersistence()
name = persistor.save(container)

In [39]:
name

'l1_data2_ctb_Pipeline_2023-05-16.bin.gz'

#### Hyperparameter optimization

In [343]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a RandomForestClassifier object
model = RandomForestClassifier()

"""
Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy:  0.9592866361609763
Computation tmie: 245.936190366745
"""

In [347]:
# Define the parameter grid to search over
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'random_strength': [0.1, 0.5, 1],
}
model = CatBoostClassifier()
"""

"""

'\n\n'

In [349]:
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
}

model = XGBClassifier(tree_method='gpu_hist')

In [None]:
# Create a GridSearchCV object and fit the data
start = time.time()
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
end = time.time()
print(f"Computation tmie: {end - start}")