In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

from notebooks.Dzim.data_mining.validation.training import estimate_multiple_models, estimate_model

In [2]:
plt.rcParams['figure.facecolor'] = 'white'

In [3]:
%load_ext autoreload
%autoreload 2

#### Loading data

In [4]:
data = pd.read_csv('data3.csv')

In [5]:
data = data.loc[data['label'].isin(['hate', 'noHate'])]
data['label'] = (data['label'] == 'hate').astype(int)
data.columns = ['tweet', 'label']

In [6]:
data.head()

Unnamed: 0,tweet,label
0,"As of March 13th , 2014 , the booklet had been...",0
1,Thank you in advance. : ) Download the youtube...,0
2,In order to help increase the booklets downloa...,0
3,( Simply copy and paste the following text int...,0
4,Click below for a FREE download of a colorfull...,1


In [7]:
data.label.value_counts()

0    9507
1    1196
Name: label, dtype: int64

In [8]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha didnt fulfill her task'
    ],
    'label': [1, 1, 0, 0]
})

#### Preprocessing

In [9]:
from sklearn.pipeline import Pipeline
from notebooks.Dzim.web_mining.bert_processor import TextCleaner, SeriesConverter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
target = 'label'

In [132]:
# Split the data into train and test sets
data_train, data_test, y_train, y_test = train_test_split(data[['tweet']], data[target], test_size=0.2, random_state=42, stratify=data[target])

In [133]:
data_train.shape, data_test.shape

((8562, 1), (2141, 1))

In [134]:
# Define the pipeline
nlp = Pipeline([
    ("cleaning", TextCleaner()),
    ("series_converter", SeriesConverter()),
    ("vectorization", TfidfVectorizer(max_df=0.5, min_df=3)),
])

In [135]:
# Fit-transform the training data
X_train = nlp.fit_transform(data_train)  #.toarray()

In [136]:
X_train.shape

(8562, 3240)

In [137]:
# Transform the test data
X_test = nlp.transform(data_test)

#### Models

In [138]:
dummy_mean = make_pipeline(
    DummyClassifier(strategy='most_frequent'),
)
dummy_median = make_pipeline(
    DummyClassifier(strategy='stratified'),
)

In [139]:
lr = make_pipeline(
    LogisticRegression(),
)

In [140]:
ctb = make_pipeline(
    CatBoostClassifier()
)
xgb_parameters = {
    'learning_rate': 0.01,
    'max_depth': 9,
    'min_child_weight': 3,
    'n_estimators': 300

}

xgb = make_pipeline(
    XGBClassifier(**xgb_parameters)
)

lgbm_parameters = {'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 256, 'num_leaves': 20}

lgbm = make_pipeline(
    LGBMClassifier(**lgbm_parameters)
)

rfc_parameters = {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 256}

rfc = make_pipeline(
    RandomForestClassifier(**rfc_parameters),
)

In [141]:
rfc_parameters = {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 256}

rfc = make_pipeline(
    RandomForestClassifier(**rfc_parameters),
)

In [142]:
model = rfc

In [143]:
# lr.fit(X_train, y_train)

In [144]:
model.fit(X_train, y_train)

In [145]:
prediction = model.predict(X_test)

In [146]:
import warnings
from sklearn.metrics import classification_report, confusion_matrix

In [147]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(prediction, y_test))
    print(confusion_matrix(prediction, y_test))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94      2100
           1       0.12      0.68      0.20        41

    accuracy                           0.90      2141
   macro avg       0.56      0.79      0.57      2141
weighted avg       0.98      0.90      0.93      2141

[[1889  211]
 [  13   28]]


In [148]:
train_prediction = model.predict(X_train)

In [149]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(train_prediction, y_train))
    print(confusion_matrix(train_prediction, y_train))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      8267
           1       0.29      0.94      0.44       295

    accuracy                           0.92      8562
   macro avg       0.64      0.93      0.70      8562
weighted avg       0.97      0.92      0.94      8562

[[7586  681]
 [  19  276]]


In [150]:
pipelines = [
    ("Dummy_mean", dummy_mean),
    ("Dummy_median", dummy_median),
    ("LinReg", lr),
    ("XGB", xgb),
    ("LGBM", lgbm),
    ("RFC", rfc),
    ("CatBoost", ctb),
]

#### Models

In [151]:

#### training
model_names, model_pipelines = zip(*pipelines)
metrics, weights = estimate_multiple_models(
    model_pipelines,
    lambda pipeline, _: estimate_model(
        pipeline,
        (X_train, X_test, y_train, y_test),
    ),
    model_names
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Learning rate set to 0.025772
0:	learn: 0.6719091	total: 27.1ms	remaining: 27s
1:	learn: 0.6517708	total: 52.1ms	remaining: 26s
2:	learn: 0.6337328	total: 77ms	remaining: 25.6s
3:	learn: 0.6162053	total: 102ms	remaining: 25.3s
4:	learn: 0.6002484	total: 126ms	remaining: 25.1s
5:	learn: 0.5847897	total: 152ms	remaining: 25.2s
6:	learn: 0.5707623	total: 177ms	remaining: 25.1s
7:	learn: 0.5569675	total: 202ms	remaining: 25s
8:	learn: 0.5444003	total: 228ms	remaining: 25.1s
9:	learn: 0.5319046	total: 256ms	remaining: 25.4s
10:	learn: 0.5209678	total: 284ms	remaining: 25.6s
11:	learn: 0.5097912	total: 315ms	remaining: 25.9s
12:	learn: 0.4992775	total: 344ms	remaining: 26.1s
13:	learn: 0.4902764	total: 374ms	remaining: 26.4s
14:	learn: 0.4802049	total: 402ms	remaining: 26.4s
15:	learn: 0.4718727	total: 431ms	remaining: 26.5s
16:	learn: 0.4634352	total: 460ms	remaining: 26.6s
17:	learn: 0.4558712	total: 489ms	remaining: 26.7s
18:	learn: 0.4489125	total: 521ms	remaining: 26.9s
19:	learn: 0.442

In [152]:
metrics

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Dummy_mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dummy_median,0.101,-0.011,0.102,0.099,0.106,-0.004,0.108,0.105
LinReg,0.203,0.18,0.81,0.116,0.16,0.135,0.611,0.092
XGB,0.338,0.31,0.909,0.208,0.186,0.159,0.65,0.109
LGBM,0.407,0.373,0.85,0.268,0.262,0.225,0.606,0.167
RFC,0.451,0.419,0.917,0.299,0.191,0.162,0.614,0.113
CatBoost,0.466,0.435,0.955,0.308,0.248,0.214,0.627,0.155


In [84]:
metrics

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Dummy_mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dummy_median,0.124,0.021,0.13,0.119,0.096,-0.016,0.104,0.09
LinReg,0.206,0.184,0.822,0.118,0.14,0.118,0.667,0.078
XGB,0.337,0.308,0.879,0.208,0.189,0.16,0.667,0.11
LGBM,0.395,0.365,0.925,0.251,0.23,0.198,0.7,0.137
RFC,0.891,0.879,0.983,0.815,0.19,0.163,0.7,0.11
CatBoost,0.478,0.448,0.965,0.318,0.31,0.273,0.735,0.196


In [322]:
for name, _ in pipelines:
    try:
        print("#" * 80)
        print(name)
        print(weights.loc[name])
    except KeyError:
        print('Not Supported')

################################################################################
Dummy_mean
Not Supported
################################################################################
Dummy_median
Not Supported
################################################################################
LinReg
         weight
feature        
x155      6.861
x6788     6.015
x4997     5.476
x4998     5.027
x6853     4.392
...         ...
x4445    -2.665
x627     -3.027
x2765    -3.027
x3679    -3.296
x1523    -4.132

[6993 rows x 1 columns]
################################################################################
XGB
Not Supported
################################################################################
LGBM
             weight
feature            
Column_6788   0.056
Column_155    0.053
Column_6382   0.051
Column_4998   0.039
Column_4997   0.030
...             ...
Column_4579   0.000
Column_4580   0.000
Column_4581   0.000
Column_4582   0.000
Column_0      0.000

[6992 rows x 1 colu

In [42]:
final_pipeline_1 = Pipeline([
    ('NLP', nlp),
    ('regressor', rfc[-1])
])

final_pipeline_2 = Pipeline([
    ('NLP', nlp),
    ('regressor', lgbm[-1])
])

final_pipeline_3 = Pipeline([
    ('NLP', nlp),
    ('regressor', ctb[-1])
])

final_pipeline_4 = Pipeline([
    ('NLP', nlp),
    ('regressor', lr[-1])
])

In [43]:
final_pipeline_3

In [51]:
final_pipeline_3.predict_proba(pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        '@Danylo will create a great application',
        '@You bitch suck a dick fuck you shit in your ass',
        'He will break it',
    ]}))

array([[0.95473859, 0.04526141],
       [0.92708275, 0.07291725],
       [0.92499595, 0.07500405],
       [0.89467997, 0.10532003]])

In [53]:
from src.common.validation.metrics import aggregate_metrics
from src.common.prediction_model.persistence import FsModelPersistence
from src.common.prediction_model.prediction_model import ModelContainer, ModelMetadata

container = ModelContainer.create(
    pipeline_name='l1_data3_ctb',
    pipeline=final_pipeline_3,
    feature_names=[],
    metadata=ModelMetadata(
        model_name=str(final_pipeline_4.__class__.__name__),
        **aggregate_metrics(metrics)['test'].loc["CatBoost"]
    )
)

In [54]:
# %%
persistor = FsModelPersistence()
name = persistor.save(container)

In [55]:
name

'l1_data3_ctb_Pipeline_2023-05-17.bin.gz'

#### Hyperparameter optimization

In [61]:
from sklearn.model_selection import GridSearchCV
import time


In [62]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [128, 200, 256, 384],
    'max_depth': [None, 2, 3, 4],
    'min_samples_split': [10, 12, 16, 20],
    'min_samples_leaf': [1, 8, 16],
}

# Create a RandomForestClassifier object
model = RandomForestClassifier()

"""
Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best accuracy:  0.8954683485167018
Computation tmie: 49.068764209747314
"""

"\nBest parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}\nBest accuracy:  0.8954683485167018\nComputation tmie: 49.068764209747314\n"

In [153]:
# Define the parameter grid to search over
param_grid = {
    'boosting_type': ['gbdt', 'dart', 'goss'],  # Different boosting types
    'num_leaves': [16, 20, 24],  # Maximum number of leaves in one tree
    'learning_rate': [0.1, 0.01],  # Learning rate for boosting
    'n_estimators': [128, 256, 384],  # Number of boosting iterations
}

model = LGBMClassifier(device="gpu")
"""
Best parameters:  {'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 256, 'num_leaves': 20}
Best accuracy:  0.8944180292540278
Computation tmie: 394.68553471565247
"""

"\nBest parameters:  {'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 20}\nBest accuracy:  0.8933660359729035\nComputation tmie: 375.2135384082794\n"

In [184]:
p = {
    'iterations': [128, 256, 384],  # Number of boosting iterations
    'learning_rate': [0.01, 0.1],  # Learning rate
    'depth': [4, 6, 8],  # Depth of the trees
    'l2_leaf_reg': [1, 2, 4],  # L2 regularization coefficient
    'border_count': [32, 64, 128],  # Number of splits for numerical features
    'eval_metric': ['Logloss', 'AUC'],  # Evaluation metric
    'random_seed': [42]  # Random seed for reproducibility
}


model = CatBoostClassifier(task_type='GPU')


"""
Best parameters:  {
    'iterations': 128,       # Number of boosting iterations
    'learning_rate': 0.1,    # Learning rate
    'depth': 6,              # Depth of the trees
    'l2_leaf_reg': 1,        # L2 regularization coefficient
    'border_count': 64,      # Number of splits for numerical features
    'eval_metric': 'Logloss',# Evaluation metric
    'random_seed': 42        # Random seed for reproducibility
}

Best accuracy:  0.8903293622985284
Computation tmie: 3519.5483028888702
"""

"\nBest parameters:  {'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 300}\nBest accuracy:  0.8903293622985284\nComputation tmie: 519.5483028888702\n"

In [None]:
# Create a GridSearchCV object and fit the data
start = time.time()
grid_search = GridSearchCV(model, param_grid=p, cv=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
end = time.time()
print(f"Computation tmie: {end - start}")

In [182]:
grid_search.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'