In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

from notebooks.Dzim.data_mining.validation.training import estimate_multiple_models, estimate_model

In [2]:
plt.rcParams['figure.facecolor'] = 'white'

In [3]:
%load_ext autoreload
%autoreload 2

#### Loading data

In [4]:
data = pd.read_csv('data3.csv')

In [5]:
data = data.loc[data['label'].isin(['hate', 'noHate'])]
data['label'] = (data['label'] == 'hate').astype(int)
data.columns = ['tweet', 'label']

In [6]:
data.head()

Unnamed: 0,tweet,label
0,"As of March 13th , 2014 , the booklet had been...",0
1,Thank you in advance. : ) Download the youtube...,0
2,In order to help increase the booklets downloa...,0
3,( Simply copy and paste the following text int...,0
4,Click below for a FREE download of a colorfull...,1


In [7]:
data.label.value_counts()

0    9507
1    1196
Name: label, dtype: int64

In [8]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha didnt fulfill her task'
    ],
    'label': [1, 1, 0, 0]
})

#### Preprocessing

In [9]:
from sklearn.pipeline import Pipeline
from notebooks.Dzim.web_mining.bert_processor import TextCleaner, SeriesConverter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
target = 'label'

In [11]:
# Split the data into train and test sets
data_train, data_test, y_train, y_test = train_test_split(data[['tweet']], data[target], test_size=0.2, random_state=42)

In [12]:
data_train.shape, data_test.shape

((8562, 1), (2141, 1))

In [13]:
# Define the pipeline
nlp = Pipeline([
    ("cleaning", TextCleaner()),
    ("series_converter", SeriesConverter()),
    ("vectorization", TfidfVectorizer(max_df=0.7, min_df=3)),
])

In [14]:
# Fit-transform the training data
X_train = nlp.fit_transform(data_train)  #.toarray()

In [15]:
# Transform the test data
X_test = nlp.transform(data_test)

#### Models

In [27]:
dummy_mean = make_pipeline(
    DummyClassifier(strategy='most_frequent'),
)
dummy_median = make_pipeline(
    DummyClassifier(strategy='stratified'),
)

In [28]:
lr = make_pipeline(
    LogisticRegression(),
)

In [80]:
ctb = make_pipeline(
    CatBoostClassifier()
)
xgb_parameters = {
    'learning_rate': 0.01,
    'max_depth': 9,
    'min_child_weight': 3,
    'n_estimators': 300

}

xgb = make_pipeline(
    XGBClassifier(**xgb_parameters)
)

lgbm_parameters = {
    'boosting_type': 'dart',
    'learning_rate': 0.1,
    'n_estimators': 100,
    'num_leaves': 20
}

lgbm = make_pipeline(
    LGBMClassifier(**lgbm_parameters)
)

rfc_parameters = {
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 10,
    'n_estimators': 300
}

rfc = make_pipeline(
    RandomForestClassifier(**rfc_parameters),
)

In [33]:
model = ctb

In [34]:
# lr.fit(X_train, y_train)

In [35]:
model.fit(X_train, y_train)

In [36]:
prediction = model.predict(X_test)

In [39]:
import warnings
from sklearn.metrics import classification_report, confusion_matrix

In [40]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(prediction, y_test))
    print(confusion_matrix(prediction, y_test))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94      2073
           1       0.20      0.74      0.31        68

    accuracy                           0.90      2141
   macro avg       0.59      0.82      0.63      2141
weighted avg       0.97      0.90      0.92      2141

[[1868  205]
 [  18   50]]


In [37]:
train_prediction = model.predict(X_train)

In [41]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(classification_report(train_prediction, y_train))
    print(confusion_matrix(train_prediction, y_train))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      8252
           1       0.32      0.96      0.48       310

    accuracy                           0.92      8562
   macro avg       0.66      0.94      0.72      8562
weighted avg       0.97      0.92      0.94      8562

[[7610  642]
 [  11  299]]


In [81]:
pipelines = [
    ("Dummy_mean", dummy_mean),
    ("Dummy_median", dummy_median),
    ("LinReg", lr),
    ("XGB", xgb),
    ("LGBM", lgbm),
    ("RFC", rfc),
    ("CatBoost", ctb),
]

#### Models

In [82]:

#### training
model_names, model_pipelines = zip(*pipelines)
metrics, weights = estimate_multiple_models(
    model_pipelines,
    lambda pipeline, _: estimate_model(
        pipeline,
        (X_train, X_test, y_train, y_test),
    ),
    model_names
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Learning rate set to 0.025772
0:	learn: 0.6722376	total: 29.3ms	remaining: 29.3s
1:	learn: 0.6526178	total: 57.3ms	remaining: 28.6s
2:	learn: 0.6332825	total: 84.5ms	remaining: 28.1s
3:	learn: 0.6153812	total: 112ms	remaining: 28s
4:	learn: 0.5992390	total: 141ms	remaining: 28.2s
5:	learn: 0.5829984	total: 170ms	remaining: 28.2s
6:	learn: 0.5688213	total: 198ms	remaining: 28.1s
7:	learn: 0.5553256	total: 226ms	remaining: 28s
8:	learn: 0.5427053	total: 252ms	remaining: 27.8s
9:	learn: 0.5309441	total: 281ms	remaining: 27.8s
10:	learn: 0.5198678	total: 308ms	remaining: 27.7s
11:	learn: 0.5081727	total: 343ms	remaining: 28.3s
12:	learn: 0.4977785	total: 377ms	remaining: 28.6s
13:	learn: 0.4882694	total: 407ms	remaining: 28.7s
14:	learn: 0.4796549	total: 439ms	remaining: 28.8s
15:	learn: 0.4714581	total: 472ms	remaining: 29s
16:	learn: 0.4627935	total: 505ms	remaining: 29.2s
17:	learn: 0.4551046	total: 537ms	remaining: 29.3s
18:	learn: 0.4479481	total: 567ms	remaining: 29.3s
19:	learn: 0.4

In [84]:
metrics

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Dummy_mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dummy_median,0.124,0.021,0.13,0.119,0.096,-0.016,0.104,0.09
LinReg,0.206,0.184,0.822,0.118,0.14,0.118,0.667,0.078
XGB,0.337,0.308,0.879,0.208,0.189,0.16,0.667,0.11
LGBM,0.395,0.365,0.925,0.251,0.23,0.198,0.7,0.137
RFC,0.891,0.879,0.983,0.815,0.19,0.163,0.7,0.11
CatBoost,0.478,0.448,0.965,0.318,0.31,0.273,0.735,0.196


In [322]:
for name, _ in pipelines:
    try:
        print("#" * 80)
        print(name)
        print(weights.loc[name])
    except KeyError:
        print('Not Supported')

################################################################################
Dummy_mean
Not Supported
################################################################################
Dummy_median
Not Supported
################################################################################
LinReg
         weight
feature        
x155      6.861
x6788     6.015
x4997     5.476
x4998     5.027
x6853     4.392
...         ...
x4445    -2.665
x627     -3.027
x2765    -3.027
x3679    -3.296
x1523    -4.132

[6993 rows x 1 columns]
################################################################################
XGB
Not Supported
################################################################################
LGBM
             weight
feature            
Column_6788   0.056
Column_155    0.053
Column_6382   0.051
Column_4998   0.039
Column_4997   0.030
...             ...
Column_4579   0.000
Column_4580   0.000
Column_4581   0.000
Column_4582   0.000
Column_0      0.000

[6992 rows x 1 colu

In [42]:
final_pipeline_1 = Pipeline([
    ('NLP', nlp),
    ('regressor', rfc[-1])
])

final_pipeline_2 = Pipeline([
    ('NLP', nlp),
    ('regressor', lgbm[-1])
])

final_pipeline_3 = Pipeline([
    ('NLP', nlp),
    ('regressor', ctb[-1])
])

final_pipeline_4 = Pipeline([
    ('NLP', nlp),
    ('regressor', lr[-1])
])

In [43]:
final_pipeline_3

In [51]:
final_pipeline_3.predict_proba(pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        '@Danylo will create a great application',
        '@You bitch suck a dick fuck you shit in your ass',
        'He will break it',
    ]}))

array([[0.95473859, 0.04526141],
       [0.92708275, 0.07291725],
       [0.92499595, 0.07500405],
       [0.89467997, 0.10532003]])

In [53]:
from src.common.validation.metrics import aggregate_metrics
from src.common.prediction_model.persistence import FsModelPersistence
from src.common.prediction_model.prediction_model import ModelContainer, ModelMetadata

container = ModelContainer.create(
    pipeline_name='l1_data3_ctb',
    pipeline=final_pipeline_3,
    feature_names=[],
    metadata=ModelMetadata(
        model_name=str(final_pipeline_4.__class__.__name__),
        **aggregate_metrics(metrics)['test'].loc["CatBoost"]
    )
)

In [54]:
# %%
persistor = FsModelPersistence()
name = persistor.save(container)

In [55]:
name

'l1_data3_ctb_Pipeline_2023-05-17.bin.gz'

#### Hyperparameter optimization

In [18]:
from sklearn.model_selection import GridSearchCV
import time


In [21]:

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a RandomForestClassifier object
model = RandomForestClassifier()

"""
Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best accuracy:  0.8954683485167018
Computation tmie: 49.068764209747314
"""

"\nBest parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}\nBest accuracy:  0.8954683485167018\nComputation tmie: 49.068764209747314\n"

In [21]:
# Define the parameter grid to search over
param_grid = {
    'boosting_type': ['gbdt', 'dart', 'goss'],  # Different boosting types
    'num_leaves': [20, 30, 40],  # Maximum number of leaves in one tree
    'learning_rate': [0.1, 0.01, 0.001],  # Learning rate for boosting
    'n_estimators': [100, 200, 300],  # Number of boosting iterations
    # 'subsample': [0.8, 0.9, 1.0],  # Subsample ratio of the training instances
    # 'colsample_bytree': [0.8, 0.9, 1.0],  # Subsample ratio of columns when constructing each tree
    # 'reg_alpha': [0.0, 0.1, 0.5],  # L1 regularization term on weights
    # 'reg_lambda': [0.0, 0.1, 0.5],  # L2 regularization term on weights
    # 'min_child_samples': [20, 50, 100]  # Minimum number of data needed in a child (leaf)
}

model = LGBMClassifier(device="gpu")
"""
Best parameters:  {'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 20}
Best accuracy:  0.8933660359729035
Computation tmie: 375.2135384082794
"""

'\n\n'

In [22]:
# Create a GridSearchCV object and fit the data
start = time.time()
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
end = time.time()
print(f"Computation tmie: {end - start}")

Best parameters:  {'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 20}
Best accuracy:  0.8933660359729035
Computation tmie: 375.2135384082794


In [16]:
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
}

model = XGBClassifier(tree_method='gpu_hist')


"""
Best parameters:  {'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 300}
Best accuracy:  0.8903293622985284
Computation tmie: 519.5483028888702
"""