In [26]:
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

from notebooks.Dzim.data_mining.validation.training import estimate_multiple_models, estimate_model

import warnings
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
plt.rcParams['figure.facecolor'] = 'white'

In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Loading data

In [4]:
data = pd.read_csv('data2.csv', index_col=0)

In [5]:
data.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
data['class'].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

In [7]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha didnt fulfill her task'
    ],
    'label': [1, 1, 0, 0]
})

#### Preprocessing

In [8]:
from sklearn.pipeline import Pipeline
from notebooks.Dzim.web_mining.bert_processor import TextCleaner, SeriesConverter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
target = 'class'

In [10]:
# Split the data into train and test sets
data_train, data_test, y_train, y_test = train_test_split(data[['tweet']], data[target], test_size=0.2, random_state=42)

In [11]:
data_train.shape, data_test.shape

((19826, 1), (4957, 1))

In [12]:
# Define the pipeline
nlp = Pipeline([
    ("cleaning", TextCleaner()),
    ("series_converter", SeriesConverter()),
    ("vectorization", TfidfVectorizer(max_df=0.7, min_df=3)),
])

In [13]:
# Fit-transform the training data
X_train = nlp.fit_transform(data_train)  #.toarray()

In [14]:
# Transform the test data
X_test = nlp.transform(data_test)

#### Models

In [15]:
dummy_mean = make_pipeline(
    DummyClassifier(strategy='most_frequent'),
)
dummy_median = make_pipeline(
    DummyClassifier(strategy='stratified'),
)

In [16]:
lr = make_pipeline(
    LogisticRegression(),
)

In [17]:
ctb = make_pipeline(
    CatBoostClassifier()
)
xgb_parameters = {
    # 'learning_rate': 0.01,
    # 'max_depth': 9,
    # 'min_child_weight': 3,
    # 'n_estimators': 300

}

xgb = make_pipeline(
    XGBClassifier(**xgb_parameters)
)

lgbm_parameters = {
    # 'boosting_type': 'dart',
    # 'learning_rate': 0.1,
    # 'n_estimators': 100,
    # 'num_leaves': 20
}

lgbm = make_pipeline(
    LGBMClassifier(**lgbm_parameters)
)

rfc_parameters = {
    # 'max_depth': None,
    # 'min_samples_leaf': 1,
    # 'min_samples_split': 10,
    # 'n_estimators': 300
}

rfc = make_pipeline(
    RandomForestClassifier(**rfc_parameters),
)

#### Train single model

In [18]:
# model = ctb

In [19]:
# lr.fit(X_train, y_train)

In [20]:
# model.fit(X_train, y_train)

In [21]:
# prediction = model.predict(X_test)

In [50]:
# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
#     print(classification_report(prediction, y_test))
#     print(confusion_matrix(prediction, y_test))

In [51]:
# train_prediction = model.predict(X_train)

In [52]:
# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
#     print(classification_report(train_prediction, y_train))
#     print(confusion_matrix(train_prediction, y_train))

In [65]:
from sklearn.metrics import f1_score

print(classification_report([1, 2, 3], [1, 2, 2]))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1
           2       0.50      1.00      0.67         1
           3       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
pipelines = [
    ("Dummy_mean", dummy_mean),
    ("Dummy_median", dummy_median),
    ("LinReg", lr),
    ("XGB", xgb),
    ("LGBM", lgbm),
    ("RFC", rfc),
    ("CatBoost", ctb),
]

#### Models

In [30]:
#### training
model_names, model_pipelines = zip(*pipelines)
metrics, weights = estimate_multiple_models(
    model_pipelines,
    lambda pipeline, _: estimate_model(
        pipeline,
        (X_train, X_test, y_train, y_test),
    ),
    model_names
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.092147
0:	learn: 0.9865924	total: 269ms	remaining: 4m 28s
1:	learn: 0.9011843	total: 393ms	remaining: 3m 16s
2:	learn: 0.8335079	total: 513ms	remaining: 2m 50s
3:	learn: 0.7765817	total: 639ms	remaining: 2m 39s
4:	learn: 0.7295698	total: 767ms	remaining: 2m 32s
5:	learn: 0.6884794	total: 892ms	remaining: 2m 27s
6:	learn: 0.6538000	total: 1.02s	remaining: 2m 24s
7:	learn: 0.6236899	total: 1.14s	remaining: 2m 21s
8:	learn: 0.5968766	total: 1.27s	remaining: 2m 19s
9:	learn: 0.5753602	total: 1.4s	remaining: 2m 18s
10:	learn: 0.5558522	total: 1.52s	remaining: 2m 16s
11:	learn: 0.5360669	total: 1.65s	remaining: 2m 15s
12:	learn: 0.5198692	total: 1.78s	remaining: 2m 14s
13:	learn: 0.5026694	total: 1.91s	remaining: 2m 14s
14:	learn: 0.4917405	total: 2.03s	remaining: 2m 13s
15:	learn: 0.4787190	total: 2.16s	remaining: 2m 12s
16:	learn: 0.4678481	total: 2.28s	remaining: 2m 12s
17:	learn: 0.4586968	total: 2.41s	remaining: 2m 11s
18:	learn: 0.4514327	total: 2.52s	remaining: 

In [31]:
metrics

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Dummy_mean,0.676,0.0,0.6,0.775,0.674,0.0,0.598,0.773
Dummy_median,0.632,-0.002,0.631,0.633,0.63,-0.004,0.628,0.632
LinReg,0.919,0.794,0.924,0.929,0.879,0.684,0.877,0.892
XGB,0.936,0.834,0.941,0.939,0.891,0.725,0.89,0.899
LGBM,0.939,0.841,0.942,0.943,0.889,0.719,0.887,0.898
RFC,0.998,0.994,0.998,0.998,0.875,0.673,0.873,0.888
CatBoost,0.905,0.761,0.91,0.913,0.891,0.727,0.891,0.9


In [32]:
for name, _ in pipelines:
    try:
        print("#" * 80)
        print(name)
        print(weights.loc[name])
    except KeyError:
        print('Not Supported')

################################################################################
Dummy_mean
Not Supported
################################################################################
Dummy_median
Not Supported
################################################################################
LinReg
         weight
feature        
x1449     4.976
x2929     4.571
x2934     4.413
x4700     3.834
x1666     3.706
...         ...
x1449    -5.437
x1666    -6.237
x3397    -6.653
x2000    -7.064
x401    -14.010

[14565 rows x 1 columns]
################################################################################
XGB
Not Supported
################################################################################
LGBM
             weight
feature            
Column_401    0.236
Column_2000   0.135
Column_3397   0.110
Column_1449   0.049
Column_2934   0.034
...             ...
Column_3101   0.000
Column_3100   0.000
Column_3099   0.000
Column_3098   0.000
Column_0      0.000

[4854 rows x 1 col

In [47]:
final_pipeline_1 = Pipeline([
    ('NLP', nlp),
    ('regressor', rfc[-1])
])

final_pipeline_2 = Pipeline([
    ('NLP', nlp),
    ('regressor', lgbm[-1])
])

final_pipeline_3 = Pipeline([
    ('NLP', nlp),
    ('regressor', ctb[-1])
])

final_pipeline_4 = Pipeline([
    ('NLP', nlp),
    ('regressor', lr[-1])
])

final_pipeline_5 = Pipeline([
    ('NLP', nlp),
    ('regressor', xgb[-1])
])

In [34]:
final_pipeline_3

In [35]:
final_pipeline_3.predict_proba(pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        '@Danylo will create a great application',
        '@You bitch suck a dick fuck you shit in your ass',
        'He will break it',
    ]}))

array([[5.66561594e-02, 3.89024833e-01, 5.54319008e-01],
       [5.79087915e-02, 3.82691410e-01, 5.59399798e-01],
       [1.82251626e-02, 9.81689059e-01, 8.57787670e-05],
       [7.37499589e-02, 3.31598735e-01, 5.94651306e-01]])

In [51]:
from src.common.validation.metrics import aggregate_metrics
from src.common.prediction_model.persistence import FsModelPersistence
from src.common.prediction_model.prediction_model import ModelContainer, ModelMetadata

container = ModelContainer.create(
    pipeline_name='l1_data2_lgbm_clf',
    pipeline=final_pipeline_2,
    feature_names=[],
    metadata=ModelMetadata(
        model_name=str(final_pipeline_2.__class__.__name__),
        **aggregate_metrics(metrics)['test'].loc["LGBM"]
    )
)

In [52]:
# %%
persistor = FsModelPersistence()
name = persistor.save(container)

In [53]:
name

'l1_data2_lgbm_clf_Pipeline_2023-05-23.bin.gz'

#### Hyperparameter optimization

In [18]:
from sklearn.model_selection import GridSearchCV
import time


In [21]:

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a RandomForestClassifier object
model = RandomForestClassifier()

"""
Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best accuracy:  0.8954683485167018
Computation tmie: 49.068764209747314
"""

"\nBest parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}\nBest accuracy:  0.8954683485167018\nComputation tmie: 49.068764209747314\n"

In [21]:
# Define the parameter grid to search over
param_grid = {
    'boosting_type': ['gbdt', 'dart', 'goss'],  # Different boosting types
    'num_leaves': [20, 30, 40],  # Maximum number of leaves in one tree
    'learning_rate': [0.1, 0.01, 0.001],  # Learning rate for boosting
    'n_estimators': [100, 200, 300],  # Number of boosting iterations
    # 'subsample': [0.8, 0.9, 1.0],  # Subsample ratio of the training instances
    # 'colsample_bytree': [0.8, 0.9, 1.0],  # Subsample ratio of columns when constructing each tree
    # 'reg_alpha': [0.0, 0.1, 0.5],  # L1 regularization term on weights
    # 'reg_lambda': [0.0, 0.1, 0.5],  # L2 regularization term on weights
    # 'min_child_samples': [20, 50, 100]  # Minimum number of data needed in a child (leaf)
}

model = LGBMClassifier(device="gpu")
"""
Best parameters:  {'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 20}
Best accuracy:  0.8933660359729035
Computation tmie: 375.2135384082794
"""

'\n\n'

In [22]:
# Create a GridSearchCV object and fit the data
start = time.time()
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
end = time.time()
print(f"Computation tmie: {end - start}")

Best parameters:  {'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 20}
Best accuracy:  0.8933660359729035
Computation tmie: 375.2135384082794


In [16]:
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
}

model = XGBClassifier(tree_method='gpu_hist')


"""
Best parameters:  {'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 300}
Best accuracy:  0.8903293622985284
Computation tmie: 519.5483028888702
"""