In [18]:
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
plt.rcParams['figure.facecolor'] = 'white'

In [3]:
%load_ext autoreload
%autoreload 2

#### Loading data

In [4]:
data = pd.read_csv('data2.csv', index_col=0)

In [6]:
data['hate_speech'].value_counts()

0    19790
1     3419
2     1251
3      287
4       21
5        7
6        5
7        3
Name: hate_speech, dtype: int64

In [7]:
data['offensive_language'].value_counts()

3    13383
2     4246
0     3475
1     2066
6      857
5      369
4      251
9       66
8       37
7       33
Name: offensive_language, dtype: int64

In [8]:
data.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [9]:
data_sample = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        'Danylo will create a great application',
        'Danylo will not create a great application',
        'Dasha didnt fulfill her task'
    ],
    'class': [1, 1, 0, 0]
})

#### Preprocessing

In [10]:
from sklearn.pipeline import Pipeline
from notebooks.Dzim.web_mining.bert_processor import TextCleaner, SeriesConverter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from notebooks.Dzim.web_mining.feature_generation import FeatureGenerator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [41]:
target = 'offensive_language'

In [42]:
# Split the data into train and test sets
data_train, data_test, y_train, y_test = train_test_split(data[['tweet']], data[target], test_size=0.2, random_state=42)

In [43]:
data_train.shape, data_test.shape

((19826, 1), (4957, 1))

In [14]:
# Define the pipeline
nlp = Pipeline([
    ("cleaning", TextCleaner()),
    ("series_converter", SeriesConverter()),
    ("vectorization", TfidfVectorizer(max_df=0.7, min_df=3)),
])

In [15]:
# Fit-transform the training data
X_train = nlp.fit_transform(data_train)  #.toarray()

In [16]:
# Transform the test data
X_test = nlp.transform(data_test)

#### Models

In [44]:
dummy_mean = make_pipeline(
    DummyRegressor(strategy='mean'),
)
dummy_median = make_pipeline(
    DummyRegressor(strategy='median'),
)

In [45]:
lr = make_pipeline(
    LinearRegression(),
)

In [46]:
ctb = make_pipeline(
    CatBoostRegressor()
)
xgb = make_pipeline(
    XGBRegressor()
)
lgbm = make_pipeline(
    LGBMRegressor()
)
rfc = make_pipeline(
    RandomForestRegressor(),
)

In [47]:
model = lr

In [48]:
lr.fit(X_train, y_train)

In [24]:
model.fit(X_train, y_train)

In [25]:
prediction = model.predict(X_test)

In [26]:
train_prediction = model.predict(X_train)

In [50]:
pipelines = [
    ("Dummy_mean", dummy_mean),
    ("Dummy_median", dummy_median),
    ("LinReg", lr),
    ("XGB", xgb),
    ("LGBM", lgbm),
    ("RFC", rfc),
    ("CatBoost", ctb),
]

#### Models

In [51]:
from src.common.validation.training import estimate_multiple_models, estimate_model

#### training
model_names, model_pipelines = zip(*pipelines)
metrics, weights = estimate_multiple_models(
    model_pipelines,
    lambda pipeline, _: estimate_model(
        pipeline,
        (X_train, X_test, y_train, y_test),
    ),
    model_names
)

Learning rate set to 0.065636
0:	learn: 1.3693677	total: 46.2ms	remaining: 46.2s
1:	learn: 1.3418065	total: 85.1ms	remaining: 42.5s
2:	learn: 1.3168720	total: 125ms	remaining: 41.4s
3:	learn: 1.2921843	total: 164ms	remaining: 40.8s
4:	learn: 1.2686201	total: 203ms	remaining: 40.5s
5:	learn: 1.2487987	total: 243ms	remaining: 40.3s
6:	learn: 1.2311247	total: 287ms	remaining: 40.8s
7:	learn: 1.2158588	total: 335ms	remaining: 41.5s
8:	learn: 1.2011711	total: 376ms	remaining: 41.4s
9:	learn: 1.1892899	total: 418ms	remaining: 41.4s
10:	learn: 1.1782585	total: 458ms	remaining: 41.1s
11:	learn: 1.1667145	total: 505ms	remaining: 41.6s
12:	learn: 1.1574264	total: 549ms	remaining: 41.7s
13:	learn: 1.1491575	total: 592ms	remaining: 41.7s
14:	learn: 1.1423765	total: 634ms	remaining: 41.6s
15:	learn: 1.1355302	total: 673ms	remaining: 41.4s
16:	learn: 1.1290997	total: 717ms	remaining: 41.4s
17:	learn: 1.1243147	total: 760ms	remaining: 41.5s
18:	learn: 1.1195798	total: 804ms	remaining: 41.5s
19:	learn

In [52]:
metrics

set,train,train,train,train,train,train,train,train,test,test,test,test,test,test,test,test
metric,mape,rmspe,mse,rmse,mae,r2,adjusted_r2,rmsle,mape,rmspe,mse,rmse,mae,r2,adjusted_r2,rmsle
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Dummy_mean,1529570000000000.0,384.155,1.97,1.403,1.056,0.0,-0.244,0.531,1512947000000000.0,379.609,1.913,1.383,1.046,-0.0,-0.244,0.528
Dummy_median,1898572000000000.0,364.241,2.31,1.52,0.932,-0.173,-0.458,0.585,1877938000000000.0,360.342,2.272,1.507,0.929,-0.188,-0.477,0.583
LinReg,488376600000000.0,349.207,0.829,0.91,0.638,0.579,0.477,0.309,752350600000000.0,371.071,1.517,1.232,0.869,0.207,0.014,0.416
XGB,493021800000000.0,339.85,0.743,0.862,0.538,0.623,0.531,0.3,554687800000000.0,349.754,1.034,1.017,0.633,0.46,0.328,0.342
LGBM,439334300000000.0,346.834,0.859,0.927,0.57,0.564,0.458,0.301,469970500000000.0,349.338,0.973,0.987,0.621,0.491,0.367,0.325
RFC,118617200000000.0,309.075,0.184,0.429,0.251,0.907,0.884,0.133,318488900000000.0,354.781,1.072,1.036,0.646,0.439,0.303,0.324
CatBoost,536236300000000.0,345.061,0.842,0.918,0.585,0.573,0.468,0.32,568895500000000.0,350.26,1.033,1.016,0.639,0.46,0.328,0.343


In [322]:
for name, _ in pipelines:
    try:
        print("#" * 80)
        print(name)
        print(weights.loc[name])
    except KeyError:
        print('Not Supported')

################################################################################
Dummy_mean
Not Supported
################################################################################
Dummy_median
Not Supported
################################################################################
LinReg
         weight
feature        
x155      6.861
x6788     6.015
x4997     5.476
x4998     5.027
x6853     4.392
...         ...
x4445    -2.665
x627     -3.027
x2765    -3.027
x3679    -3.296
x1523    -4.132

[6993 rows x 1 columns]
################################################################################
XGB
Not Supported
################################################################################
LGBM
             weight
feature            
Column_6788   0.056
Column_155    0.053
Column_6382   0.051
Column_4998   0.039
Column_4997   0.030
...             ...
Column_4579   0.000
Column_4580   0.000
Column_4581   0.000
Column_4582   0.000
Column_0      0.000

[6992 rows x 1 colu

In [53]:
final_pipeline_1 = Pipeline([
    ('NLP', nlp),
    ('regressor', rfc[-1])
])

final_pipeline_2 = Pipeline([
    ('NLP', nlp),
    ('regressor', lgbm[-1])
])

final_pipeline_3 = Pipeline([
    ('NLP', nlp),
    ('regressor', ctb[-1])
])

final_pipeline_4 = Pipeline([
    ('NLP', nlp),
    ('regressor', lr[-1])
])

In [54]:
final_pipeline_2

In [55]:
final_pipeline_2.predict(pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        '@Danylo will create a great application',
        '@You bitch suck a dick fuck you shit in your ass',
        'He will break it',
    ]}))

array([0.92845382, 0.93764724, 3.03106446, 1.12615349])

In [56]:
from src.common.validation.metrics import aggregate_metrics
from src.common.prediction_model.persistence import FsModelPersistence
from src.common.prediction_model.prediction_model import ModelContainer, ModelMetadata

container = ModelContainer.create(
    pipeline_name='l1_data2_lgbm_reg_offensive_language',
    pipeline=final_pipeline_2,
    feature_names=[],
    metadata=ModelMetadata(
        model_name=str(final_pipeline_2.__class__.__name__),
        **aggregate_metrics(metrics)['test'].loc["LGBM"]
    )
)

In [57]:
# %%
persistor = FsModelPersistence()
name = persistor.save(container)

In [58]:
name

'l1_data2_lgbm_reg_offensive_language_Pipeline_2023-05-17.bin.gz'

#### Hyperparameter optimization

In [343]:
from sklearn.model_selection import GridSearchCV
import time

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a RandomForestClassifier object
model = RandomForestClassifier()

"""
Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy:  0.9592866361609763
Computation tmie: 245.936190366745
"""

In [347]:
# Define the parameter grid to search over
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'random_strength': [0.1, 0.5, 1],
}
model = CatBoostClassifier()
"""

"""

'\n\n'

In [349]:
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
}

model = XGBClassifier(tree_method='gpu_hist')

In [None]:
# Create a GridSearchCV object and fit the data
start = time.time()
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and accuracy score
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
end = time.time()
print(f"Computation tmie: {end - start}")