In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

from notebooks.Dzim.data_mining.validation.training import estimate_multiple_models, estimate_model

In [2]:
# Draw every figure on a white background.
plt.rcParams.update({'figure.facecolor': 'white'})

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up.
%load_ext autoreload
%autoreload 2

#### Loading data

In [4]:
# Read the raw dataset; the path is resolved relative to the working directory.
raw_csv_path = 'data4.csv'
data = pd.read_csv(raw_csv_path)

In [31]:
# Keep only the rows labelled 'hate' or 'noHate'. The explicit .copy() makes the
# filtered frame independent of the frame it was sliced from, so the column
# assignment below no longer triggers pandas' SettingWithCopyWarning.
data = data.loc[data['label'].isin(['hate', 'noHate'])].copy()
# Encode the target as an integer: 1 for 'hate', 0 for 'noHate'.
data['label'] = (data['label'] == 'hate').astype(int)
# Positional rename — assumes the frame has exactly two columns (text, label);
# TODO confirm against the CSV that is actually loaded.
data.columns = ['tweet', 'label']
# NOTE(review): this cell rebinds `data`. Running it a second time, once the
# labels are already integers, filters out every row — reload the CSV first.

In [5]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,text,label,label_text
0,@EvanBecker513 @_TriggaPlease_ @MorbidMermaid ...,1,offensive language
1,"""@SnakecharmsDump: http://t.co/4b427dQZCV"" oh ...",1,offensive language
2,Dat bitch mouth made da honor roll,1,offensive language
3,&#8220;@Al_Teez: Fuck the cowboys. My birthday...,1,offensive language
4,WashPost Describes Charlie Crist's Bizarre Obs...,2,neither


In [6]:
# Class balance: number of rows per label value.
data['label'].value_counts()

1    17658
2     3805
0     1320
Name: label, dtype: int64

In [34]:
# Tiny hand-written sample: each record pairs a sentence with its class id.
_sample_records = [
    ('Sasha goes to Mannheim University', 1),
    ('Danylo will create a great application', 1),
    ('Danylo will not create a great application', 0),
    ('Dasha didnt fulfill her task', 0),
]
data_sample = pd.DataFrame(_sample_records, columns=['tweet', 'class'])

#### Preprocessing

In [35]:
from sklearn.pipeline import Pipeline
from notebooks.Dzim.web_mining.bert_processor import TextCleaner, SeriesConverter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from notebooks.Dzim.web_mining.feature_generation import FeatureGenerator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dzmit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [38]:
# Name of the column used as the prediction target in the split below.
target = 'label'

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# Features stay a one-column DataFrame ('tweet'); the target is the `target` column.
data_train, data_test, y_train, y_test = train_test_split(
    data[['tweet']],
    data[target],
    test_size=0.2,
    random_state=42,
)

In [40]:
# Shapes of the train and test feature frames, in that order.
tuple(frame.shape for frame in (data_train, data_test))

((8562, 1), (2141, 1))

In [14]:
# Text-processing pipeline, applied in order: cleaning -> series conversion -> TF-IDF.
# TextCleaner and SeriesConverter are project transformers (behaviour not shown here).
# The vectoriser drops terms found in more than 70% of documents or in fewer than 3.
nlp_steps = [
    ("cleaning", TextCleaner()),
    ("series_converter", SeriesConverter()),
    ("vectorization", TfidfVectorizer(max_df=0.7, min_df=3)),
]
nlp = Pipeline(steps=nlp_steps)

In [15]:
# Fit the text pipeline on the training tweets only and vectorise them in one pass.
# The `.toarray()` densification is left commented out, so X_train keeps whatever
# matrix type the pipeline returns.
X_train = nlp.fit_transform(data_train)  #.toarray()

In [16]:
# Vectorise the test tweets with the pipeline already fitted on the training data
# (transform only — nothing is re-fitted on the test set).
X_test = nlp.transform(data_test)

#### Models

In [19]:
# Baseline classifiers. Despite the variable names, these are not mean/median
# predictors: the first uses the 'most_frequent' strategy, the second 'stratified'.
dummy_mean = make_pipeline(DummyClassifier(strategy='most_frequent'))
dummy_median = make_pipeline(DummyClassifier(strategy='stratified'))

In [20]:
# Logistic regression with library defaults, wrapped in a single-step pipeline.
lr = make_pipeline(LogisticRegression())

In [21]:
# Tree-based classifiers, each with library defaults in a single-step pipeline.
# No random seed is set on any of them.
ctb = make_pipeline(CatBoostClassifier())
xgb = make_pipeline(XGBClassifier())
lgbm = make_pipeline(LGBMClassifier())
rfc = make_pipeline(RandomForestClassifier())

In [22]:
# Alias, not a copy: `model` and `lr` refer to the same pipeline object.
model = lr

In [23]:
# Fit the logistic-regression pipeline on the vectorised training data.
lr.fit(X_train, y_train)

In [24]:
# NOTE(review): while `model` is still bound to `lr`, this fits the very same
# pipeline a second time on the same data — likely redundant with `lr.fit(...)`.
model.fit(X_train, y_train)

In [25]:
# Predicted labels for the held-out test set.
prediction = model.predict(X_test)

In [26]:
# Predicted labels for the training set (the data the model was fitted on).
train_prediction = model.predict(X_train)

In [29]:
# (display name, pipeline) pairs, matched position by position.
# NOTE(review): "LinReg" labels the logistic-regression pipeline `lr`.
_pipeline_labels = ["Dummy_mean", "Dummy_median", "LinReg", "XGB", "LGBM", "RFC", "CatBoost"]
_pipeline_objects = [dummy_mean, dummy_median, lr, xgb, lgbm, rfc, ctb]
pipelines = list(zip(_pipeline_labels, _pipeline_objects))

#### Model training and evaluation

In [None]:
from src.common.validation.training import estimate_multiple_models, estimate_model

#### training
# Split the (name, pipeline) pairs into two parallel tuples.
model_names, model_pipelines = zip(*pipelines)


def _estimate_on_fixed_split(pipeline, _):
    """Evaluate one pipeline on the notebook's pre-computed train/test split.

    The second positional argument supplied by `estimate_multiple_models` is
    ignored (its meaning is not visible in this notebook).
    """
    fixed_split = (X_train, X_test, y_train, y_test)
    return estimate_model(pipeline, fixed_split)


# Returns two objects, bound here as `metrics` and `weights`.
metrics, weights = estimate_multiple_models(
    model_pipelines,
    _estimate_on_fixed_split,
    model_names
)

In [22]:
# Display the metrics table returned by estimate_multiple_models.
metrics

set,train,train,train,train,test,test,test,test
metric,f1_score,cohen_kappa_score,precision_score,recall_score,f1_score,cohen_kappa_score,precision_score,recall_score
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Dummy_mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dummy_median,0.072,0.001,0.071,0.072,0.066,-0.004,0.068,0.064
LGBM,0.662,0.644,0.941,0.51,0.56,0.536,0.775,0.439
RFC,0.997,0.996,0.999,0.994,0.647,0.628,0.889,0.509
CatBoost,0.68,0.663,0.958,0.527,0.589,0.568,0.861,0.447


In [322]:
# Print the feature weights of each model under a banner; models with no entry
# in `weights` (lookup raises KeyError) are reported as 'Not Supported'.
_banner = "#" * 80
for name, _ in pipelines:
    print(_banner)
    print(name)
    try:
        _model_weights = weights.loc[name]
    except KeyError:
        print('Not Supported')
    else:
        print(_model_weights)

################################################################################
Dummy_mean
Not Supported
################################################################################
Dummy_median
Not Supported
################################################################################
LinReg
         weight
feature        
x155      6.861
x6788     6.015
x4997     5.476
x4998     5.027
x6853     4.392
...         ...
x4445    -2.665
x627     -3.027
x2765    -3.027
x3679    -3.296
x1523    -4.132

[6993 rows x 1 columns]
################################################################################
XGB
Not Supported
################################################################################
LGBM
             weight
feature            
Column_6788   0.056
Column_155    0.053
Column_6382   0.051
Column_4998   0.039
Column_4997   0.030
...             ...
Column_4579   0.000
Column_4580   0.000
Column_4581   0.000
Column_4582   0.000
Column_0      0.000

[6992 rows x 1 colu

In [49]:
def _with_nlp(estimator):
    """Chain the shared `nlp` text pipeline with `estimator` as the final step.

    Both objects are reused as-is (not cloned), so every pipeline built here
    shares the same `nlp` instance and the given estimator instance.
    """
    return Pipeline([
        ('NLP', nlp),
        ('regressor', estimator),
    ])


# End-to-end pipelines (raw tweets in, predictions out), one per model:
# random forest, LightGBM, CatBoost and logistic regression respectively.
final_pipeline_1 = _with_nlp(rfc[-1])
final_pipeline_2 = _with_nlp(lgbm[-1])
final_pipeline_3 = _with_nlp(ctb[-1])
final_pipeline_4 = _with_nlp(lr[-1])

In [50]:
# Display the logistic-regression end-to-end pipeline.
final_pipeline_4

In [51]:
# Manual smoke test: class probabilities for a few hand-written tweets.
_probe_tweets = pd.DataFrame({
    'tweet': [
        'Sasha goes to Mannheim University',
        '@Danylo will create a great application',
        '@You bitch suck a dick fuck you shit in your ass',
        'He will break it',
    ],
})
final_pipeline_4.predict_proba(_probe_tweets)

array([[0.94499346, 0.05500654],
       [0.92117189, 0.07882811],
       [0.77374667, 0.22625333],
       [0.97815614, 0.02184386]])

In [52]:
from src.common.validation.metrics import aggregate_metrics
from src.common.prediction_model.persistence import FsModelPersistence
from src.common.prediction_model.prediction_model import ModelContainer, ModelMetadata

# Bundle the end-to-end logistic-regression pipeline with its metadata so it can
# be persisted.
# NOTE(review): the metrics are read from the "LGBM" row although the packaged
# pipeline wraps `lr` — confirm which row is intended.
container = ModelContainer.create(
    pipeline_name='l1_data2_lr',
    pipeline=final_pipeline_4,
    feature_names=[],
    metadata=ModelMetadata(
        model_name=str(final_pipeline_4.__class__.__name__),
    ),  # comma restored: without it `**` parsed as the power operator on the metadata object
    # Unpack the aggregated test metrics as keyword arguments. Assumes they belong
    # to ModelContainer.create, as the indentation suggests — TODO confirm they
    # were not meant for ModelMetadata instead.
    **aggregate_metrics(metrics)['test'].loc["LGBM"]
)

In [53]:
# Persist the model container with the project's FsModelPersistence helper and
# keep the value that save() returns (presumably the stored file name — verify).
persistor = FsModelPersistence()
name = persistor.save(container)

In [54]:
# Display the value returned by persistor.save().
name

'l1_data2_lr_Pipeline_2023-05-16.bin.gz'

#### Hyperparameter optimization

In [343]:
from sklearn.model_selection import GridSearchCV
import time

# Random-forest search space: 4 * 4 * 3 * 3 = 144 parameter combinations.
# This cell binds `param_grid` and `model`, which the grid-search cell reads.
param_grid = {
    'n_estimators': [20, 50, 100, 200],
    'max_depth': [None, 2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Estimator to tune (library defaults, no random seed).
model = RandomForestClassifier()

# The bare string below records the output of an earlier search run. It is an
# expression, not a comment, so the notebook echoes it as this cell's output.
"""
Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy:  0.9592866361609763
Computation tmie: 245.936190366745
"""

In [347]:
# CatBoost search space: 3 values for each of 5 parameters = 243 combinations.
# This cell rebinds `param_grid` and `model`, which the grid-search cell reads.
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'random_strength': [0.1, 0.5, 1],
}
# Estimator to tune (library defaults).
model = CatBoostClassifier()
# Empty placeholder string (no results recorded); it is echoed as the cell output.
"""

"""

'\n\n'

In [349]:
# XGBoost search space: 3 values for each of 4 parameters = 81 combinations.
# This cell rebinds `param_grid` and `model`, which the grid-search cell reads.
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
}

# NOTE(review): 'gpu_hist' presumably requires a GPU-enabled XGBoost build, and
# newer XGBoost releases reportedly replace it with tree_method='hist' plus
# device='cuda' — verify against the installed version.
model = XGBClassifier(tree_method='gpu_hist')

In [None]:
# Exhaustive search over `param_grid` for the current `model`, scored with 3-fold
# cross-validation and run on all available cores (n_jobs=-1). `model` and
# `param_grid` come from whichever parameter-grid cell was executed last.
start = time.perf_counter()  # monotonic clock, intended for measuring intervals
grid_search = GridSearchCV(model, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Stop the clock right after fitting so the printing below is not counted.
elapsed = time.perf_counter() - start

# Report the best hyperparameters and their mean cross-validated score
# (the estimator's default scorer, since no `scoring` argument is given).
print("Best parameters: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)
print(f"Computation time: {elapsed}")