In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `scripts` package used below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from scripts.VectorizationStrategy import TfidfPreprocessingStrategy
from scripts.FeatureNumberReducingStrategy import TruncatedSVDStrategy
from scripts.ClassificationContext import ClassificationContext
from scripts.ClassificationStrategy import GaussianNBStrategy, LogisticRegressionStrategy, \
    SVCStrategy, RandomForestStrategy, LightGBMStrategy


# Load the sarcasm-detection dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='../data/sarcasm_detection.csv', index_col=0)
df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
26704,0,american politics in moral free-fall,https://www.huffingtonpost.com/entry/american-...
26705,0,america's best 20 hikes,https://www.huffingtonpost.com/entry/americas-...
26706,0,reparations and obama,https://www.huffingtonpost.com/entry/reparatio...
26707,0,israeli ban targeting boycott supporters raise...,https://www.huffingtonpost.com/entry/israeli-b...


In [7]:
# Target is the `is_sarcastic` label column; features are every remaining column.
# For a quick smoke run, subsample `df` first (e.g. `df.iloc[:1000, :]`).
# NOTE(review): X still contains `article_link`; in the preview above its domain
# appears to identify the label (theonion vs. huffingtonpost) — confirm that the
# downstream pipeline only consumes the headline column.
y = df['is_sarcastic']
X = df.drop(columns='is_sarcastic')

In [51]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Accumulator mapping "<Strategy>_SVD_<n_components>" -> ROC AUC on the test set.
model_results = dict()

In [11]:
# Evaluate every classifier strategy twice: once on the vectorizer output as-is
# (svd_n_components=None) and once with a 3000-component TruncatedSVD step.
# Each variant's test-set ROC AUC is stored in `model_results` and the fitted
# model is serialized under the variant's name.
fc_strategies = [
    GaussianNBStrategy,
    LogisticRegressionStrategy,
    SVCStrategy,
    RandomForestStrategy,
    LightGBMStrategy,
]
for fc_st in fc_strategies:
    for svd_n_components in (None, 3000):
        name = f"{fc_st.__name__}_SVD_{svd_n_components}"
        print(name)
        # A falsy component count means "skip dimensionality reduction".
        svd = TruncatedSVDStrategy(svd_n_components) if svd_n_components else None
        context = ClassificationContext(TfidfPreprocessingStrategy(), fc_st(), svd)

        # Hold-out evaluation. `context.cross_validate(X_train, y_train)` is the
        # cross-validated alternative that was used here previously.
        # NOTE(review): presumably execute_strategy fits on the training data and
        # returns predictions for X_test — confirm whether these are hard labels
        # or scores, since roc_auc_score is most informative with scores.
        y_pred = context.execute_strategy(X_train, y_train, X_test)
        roc_auc = roc_auc_score(y_test, y_pred)
        model_results[name] = roc_auc
        print(f"roc_auc_score on test set: {roc_auc}\n")

        context.serialize_model(name + '.pkl')

GaussianNBStrategy_SVD_None
roc_auc_score on test set: 0.7665627982928773

LogisticRegressionStrategy_SVD_None
roc_auc_score on test set: 0.8223783096539737

SVCStrategy_SVD_None
roc_auc_score on test set: 0.829560248915956

RandomForestStrategy_SVD_None
roc_auc_score on test set: 0.9428091530388729

LightGBMStrategy_SVD_None
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.241531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 67025
[LightGBM] [Info] Number of data points in the train set: 44262, number of used features: 2946
[LightGBM] [Info] Start training from score 0.457774
roc_auc_score on test set: 0.8615065778920767

GaussianNBStrategy_SVD_3000
roc_auc_score on test set: 0.565046747342153

LogisticRegressionStrategy_SVD_3000
roc_auc_score on test set: 0.8040096150559181

SVCStrategy_SVD_3000
roc_auc_score on test set: 0.807238179

In [16]:
# Display the collected {variant name: test-set ROC AUC} mapping.
model_results

{'GaussianNBStrategy_SVD_None': 0.7665627982928773,
 'LogisticRegressionStrategy_SVD_None': 0.8223783096539737,
 'SVCStrategy_SVD_None': 0.829560248915956,
 'RandomForestStrategy_SVD_None': 0.9428091530388729,
 'LightGBMStrategy_SVD_None': 0.8615065778920767,
 'GaussianNBStrategy_SVD_3000': 0.565046747342153,
 'LogisticRegressionStrategy_SVD_3000': 0.8040096150559181,
 'SVCStrategy_SVD_3000': 0.8072381793846258,
 'RandomForestStrategy_SVD_3000': 0.916143216080402,
 'LightGBMStrategy_SVD_3000': 0.890468630436326}

    {'GaussianNBStrategy_SVD_None': 0.7665627982928773,
     'LogisticRegressionStrategy_SVD_None': 0.8223783096539737,
     'SVCStrategy_SVD_None': 0.829560248915956,
     'RandomForestStrategy_SVD_None': 0.9428091530388729,
     'LightGBMStrategy_SVD_None': 0.8615065778920767,
     'GaussianNBStrategy_SVD_3000': 0.565046747342153,
     'LogisticRegressionStrategy_SVD_3000': 0.8040096150559181,
     'SVCStrategy_SVD_3000': 0.8072381793846258,
     'RandomForestStrategy_SVD_3000': 0.916143216080402,
     'LightGBMStrategy_SVD_3000': 0.890468630436326}

In [29]:
# Turn the score mapping into a two-column table ranked from best to worst score.
res = pd.DataFrame(
    list(model_results.items()), columns=['model', 'score']
).sort_values(by='score', ascending=False)
res

Unnamed: 0,model,score
3,RandomForestStrategy_SVD_None,0.942809
8,RandomForestStrategy_SVD_3000,0.916143
9,LightGBMStrategy_SVD_3000,0.890469
4,LightGBMStrategy_SVD_None,0.861507
2,SVCStrategy_SVD_None,0.82956
1,LogisticRegressionStrategy_SVD_None,0.822378
7,SVCStrategy_SVD_3000,0.807238
6,LogisticRegressionStrategy_SVD_3000,0.80401
0,GaussianNBStrategy_SVD_None,0.766563
5,GaussianNBStrategy_SVD_3000,0.565047


**Найкращий результат був отриманий за допомогою RandomForest без застосування TruncatedSVD.**

In [52]:
import pickle
from scripts.preprocessing import base_text_preprocessing


# Reload the best-scoring serialized model and predict on the held-out test set.
best_model_str = 'RandomForestStrategy_SVD_None'

# Use a context manager so the file handle is closed deterministically.
# NOTE: pickle.load can execute arbitrary code — only load model files that
# were produced by this project.
with open(f'../models/{best_model_str}.pkl', 'rb') as model_file:
    best_model = pickle.load(model_file)

# Write the transformed data to new names instead of overwriting X_train/X_test:
# the raw split stays intact, so this cell can be re-run without first
# re-running the train/test split cell.
# Column 0 of the feature frame is the text column (the headline, per the
# dataset preview above).
X_train_text = base_text_preprocessing(X_train.iloc[:, 0])
X_test_text = base_text_preprocessing(X_test.iloc[:, 0])

# The vectorizer strategy takes the train and test texts together and returns
# the transformed pair; only the test part is needed for prediction.
X_train_tfidf, X_test_tfidf = TfidfPreprocessingStrategy().execute(X_train_text, X_test_text)

predictions = best_model.predict(X_test_tfidf)
print(predictions)

[0 0 0 ... 0 1 1]


In [55]:
# Put the true labels ('fact') next to the model output ('predict'), keeping the
# original row index, and write the comparison table to CSV.
df_predictions = y_test.to_frame(name='fact').assign(predict=predictions)
df_predictions.to_csv('../data/predictions.csv', index=True)

### using cross-validation

In [27]:
# Cross-validate the selected strategy/SVD combinations on the full dataset
# (currently only logistic regression without SVD) and report the mean ROC AUC.
for fc_st in (LogisticRegressionStrategy,):
    for svd_n_components in (None,):
        name = f"{fc_st.__name__}_SVD_{svd_n_components}"
        print(name)
        # A falsy component count means "skip dimensionality reduction".
        svd = TruncatedSVDStrategy(svd_n_components) if svd_n_components else None
        context = ClassificationContext(TfidfPreprocessingStrategy(), fc_st(), svd)

        roc_auc = context.cross_validate(X, y)
        print(f"Mean roc_auc: {roc_auc}\n")

LogisticRegressionStrategy_SVD_None
[    0     1     2 ... 55325 55326 55327]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[    1     2     3 ... 55325 55326 55327]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[    0     1     2 ... 55323 55325 55326]
[    0     1     3 ... 55324 55326 55327]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[    0     2     3 ... 55324 55325 55327]
Mean roc_auc: 0.8147256070383996

