### Imports

In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from tqdm import tqdm
import pickle
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from nltk.stem.porter import PorterStemmer
from collections import Counter


from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task
from lightautoml.addons.interpretation import LimeTextExplainer, L2XTextExplainer
from lightautoml.report import ReportDecoNLP

# Silence warnings coming from HuggingFace
import transformers
transformers.logging.set_verbosity(50)  # 50 is the numeric value of logging.CRITICAL

### Loading datasets

In [4]:
# Read the three competition CSV files (sample submission, test set, training set)
# in one pass; the tuple order matches the order of the target names.
df_sample_submission, df_test, df_train = map(
    pd.read_csv,
    (
        'datasets/sample_submission.csv',
        'datasets/test.csv',
        'datasets/train.csv',
    ),
)

In [2]:
pd.set_option('max_colwidth',None) # remove the column-width limit so the whole text of each cell is displayed

### EDA

In [5]:
# Peek at three random rows from each of the loaded frames.
for frame in (df_sample_submission, df_test, df_train):
    display(frame.sample(3))

Unnamed: 0,id,target
2016,6782,0
2000,6740,0
1224,4015,0


Unnamed: 0,id,keyword,location,text
2685,8950,storm,AUSTRALIA,Warcraft 3-Inspired Mode Likely Hitting Heroes of the Storm http://t.co/848CVWWdOt
1370,4506,emergency,Isle of Patmos,No matter the dilemma emergency nor set-back; The Righteousness of JAH shall prevail! http://t.co/n0tIy7SU1C
17,51,ablaze,NIGERIA,#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI


Unnamed: 0,id,keyword,location,text,target
6457,9238,suicide%20bombing,,Remembering Mordechai Yehuda Friedman 24 of Ramat Beit Shemesh; murdered by Hamas terrorists in the suicide bombing of Egged bus No. 361,1
1140,1644,bombing,WorldWide,#Australia #News ; #Japan marks 70th anniversary of #Hiroshima atomic bombing http://t.co/7aD0L7cgee READ MORE; http://t.co/hHzQl9tzNP,1
1443,2081,casualty,Virginia,@AvBronstein @Popehat @instapundit @KurtSchlichter Also are you aware of the casualty estimates for an invasion of Japan's home islands?,1


In [6]:
# Hold out 20% of the labelled rows for local evaluation, stratified on the target column.
train, test = train_test_split(
    df_train,
    test_size=0.2,
    stratify=df_train['target'],
    random_state=42,
)

In [None]:
# NOTE(review): disabled download of a Navec pack archive; none of the visible cells
# reference it — confirm it is unused before deleting this cell.
#!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar


**Simple AutoML Model, CPU**

In [9]:
# Binary classification task; the label lives in 'target' and the only
# feature handed to the preset is the raw 'text' column.
task = Task('binary')
roles = {'target': 'target', 'text': ['text']}

# CPU-only run (gpu_ids=None) with a one-hour time budget.
automl = TabularNLPAutoML(
    task=task,
    timeout=3600,
    gpu_ids=None,
    text_params={'lang': 'en'},
)

# Fit on the training split and collect the out-of-fold predictions.
oof_pred = automl.fit_predict(train, roles=roles, verbose=3)

# Row mask: True where at least one out-of-fold prediction is not NaN.
not_nan = (~np.isnan(oof_pred.data)).any(axis=1)

[22:05:34] Stdout logging level is INFO3.
[22:05:34] Model language mode: en
[22:05:34] Task: binary

[22:05:34] Start automl preset with listed constraints:
[22:05:34] - time: 3600.00 seconds
[22:05:34] - CPU: 4 cores
[22:05:34] - memory: 16 GB

[22:05:34] [1mTrain data shape: (6090, 5)[0m

[22:05:37] Feats was rejected during automatic roles guess: []
[22:05:37] Layer [1m1[0m train process start. Time left 3597.68 secs
[22:05:40] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[22:05:40] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[22:05:40] Linear model: C = 1e-05 score = 0.7724355261711917
[22:05:40] Linear model: C = 5e-05 score = 0.7732938901709897
[22:05:40] Linear model: C = 0.0001 score = 0.7740101835433701
[22:05:40] Linear model: C = 0.0005 score = 0.7771258369544017
[22:05:40] Linear model: C = 0.001 score = 0.778514366954075
[22:05:40] Linear model: C = 0.005 score = 0.7842001621684235
[22:05:40] Linear model: C = 0.01 sc

100%|████████████████████████████████████████████████████████████████████████████| 6090/6090 [00:04<00:00, 1372.98it/s]


[22:05:51] Feature concated__text fitted
[22:05:55] Feature concated__text transformed
[22:05:55] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[22:05:55] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
[22:05:55] Training until validation scores don't improve for 200 rounds
[22:05:57] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
[22:05:57] Training until validation scores don't improve for 200 rounds
[22:05:59] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
[22:05:59] Training until validation scores don't improve for 200 rounds
[22:06:01] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.8109565895145487[0m
[22:06:01] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[22:06:01] Time left 3573.03 secs

[22:06:01] [1mLayer 1 training completed.[0m

[22:06:01] Blending: optimization starts with equal weights and score [1m0

In [10]:
# Score the hold-out split and binarise the first prediction column at 0.5.
test_pred = automl.predict(test)
predictions = (test_pred.data[:, 0] > 0.5) * 1

# Report the classification metrics against the true hold-out labels.
y_true = test[roles['target']].values
for label, scorer in (
    ('f1', f1_score),
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
):
    print(label, scorer(y_true, predictions))

[22:06:05] Feature concated__text transformed
f1 0.747181964573269
accuracy 0.793827971109652
precision 0.7891156462585034
recall 0.709480122324159


Let's try moving the decision threshold

In [12]:


# Sweep the decision cutoff over 0.40 ... 0.59 in steps of 0.01 and keep the
# cutoff (stored as an integer percentage) with the highest F1 on the hold-out split.
y_true = test[roles['target']].values
f1_best, threshold = 0, 0
for i in tqdm(range(40, 60)):
    predictions = (test_pred.data[:, 0] > i / 100) * 1
    f1 = f1_score(y_true, predictions)
    if f1 > f1_best:
        f1_best, threshold = f1, i

print(f1_best)
print(threshold)

100%|████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1002.70it/s]

0.7559523809523808
43





Let's make a submission

In [14]:
# Predict on the Kaggle test set. This rebinds `test_pred`, which until now held
# the predictions for the local hold-out split.
test_pred = automl.predict(df_test)


[22:09:00] Feature concated__text transformed


In [18]:
# Turn the predicted probabilities for the Kaggle test set into hard 0/1 labels.
# NOTE(review): the cutoff is hard-coded to 0.45, while the threshold sweep earlier
# in the notebook printed 43 (i.e. 0.43) as the best value — confirm which is intended.
predictions = (test_pred.data[:, 0] > 0.45) * 1

# Attach the labels to df_test (kept so the frame looks the same to later cells).
# The former `df_test['target'] = np.nan` line was dropped: it was overwritten
# immediately and had no effect.
df_test['target'] = predictions

# Build the submission as an independent frame. `astype` returns a new object, so
# no value is ever assigned into a slice of df_test; that assignment is what
# raised the SettingWithCopyWarning before.
df_sample_submission = df_test[['id', 'target']].astype({'target': 'int64'})

df_sample_submission.to_csv('df_sample_sub_26-10-auto_ml_CPU.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample_submission['target']=df_sample_submission['target'].astype('int64')


In [19]:
# Preview the first three rows of the submission frame.
df_sample_submission.head(3)

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1


**Kaggle Score: 0.76034**

**Bert Pooling**

In [28]:
# Same task and column roles as the CPU run: binary target, raw 'text' feature.
task = Task('binary')
roles = {'target': 'target', 'text': ['text']}

# GPU run restricted to the linear and LightGBM algorithms, with the
# 'pooled_bert' model selected for the text features.
automl = TabularNLPAutoML(
    task=task,
    timeout=3600,
    gpu_ids='1',
    general_params={'use_algos': ['linear_l2', 'lgb']},
    text_params={'lang': 'en'},
    autonlp_params={'model_name': 'pooled_bert'},
)

# Fit on the training split and collect the out-of-fold predictions.
oof_pred = automl.fit_predict(train, roles=roles, verbose=3)

# Row mask: True where at least one out-of-fold prediction is not NaN.
not_nan = (~np.isnan(oof_pred.data)).any(axis=1)

[22:17:21] Stdout logging level is INFO3.
[22:17:21] Model language mode: en
[22:17:21] Task: binary

[22:17:21] Start automl preset with listed constraints:
[22:17:21] - time: 3600.00 seconds
[22:17:21] - CPU: 4 cores
[22:17:21] - memory: 16 GB

[22:17:21] [1mTrain data shape: (6090, 5)[0m

[22:17:21] Feats was rejected during automatic roles guess: []
[22:17:21] Layer [1m1[0m train process start. Time left 3599.89 secs
[22:17:24] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[22:17:24] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[22:17:24] Linear model: C = 1e-05 score = 0.7724355261711917
[22:17:24] Linear model: C = 5e-05 score = 0.7732938901709897
[22:17:24] Linear model: C = 0.0001 score = 0.7740101835433701
[22:17:24] Linear model: C = 0.0005 score = 0.7771258369544017
[22:17:24] Linear model: C = 0.001 score = 0.778514366954075
[22:17:24] Linear model: C = 0.005 score = 0.7842001621684235
[22:17:24] Linear model: C = 0.01 sc

100%|████████████████████████████████████████████████████████████████████████████| 6090/6090 [00:04<00:00, 1349.82it/s]


[22:17:35] Feature concated__text fitted
[22:17:40] Feature concated__text transformed
[22:17:40] Start fitting [1mLvl_1_Pipe_0_Mod_0_LightGBM[0m ...
[22:17:40] ===== Start working with [1mfold 0[0m for [1mLvl_1_Pipe_0_Mod_0_LightGBM[0m =====
[22:17:40] Training until validation scores don't improve for 200 rounds
[22:17:42] ===== Start working with [1mfold 1[0m for [1mLvl_1_Pipe_0_Mod_0_LightGBM[0m =====
[22:17:42] Training until validation scores don't improve for 200 rounds
[22:17:43] ===== Start working with [1mfold 2[0m for [1mLvl_1_Pipe_0_Mod_0_LightGBM[0m =====
[22:17:43] Training until validation scores don't improve for 200 rounds
[22:17:46] Fitting [1mLvl_1_Pipe_0_Mod_0_LightGBM[0m finished. score = [1m0.8294414546365153[0m
[22:17:46] [1mLvl_1_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
[22:17:46] Time left 3575.07 secs

[22:17:46] [1mLayer 2 training completed.[0m

[22:17:46] [1mAutoml preset training completed in 24.94 seconds[0m

[22:1

In [29]:
# Score the hold-out split and binarise the first prediction column at 0.5.
test_pred = automl.predict(test)
predictions = (test_pred.data[:, 0] > 0.5) * 1

# Report the classification metrics against the true hold-out labels.
y_true = test[roles['target']].values
for label, scorer in (
    ('f1', f1_score),
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
):
    print(label, scorer(y_true, predictions))

[22:17:50] Feature concated__text transformed
f1 0.740066225165563
accuracy 0.793827971109652
precision 0.8068592057761733
recall 0.6834862385321101


Let's try moving the threshold, since precision and recall are imbalanced

In [30]:
# Sweep the decision cutoff over 0.30 ... 0.69 in steps of 0.01 and keep the
# cutoff (stored as an integer percentage) with the highest F1 on the hold-out split.
y_true = test[roles['target']].values
f1_best, threshold = 0, 0
for i in tqdm(range(30, 70)):
    predictions = (test_pred.data[:, 0] > i / 100) * 1
    f1 = f1_score(y_true, predictions)
    if f1 > f1_best:
        f1_best, threshold = f1, i

print(f1_best, threshold)

100%|████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 1055.44it/s]

0.7610062893081759 44





**Submission**

In [31]:
# Predict on the Kaggle test set and turn the probabilities into hard 0/1 labels.
# NOTE(review): the cutoff is hard-coded to 0.46, while the threshold sweep earlier
# in the notebook printed 44 (i.e. 0.44) as the best value — confirm which is intended.
test_pred = automl.predict(df_test)
predictions = (test_pred.data[:, 0] > 0.46) * 1

# Attach the labels to df_test (kept so the frame looks the same to later cells).
# The former `df_test['target'] = np.nan` line was dropped: it was overwritten
# immediately and had no effect.
df_test['target'] = predictions

# Build the submission as an independent frame. `astype` returns a new object, so
# no value is ever assigned into a slice of df_test; that assignment is what
# raised the SettingWithCopyWarning before.
df_sample_submission = df_test[['id', 'target']].astype({'target': 'int64'})

df_sample_submission.to_csv('df_sample_sub_26-10-auto_ml_GPU_Bert_Pooling.csv', index=False)

[22:19:02] Feature concated__text transformed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample_submission['target']=df_sample_submission['target'].astype('int64')


In [32]:
# Show the submission frame (the rendered output elides the middle rows).
df_sample_submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,0
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


**Kaggle Score: 0.76708**