In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack

import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier  
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier  
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re

import tensorflow_hub as hub
import tensorflow as tf

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import pandas as pd
import numpy as np

import tokenization
#The file "tokenization" is forked from:
#https://github.com/google-research/bert/blob/master/tokenization.py.

#https://www.kaggle.com/wrrosa/keras-bert-using-tfhub-modified-train-data/data?select=tokenization.py

## Helper Functions  
Source for bert_encode function: https://www.kaggle.com/user123454321/bert-starter-inference

In [29]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [38]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    print(sequence_output)
    clf_output = sequence_output[:, 0, :]
    print(clf_output.shape)
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [31]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=False)

CPU times: user 8.39 s, sys: 1.05 s, total: 9.45 s
Wall time: 9.35 s


In [32]:
#train = pd.read_csv("nlp-getting-started/train.csv")
#test = pd.read_csv("nlp-getting-started/test.csv")

train = df_train
#test = X_train.text
max_len = 90

In [33]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [34]:
train_input = bert_encode(train.text.values, tokenizer, max_len=max_len)
#test_input = bert_encode(test.text.values, tokenizer, max_len=max_len)
train_labels = train.target.values

In [35]:
print(len(train_input), train_input[0].shape)

3 (6989, 90)


In [None]:
train_input = train_input[0]
train_input.shape

In [None]:
clf_lr = LogisticRegression()
clf_lr.fit(train_input, train_labels)

predictions = clf_lr.predict(train_input)
print(metrics.confusion_matrix(train_labels, predictions))
# Print a classification report
print(metrics.classification_report(train_labels, predictions))
# Print the overall accuracy
print(metrics.accuracy_score(train_labels, predictions))

In [39]:
model = build_model(bert_layer, max_len=max_len)
model.summary()

Tensor("keras_layer/StatefulPartitionedCall_1:1", shape=(None, None, 1024), dtype=float32)
(None, 1024)
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 90)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 90)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 90)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0]

In [40]:
checkpoint = ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True)

train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=4,
    callbacks=[checkpoint],
    batch_size=80,
    verbose=1
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


## Predict test dataset to submit

In [45]:
test = df_test

In [47]:
test_input = bert_encode(test.text.values, tokenizer, max_len=max_len)
test_pred = model.predict(test_input)
submission = train.truncate(after = -1)
submission['id'] = test['id']
submission['text'] = test['text']
submission['target'] = test_pred.round().astype(int)

In [55]:
#submission.drop(['text'], axis=1, inplace=True)
submission = submission[['id','target']]

In [56]:
submission.to_csv("nlp-getting-started/answer.csv", index=False)

# End of BERT 
_______________

In [3]:
df_train = pd.read_csv("nlp-getting-started/train.csv")
df_test = pd.read_csv("nlp-getting-started/test.csv")

In [4]:
display(df_train.head())
print(df_train.shape, df_test.shape)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


(7613, 5) (3263, 4)


In [5]:
df_train.loc[0,'text']

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [6]:
df_train.drop(["keyword","location"], axis=1, inplace=True)
df_test.drop(["keyword","location"], axis=1, inplace=True)

In [7]:
display(df_train.head(2))

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1


## Check for missing values:

In [8]:
df_train.isnull().sum()

id        0
text      0
target    0
dtype: int64

In [9]:
display(df_train.target.value_counts())

0    4342
1    3271
Name: target, dtype: int64

<font color=green>Не сильная разбалансированность. Оставим все как есть</font>

In [10]:
next(df_train.itertuples())

Pandas(Index=0, id=1, text='Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', target=1)

## Detect & remove empty strings

In [11]:
blanks = []

for index, i, text, target in df_train.itertuples():  # iterate over the DataFrame
    if type(text)==str:            # avoid NaN values
        if text.isspace():         # test 'review' for whitespace
            blanks.append(i) 
        
print(len(blanks), 'blanks: ', blanks)

df_train.drop(blanks, inplace=True)

0 blanks:  []


# EDA

### 1. Избавимся от ссылок

In [12]:
line = df_train["text"].head(-5).values[-1]
print(line)

pattern = r'http://[/.\w]+'#cuz maybe https://... or http://  or just http
print(re.findall(pattern, line))

re.sub(pattern,'',line)

#stormchase Violent Record Breaking EF-5 El Reno Oklahoma Tornado Nearly Runs Over ... - http://t.co/3SICroAaNz http://t.co/I27Oa0HISp
['http://t.co/3SICroAaNz', 'http://t.co/I27Oa0HISp']


'#stormchase Violent Record Breaking EF-5 El Reno Oklahoma Tornado Nearly Runs Over ... -  '

In [13]:
def get_rid_of_link(text):
    raw_s = r'{}'.format(text)
    pattern = r'http[:/.\w]+'
    raw_s = re.sub(pattern,'',raw_s)
    return(raw_s)

df_train["text"] = df_train["text"].apply(get_rid_of_link)
df_test["text"] = df_test["text"].apply(get_rid_of_link)
df_train.to_csv("nlp-getting-started/train_without_link.csv", index=False)
df_test.to_csv("nlp-getting-started/test_without_link.csv", index=False)

### Создание features

### 2. Find time (am/pm/UTC/..)

In [14]:
pattern = r"[\d]+:[\d]+:[\d]+"
pattern_2 = r"[\d]+:[\d]+"
pattern_3 = r"(am|pm|UTC)"

pattern = r"[\d]+:[\d]+:[\d]+|[\d]+:[\d]+|am|pm|UTC"

line = r"Earthquake : M 3.4 - 96km N of Brenas Puerto Rico: Time2015-08-05 10:34:24 UTC2015-08-05 06:34:24 -4:00 atÛ_"
print(re.findall(pattern, line))
print(re.sub(pattern, "",line))
print(line)

line = r"Meow, Sparta"
print(re.findall(pattern, line))

['10:34:24', 'UTC', '06:34:24', '4:00']
Earthquake : M 3.4 - 96km N of Brenas Puerto Rico: Time2015-08-05  2015-08-05  - atÛ_
Earthquake : M 3.4 - 96km N of Brenas Puerto Rico: Time2015-08-05 10:34:24 UTC2015-08-05 06:34:24 -4:00 atÛ_
[]


In [15]:
def create_bool_time_feature(text):
    raw_s = r'{}'.format(text)
    pattern = r"[\d]+:[\d]+:[\d]+|[\d]+:[\d]+|am|pm|UTC"
    if(len(re.findall(pattern, raw_s))!=0):
        return(1)
    else:
        return(0)
    
def get_rid_of_time(text):
    raw_s = r'{}'.format(text)
    pattern = r"[\d]+:[\d]+:[\d]+|[\d]+:[\d]+|am|pm|UTC"
    raw_s = re.sub(pattern,'',raw_s)
    return(raw_s)



In [16]:
df_train["time"] = df_train["text"].apply(create_bool_time_feature)
df_test["time"] = df_test["text"].apply(create_bool_time_feature)

df_train["text"] = df_train["text"].apply(get_rid_of_time)
df_test["text"] = df_test["text"].apply(get_rid_of_time)

In [17]:
df_train["time"].value_counts()

0    5898
1    1715
Name: time, dtype: int64

### 3. Удалим дубликаты 
Есть дубликаты. Некоторые наблюдения полностью совпадают по "text", некоторые отличаются орфографической ошибкой в тексте.  
Удалим те, что полностью идентичны по feature "text" (значения "taget" порой разные)

In [18]:
print(f"Amount of observations: {df_train.text.shape},\nNumber of unique observations: {df_train.text.unique().shape}")

Amount of observations: (7613,),
Number of unique observations: (6989,)


In [19]:
df_train = df_train.drop_duplicates(subset=['text'])

### 4. Find Countries, cities, states 

In [20]:
'''
import spacy
nlp = spacy.load('en_core_web_md')#small version
'''

"\nimport spacy\nnlp = spacy.load('en_core_web_md')#small version\n"

In [21]:
'''
def create_bool_GPE(text):
    raw_s = nlp(u'{}'.format(text))
    GPE_list = [1 for val in raw_s.ents if val.label_=="GPE"]
    if(len(GPE_list)!=0):
        return(1)
    else:
        return(0)
    
def get_rid_of_GPE(text):
    raw_s = r'{}'.format(text)
    raw_s_nlp = nlp(u'{}'.format(text))
    GPE_list = [val.text for val in raw_s_nlp.ents if val.label_=="GPE"]
    for GPE_item in GPE_list:
        try:
            raw_s = re.sub(GPE_item, '', raw_s)
        except:
            pass
    return(raw_s)



doc8 = nlp(u'A Cessna airplane accident in Ocampo Coahuila Mexico on July 29 2015 killed four men including a State of Coahuila government official. Horrible Accident Man Died In Wings of Airplane (29-07-2015)')

GPE_list = [val.text for val in doc8.ents if val.label_=="GPE"]
print(GPE_list)
print(doc8)
print(get_rid_of_GPE(doc8))
'''

'\ndef create_bool_GPE(text):\n    raw_s = nlp(u\'{}\'.format(text))\n    GPE_list = [1 for val in raw_s.ents if val.label_=="GPE"]\n    if(len(GPE_list)!=0):\n        return(1)\n    else:\n        return(0)\n    \ndef get_rid_of_GPE(text):\n    raw_s = r\'{}\'.format(text)\n    raw_s_nlp = nlp(u\'{}\'.format(text))\n    GPE_list = [val.text for val in raw_s_nlp.ents if val.label_=="GPE"]\n    for GPE_item in GPE_list:\n        try:\n            raw_s = re.sub(GPE_item, \'\', raw_s)\n        except:\n            pass\n    return(raw_s)\n\n\n\ndoc8 = nlp(u\'A Cessna airplane accident in Ocampo Coahuila Mexico on July 29 2015 killed four men including a State of Coahuila government official. Horrible Accident Man Died In Wings of Airplane (29-07-2015)\')\n\nGPE_list = [val.text for val in doc8.ents if val.label_=="GPE"]\nprint(GPE_list)\nprint(doc8)\nprint(get_rid_of_GPE(doc8))\n'

In [22]:
'''
df_train["GPE"] = df_train["text"].apply(create_bool_GPE)
df_test["GPE"] = df_test["text"].apply(create_bool_GPE)

df_train["text"] = df_train["text"].apply(get_rid_of_GPE)
df_test["text"] = df_test["text"].apply(get_rid_of_GPE)

df_train["GPE"].value_counts()
'''

'\ndf_train["GPE"] = df_train["text"].apply(create_bool_GPE)\ndf_test["GPE"] = df_test["text"].apply(create_bool_GPE)\n\ndf_train["text"] = df_train["text"].apply(get_rid_of_GPE)\ndf_test["text"] = df_test["text"].apply(get_rid_of_GPE)\n\ndf_train["GPE"].value_counts()\n'

In [23]:
'''
df_train.to_csv("nlp-getting-started/df_train_cleaned.csv", index=False)
df_test.to_csv("nlp-getting-started/df_test_cleaned.csv", index=False)
'''

'\ndf_train.to_csv("nlp-getting-started/df_train_cleaned.csv", index=False)\ndf_test.to_csv("nlp-getting-started/df_test_cleaned.csv", index=False)\n'

### 5.  Удаление всех токенов вида цифры/цифры+слова

In [24]:
def get_rid_of_digits(text):
    raw_s = r'{}'.format(text)
    pattern = r"\d+\w+|\w+\d+"
    raw_s = re.sub(pattern,'',raw_s)
    return(raw_s)

df_train["text"] = df_train["text"].apply(get_rid_of_digits)

### 6. Удаление Тэгов (`#`... и @....) и слов с подчеркиванием (_ashj)

In [25]:
def get_rid_of_tags(text):
    raw_s = r'{}'.format(text)
    pattern = r"@\w+|#\w+|_+\w+|\w+_|"
    raw_s = re.sub(pattern,'',raw_s)
    return(raw_s)

df_train["text"] = df_train["text"].apply(get_rid_of_tags)

### 7. Отбор слов (создание списка stop_words)

In [26]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

vectorizer = TfidfVectorizer(min_df = 0, max_df = 5000, stop_words=ENGLISH_STOP_WORDS)

X_train_counts = vectorizer.fit(df_train["text"])
word_freq = X_train_counts.vocabulary_

word_freq = dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=False))
word_freq

{'aa': 0,
 'aaaa': 1,
 'aaaaaaallll': 2,
 'aaaaaand': 3,
 'aaarrrgghhh': 4,
 'aan': 5,
 'aannnnd': 6,
 'aar': 7,
 'aashiqui': 8,
 'ab': 9,
 'aba': 10,
 'abandon': 11,
 'abandoned': 12,
 'abandoning': 13,
 'abbandoned': 14,
 'abbott': 15,
 'abbruchsimulator': 16,
 'abbswinston': 17,
 'abc': 18,
 'abcnews': 19,
 'abe': 20,
 'aberdeen': 21,
 'aberystwyth': 22,
 'abia': 23,
 'ability': 24,
 'abject': 25,
 'ablaze': 26,
 'able': 27,
 'aboard': 28,
 'abomination': 29,
 'abortion': 30,
 'abortions': 31,
 'abouts': 32,
 'abq': 33,
 'abs': 34,
 'absence': 35,
 'absolute': 36,
 'absolutely': 37,
 'abstract': 38,
 'absurd': 39,
 'absurdly': 40,
 'abuse': 41,
 'abused': 42,
 'abuses': 43,
 'abusing': 44,
 'ac': 45,
 'academia': 46,
 'acc': 47,
 'accept': 48,
 'accepte': 49,
 'accepts': 50,
 'access': 51,
 'accident': 52,
 'accidentally': 53,
 'accidently': 54,
 'accidents': 55,
 'accompanying': 56,
 'according': 57,
 'accordingly': 58,
 'account': 59,
 'accountable': 60,
 'accounts': 61,
 'accurac

In [27]:
word_list = list(word_freq.keys())

with open("word.txt","w+") as f:
    for word in word_list:
        f.write(word +'\n')

### 8. Stemming/Lemmatization

In [28]:
'''
import spacy
nlp = spacy.load('en_core_web_md')#small version

line = nlp(u'{}'.format(" ".join(word_list[5230:5240])))
for w in line:
    print(f"{w.text} {w.lemma_:>{20}}")
    
def lemmatization(text):
    raw_s = nlp(u'{}'.format(text))
    raw_s = " ".join( [word.lemma_ for word in raw_s] )
    return(raw_s)

#df_train["text"] = df_train["text"].apply(lemmatization)
line = "I am going to destroy everything on my way!"
print(line)
lemmatization(line)
'''

'\nimport spacy\nnlp = spacy.load(\'en_core_web_md\')#small version\n\nline = nlp(u\'{}\'.format(" ".join(word_list[5230:5240])))\nfor w in line:\n    print(f"{w.text} {w.lemma_:>{20}}")\n    \ndef lemmatization(text):\n    raw_s = nlp(u\'{}\'.format(text))\n    raw_s = " ".join( [word.lemma_ for word in raw_s] )\n    return(raw_s)\n\n#df_train["text"] = df_train["text"].apply(lemmatization)\nline = "I am going to destroy everything on my way!"\nprint(line)\nlemmatization(line)\n'

word dictionary дал идею о:  
- удалении ссылок
- cоздании binary features: время (чуть улучшило), дата, город/страна (чуть ухудшило logistic regression)

### Metrics Estimation

In [None]:
def metric_score(model): 

    # Form a prediction set
    predictions = model.predict(X_test)
    print(metrics.confusion_matrix(y_test,predictions))
    
    # Print a classification report
    print(metrics.classification_report(y_test,predictions))
    
    # Print the overall accuracy
    print(metrics.accuracy_score(y_test,predictions))

## Split the data into train & test sets:

In [None]:
X = df_train[['text','time']]#,'GPE']]  # this time we want to look at the text

#X = df_train['text']
y = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(y_train.shape, y_test.shape)

### Temp

In [None]:
stopwords = ['.','?','@','+',',','<','>','%','~','!','^','&','(',')',':',';']

In [None]:
vectorizer = TfidfVectorizer(min_df=2, max_df=0.6, stop_words=stopwords)
vectorizer = vectorizer.fit(X_train.text)

X_train_ = vectorizer.transform(X_train.text) # remember to use the original X_train set
X_test_ = vectorizer.transform(X_test.text)

#y_train = y_train.reshape(-1, 1)
#y_test = y_test.reshape(-1, 1)

X_train_.shape# return sparse matrix

<font color=blue>Words which left in X_train_</font>

In [None]:
word_freq = vectorizer.vocabulary_
word_freq = dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=False))
print(len(word_freq))
word_freq

<font color=blue>Words which was recognized as *stop_words* and hence was deleted. This is result of params inside "TfidfVectorizer"</font>

In [None]:
print(len(vectorizer.stop_words_))
print(vectorizer.stop_words_)

In [None]:
'''
X_train_ = hstack((X_train_, np.array([X_train.time.values]).T))
X_test_ = hstack((X_test_, np.array([X_test.time.values]).T))

X_train = hstack((X_train_, np.array([X_train.GPE.values]).T))
X_test = hstack((X_test_, np.array([X_test.GPE.values]).T))
'''

X_train = hstack((X_train_, np.array([X_train.time.values]).T))
X_test = hstack((X_test_, np.array([X_test.time.values]).T))

In [None]:
#clf_lr = LogisticRegression()
#clf_lr.fit(X_train,y_train)
#metric_score(clf_lr)


model_list = [
             KNeighborsClassifier(n_neighbors=10),
             MultinomialNB(),
             DecisionTreeClassifier(),
    LGBMClassifier(),
    RandomForestClassifier(),
             SVC(kernel="linear"),
             LogisticRegression()]


for model in model_list[3:]:
    print(f"--------------\n{type(model)}")
    model.fit(X_train,y_train)
    metric_score(model)

### Let's try to choose custom threshold

In [None]:
predictions = clf_lr.predict(X_test)
print(predictions[:5])
#predict_proba()

predictions = clf_lr.predict_proba(X_test)
print(predictions[:5])

In [None]:
# calculate the fpr and tpr for all thresholds of the classification
probs = clf_lr.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
def calculate_metrics(x_test, y_test):
    threshold_list = list()
    sensitivity_list = list()
    specificity_list = list()
    accuracy_list = list()

    probs = clf_lr.predict_proba(x_test)
    preds = probs[:,1]
    
    for threshold in np.arange(0.1,1,0.025):
        threshold = round(threshold,2)

        test_df = pd.DataFrame(y_test)
        test_df["predicted"] = preds

        test_df["predicted"] = test_df["predicted"].apply(lambda x: 1 if x >threshold else 0)
        CM = confusion_matrix(test_df.target, test_df.predicted)

        TN = CM[0][0]
        FN = CM[1][0]
        TP = CM[1][1]
        FP = CM[0][1]

        sensitivity = TP/(TP+FN)
        specificity = TN/(TN+FP)
        accuracy = (TP+TN)/(TP+FP+TN+FN)

        threshold_list.append(threshold)
        sensitivity_list.append(sensitivity)
        specificity_list.append(specificity)
        accuracy_list.append(accuracy)
    return([threshold_list,sensitivity_list, specificity_list, accuracy_list])

In [None]:
def plot_curves(x_min, x_max, x_test,y_test):
    threshold_list, sensitivity_list, specificity_list, accuracy_list = calculate_metrics(x_test, y_test)
    x = threshold_list

    fig=plt.figure(figsize=(6,6))
    fig.show()
    ax=fig.add_subplot(111)

    ax.plot(x,sensitivity_list,c='r',label='Sensitivity',fillstyle='none')
    ax.plot(x,specificity_list,c='blue',label='Specificity')
    ax.plot(x,accuracy_list,c='black',label='Accuracy')

    plt.xlim(x_min, x_max)

    plt.grid(True)
    plt.legend(loc=3)
    plt.draw()

In [None]:
plot_curves(0,1, X_test,y_test)

In [None]:
plot_curves(0.35,0.5, X_test, y_test)

## Let's train model on the whole training dataset

In [None]:
pronouns = ""
with open("pronouns.txt", "r") as f:
    pronouns = f.read()
#list(set(stopwords_pronouns))
pronouns_list = list(set(pronouns.split(', ')))

In [None]:
X = df_train[['text']]#,'GPE']]  # this time we want to look at the text

#X = df_train['text']
y = df_train['target']

stopwords = ['.','?','@','+',',','<','>','%','~','!','^','&','(',')',':',';']

stopwords_2 = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, max_df=0.6, stop_words=None)

X_train = vectorizer.fit_transform(X.text) # remember to use the original X_train set
#X_train = hstack((X_train_, np.array([X.time.values]).T))
clf_lr = LogisticRegression()
clf_lr.fit(X_train,y)

predictions = clf_lr.predict(X_train)
print(metrics.confusion_matrix(y,predictions))
# Print a classification report
print(metrics.classification_report(y,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y,predictions))

In [None]:
plot_curves(0,1, X_train, y)

In [None]:
plot_curves(.36, .6, X_train, y)

## Predict and create csv for kaggle

In [None]:
X_test_df = df_test[['text']]#,'GPE']]  # this time we want to look at the text

X_test = vectorizer.transform(X_test_df.text) # remember to use the original X_train set
#X_test = hstack((X_test_, np.array([X_test_df.time.values]).T))

predictions = clf_lr.predict_proba(X_test)


In [None]:
df_test["target"] = predictions[:,1]
df_test.drop(["text"], axis=1,inplace=True)
df_test.drop(["time"], axis=1,inplace=True)

In [None]:
display(df_test.head(8))
df_test.target.value_counts()

In [None]:
df_test["target"] = df_test["target"].apply(lambda x: 1 if x > 0.5 else 0)

In [None]:
df_test.to_csv("nlp-getting-started/answer.csv", index=False)

## Build a Pipeline

In [None]:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

text_clf = Pipeline([('tfidf', TfidfVectorizer(stop_words=None)),
                     ('clf', LinearSVC()),
])


# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer(stop_words=None)),
                     ('clf', MultinomialNB()),
])

text_clf_logistic_regression = Pipeline([('tfidf', TfidfVectorizer(stop_words=None)),
                                         ('clf', LogisticRegression()),
])


## Test the classifier and display results

In [None]:
text_clf.fit(X_train, y_train)  
text_clf_nb.fit(X_train, y_train)  
text_clf_logistic_regression.fit(X_train, y_train)  

models_dict = {"SVC": text_clf, "Naïve Bayes":text_clf_nb, 
               "Logistic_Reg.(threshold=0.5)":text_clf_logistic_regression}
for key, model in models_dict.items():
    print(key)
    metric_score(model)
    print("-"*55)

## Predict values on test dataset & save results

In [None]:
df_test.head()

In [None]:
df_test['text']

In [None]:
test_predictions = text_clf.predict(df_test['text'])
df_test["target"] = test_predictions

In [None]:
df_test.head()

In [None]:
df_test.drop(["text"], axis=1,inplace=True)

In [None]:
df_test.to_csv("nlp-getting-started/answer.csv", index=False)

## Чтобы попробывать  
* EDA:
    1. Dict of word frequency
    2. Based on 1, calculate p-value for top-N the most/last frequent 
    


* Logistic regression -> useing ROC to determine threshold value (not 0.5 maybe)  


* советы с https://machinelearningmastery.com?

### Вывод:  
Из Dict of word frequency понял, что следует убрать все ссылки  
Пример:  `http://t.co/zLvEbEoavG`.  
*Ссылки не всегда в конце текста!*

In [None]:
bert_model_name = 'small_bert/bert_en_uncased_L-12_H-768_A-12' 

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/2',
    'small_bert/bert_en_uncased_L-2_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-2_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-2_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-2_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-4_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-4_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-4_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-4_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-6_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-6_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-6_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-8_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-8_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-8_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-8_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-10_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-10_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-10_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-10_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-12_H-128_A-2':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-12_H-256_A-4':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-12_H-512_A-8':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'small_bert/bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/2',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/2',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2',
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')
