In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack

import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier  
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier  
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re

In [2]:
df_train = pd.read_csv("nlp-getting-started/train.csv")
df_test = pd.read_csv("nlp-getting-started/test.csv")

In [3]:
display(df_train.head())
print(df_train.shape, df_test.shape)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


(7613, 5) (3263, 4)


In [4]:
df_train.loc[0,'text']

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [5]:
df_train.drop(["keyword","location"], axis=1, inplace=True)
df_test.drop(["keyword","location"], axis=1, inplace=True)

In [6]:
display(df_train.head(2))

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1


## Check for missing values:

In [7]:
df_train.isnull().sum()

id        0
text      0
target    0
dtype: int64

In [8]:
display(df_train.target.value_counts())

0    4342
1    3271
Name: target, dtype: int64

<font color=green>Не сильная разбалансированность. Оставим все как есть</font>

In [9]:
next(df_train.itertuples())

Pandas(Index=0, id=1, text='Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', target=1)

## Detect & remove empty strings

In [10]:
blanks = []

for index, i, text, target in df_train.itertuples():  # iterate over the DataFrame
    if type(text)==str:            # avoid NaN values
        if text.isspace():         # test 'review' for whitespace
            blanks.append(i) 
        
print(len(blanks), 'blanks: ', blanks)

df_train.drop(blanks, inplace=True)

0 blanks:  []


# EDA

### 1. Избавимся от ссылок

In [11]:
line = df_train["text"].head(-5).values[-1]
print(line)

pattern = r'http://[/.\w]+'#cuz maybe https://... or http://  or just http
print(re.findall(pattern, line))

re.sub(pattern,'',line)

#stormchase Violent Record Breaking EF-5 El Reno Oklahoma Tornado Nearly Runs Over ... - http://t.co/3SICroAaNz http://t.co/I27Oa0HISp
['http://t.co/3SICroAaNz', 'http://t.co/I27Oa0HISp']


'#stormchase Violent Record Breaking EF-5 El Reno Oklahoma Tornado Nearly Runs Over ... -  '

In [12]:
def get_rid_of_link(text):
    raw_s = r'{}'.format(text)
    pattern = r'http[:/.\w]+'
    raw_s = re.sub(pattern,'',raw_s)
    return(raw_s)

df_train["text"] = df_train["text"].apply(get_rid_of_link)
df_test["text"] = df_test["text"].apply(get_rid_of_link)
df_train.to_csv("nlp-getting-started/train_without_link.csv", index=False)
df_test.to_csv("nlp-getting-started/test_without_link.csv", index=False)

### Создание features

### 2. Find time (am/pm/UTC/..)

In [13]:
pattern = r"[\d]+:[\d]+:[\d]+"
pattern_2 = r"[\d]+:[\d]+"
pattern_3 = r"(am|pm|UTC)"

pattern = r"[\d]+:[\d]+:[\d]+|[\d]+:[\d]+|am|pm|UTC"

line = r"Earthquake : M 3.4 - 96km N of Brenas Puerto Rico: Time2015-08-05 10:34:24 UTC2015-08-05 06:34:24 -4:00 atÛ_"
print(re.findall(pattern, line))
print(re.sub(pattern, "",line))
print(line)

line = r"Meow, Sparta"
print(re.findall(pattern, line))

['10:34:24', 'UTC', '06:34:24', '4:00']
Earthquake : M 3.4 - 96km N of Brenas Puerto Rico: Time2015-08-05  2015-08-05  - atÛ_
Earthquake : M 3.4 - 96km N of Brenas Puerto Rico: Time2015-08-05 10:34:24 UTC2015-08-05 06:34:24 -4:00 atÛ_
[]


In [14]:
def create_bool_time_feature(text):
    raw_s = r'{}'.format(text)
    pattern = r"[\d]+:[\d]+:[\d]+|[\d]+:[\d]+|am|pm|UTC"
    if(len(re.findall(pattern, raw_s))!=0):
        return(1)
    else:
        return(0)
    
def get_rid_of_time(text):
    raw_s = r'{}'.format(text)
    pattern = r"[\d]+:[\d]+:[\d]+|[\d]+:[\d]+|am|pm|UTC"
    raw_s = re.sub(pattern,'',raw_s)
    return(raw_s)



In [15]:
df_train["time"] = df_train["text"].apply(create_bool_time_feature)
df_test["time"] = df_test["text"].apply(create_bool_time_feature)

df_train["text"] = df_train["text"].apply(get_rid_of_time)
df_test["text"] = df_test["text"].apply(get_rid_of_time)

In [16]:
df_train["time"].value_counts()

0    5898
1    1715
Name: time, dtype: int64

### 3. Удалим дубликаты 
Есть дубликаты. Некоторые наблюдения полностью совпадают по "text", некоторые отличаются орфографической ошибкой в тексте.  
Удалим те, что полностью идентичны по feature "text" (значения "taget" порой разные)

In [17]:
print(f"Amount of observations: {df_train.text.shape},\nNumber of unique observations: {df_train.text.unique().shape}")

Amount of observations: (7613,),
Number of unique observations: (6989,)


In [18]:
df_train = df_train.drop_duplicates(subset=['text'])

### 4. Find Countries, cities, states 

In [None]:
'''
import spacy
nlp = spacy.load('en_core_web_md')#small version
'''

In [None]:
'''
def create_bool_GPE(text):
    raw_s = nlp(u'{}'.format(text))
    GPE_list = [1 for val in raw_s.ents if val.label_=="GPE"]
    if(len(GPE_list)!=0):
        return(1)
    else:
        return(0)
    
def get_rid_of_GPE(text):
    raw_s = r'{}'.format(text)
    raw_s_nlp = nlp(u'{}'.format(text))
    GPE_list = [val.text for val in raw_s_nlp.ents if val.label_=="GPE"]
    for GPE_item in GPE_list:
        try:
            raw_s = re.sub(GPE_item, '', raw_s)
        except:
            pass
    return(raw_s)



doc8 = nlp(u'A Cessna airplane accident in Ocampo Coahuila Mexico on July 29 2015 killed four men including a State of Coahuila government official. Horrible Accident Man Died In Wings of Airplane (29-07-2015)')

GPE_list = [val.text for val in doc8.ents if val.label_=="GPE"]
print(GPE_list)
print(doc8)
print(get_rid_of_GPE(doc8))
'''

In [None]:
'''
df_train["GPE"] = df_train["text"].apply(create_bool_GPE)
df_test["GPE"] = df_test["text"].apply(create_bool_GPE)

df_train["text"] = df_train["text"].apply(get_rid_of_GPE)
df_test["text"] = df_test["text"].apply(get_rid_of_GPE)

df_train["GPE"].value_counts()
'''

In [None]:
'''
df_train.to_csv("nlp-getting-started/df_train_cleaned.csv", index=False)
df_test.to_csv("nlp-getting-started/df_test_cleaned.csv", index=False)
'''

## Word dictionary. Отбор признаков

In [None]:
vectorizer = TfidfVectorizer(min_df = 3, max_df = 5000)

X_train_counts = vectorizer.fit(df_train["text"])
word_freq = X_train_counts.vocabulary_
word_freq = dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=True))
#word_freq

word dictionary дал идею о:  
- удалении ссылок
- cоздании binary features: время (чуть улучшило), дата, город/страна (чуть ухудшило logistic regression)

### Metrics Estimation

In [19]:
def metric_score(model): 

    # Form a prediction set
    predictions = model.predict(X_test)
    print(metrics.confusion_matrix(y_test,predictions))
    
    # Print a classification report
    print(metrics.classification_report(y_test,predictions))
    
    # Print the overall accuracy
    print(metrics.accuracy_score(y_test,predictions))

## Split the data into train & test sets:

In [None]:
X = df_train[['text','time']]#,'GPE']]  # this time we want to look at the text

#X = df_train['text']
y = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(y_train.shape, y_test.shape)

### Temp

In [None]:
stopwords = ['.','?','@','+',',','<','>','%','~','!','^','&','(',')',':',';']

In [None]:
vectorizer = TfidfVectorizer(min_df=2, max_df=0.6, stop_words=stopwords)
vectorizer = vectorizer.fit(X_train.text)

X_train_ = vectorizer.transform(X_train.text) # remember to use the original X_train set
X_test_ = vectorizer.transform(X_test.text)

#y_train = y_train.reshape(-1, 1)
#y_test = y_test.reshape(-1, 1)

X_train_.shape# return sparse matrix

<font color=blue>Words which left in X_train_</font>

In [None]:
word_freq = vectorizer.vocabulary_
word_freq = dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=False))
print(len(word_freq))
word_freq

<font color=blue>Words which was recognized as *stop_words* and hence was deleted. This is result of params inside "TfidfVectorizer"</font>

In [None]:
print(len(vectorizer.stop_words_))

In [None]:
'''
X_train_ = hstack((X_train_, np.array([X_train.time.values]).T))
X_test_ = hstack((X_test_, np.array([X_test.time.values]).T))

X_train = hstack((X_train_, np.array([X_train.GPE.values]).T))
X_test = hstack((X_test_, np.array([X_test.GPE.values]).T))
'''

X_train = hstack((X_train_, np.array([X_train.time.values]).T))
X_test = hstack((X_test_, np.array([X_test.time.values]).T))

In [None]:
#clf_lr = LogisticRegression()
#clf_lr.fit(X_train,y_train)
#metric_score(clf_lr)


model_list = [
             KNeighborsClassifier(n_neighbors=10),
             MultinomialNB(),
             DecisionTreeClassifier(),
    LGBMClassifier(),
    RandomForestClassifier(),
             SVC(kernel="linear"),
             LogisticRegression()]


for model in model_list[3:]:
    print(f"--------------\n{type(model)}")
    model.fit(X_train,y_train)
    metric_score(model)

### Let's try to choose custom threshold

In [None]:
predictions = clf_lr.predict(X_test)
print(predictions[:5])
#predict_proba()

predictions = clf_lr.predict_proba(X_test)
print(predictions[:5])

In [None]:
# calculate the fpr and tpr for all thresholds of the classification
probs = clf_lr.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
def calculate_metrics(x_test, y_test):
    threshold_list = list()
    sensitivity_list = list()
    specificity_list = list()
    accuracy_list = list()

    probs = clf_lr.predict_proba(x_test)
    preds = probs[:,1]
    
    for threshold in np.arange(0.1,1,0.025):
        threshold = round(threshold,2)

        test_df = pd.DataFrame(y_test)
        test_df["predicted"] = preds

        test_df["predicted"] = test_df["predicted"].apply(lambda x: 1 if x >threshold else 0)
        CM = confusion_matrix(test_df.target, test_df.predicted)

        TN = CM[0][0]
        FN = CM[1][0]
        TP = CM[1][1]
        FP = CM[0][1]

        sensitivity = TP/(TP+FN)
        specificity = TN/(TN+FP)
        accuracy = (TP+TN)/(TP+FP+TN+FN)

        threshold_list.append(threshold)
        sensitivity_list.append(sensitivity)
        specificity_list.append(specificity)
        accuracy_list.append(accuracy)
    return([threshold_list,sensitivity_list, specificity_list, accuracy_list])

In [None]:
def plot_curves(x_min, x_max, x_test,y_test):
    threshold_list, sensitivity_list, specificity_list, accuracy_list = calculate_metrics(x_test, y_test)
    x = threshold_list

    fig=plt.figure(figsize=(6,6))
    fig.show()
    ax=fig.add_subplot(111)

    ax.plot(x,sensitivity_list,c='r',label='Sensitivity',fillstyle='none')
    ax.plot(x,specificity_list,c='blue',label='Specificity')
    ax.plot(x,accuracy_list,c='black',label='Accuracy')

    plt.xlim(x_min, x_max)

    plt.grid(True)
    plt.legend(loc=3)
    plt.draw()

In [None]:
plot_curves(0,1, X_test,y_test)

In [None]:
plot_curves(0.35,0.5, X_test, y_test)

## Let's train model on the whole training dataset

In [20]:
X = df_train[['text','time']]#,'GPE']]  # this time we want to look at the text

#X = df_train['text']
y = df_train['target']

stopwords = ['.','?','@','+',',','<','>','%','~','!','^','&','(',')',':',';']
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, max_df=0.6, stop_words=stopwords)

X_train_ = vectorizer.fit_transform(X.text) # remember to use the original X_train set
X_train = hstack((X_train_, np.array([X.time.values]).T))
clf_lr = LogisticRegression()
clf_lr.fit(X_train,y)

predictions = clf_lr.predict(X_train)
print(metrics.confusion_matrix(y,predictions))
# Print a classification report
print(metrics.classification_report(y,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y,predictions))

[[3936  186]
 [ 728 2139]]
              precision    recall  f1-score   support

           0       0.84      0.95      0.90      4122
           1       0.92      0.75      0.82      2867

    accuracy                           0.87      6989
   macro avg       0.88      0.85      0.86      6989
weighted avg       0.88      0.87      0.87      6989

0.8692230648161396


In [21]:
plot_curves(0,1, X_train, y)

NameError: name 'plot_curves' is not defined

In [None]:
plot_curves(.36, .6, X_train, y)

## Predict and create csv for kaggle

In [22]:
X_test_df = df_test[['text','time']]#,'GPE']]  # this time we want to look at the text

X_test_ = vectorizer.transform(X_test_df.text) # remember to use the original X_train set
X_test = hstack((X_test_, np.array([X_test_df.time.values]).T))

predictions = clf_lr.predict_proba(X_test)


In [23]:
df_test["target"] = predictions[:,1]
df_test.drop(["text"], axis=1,inplace=True)
df_test.drop(["time"], axis=1,inplace=True)

In [24]:
display(df_test.head(8))
df_test.target.value_counts()

Unnamed: 0,id,target
0,0,0.654548
1,2,0.474613
2,3,0.721908
3,9,0.443755
4,11,0.852724
5,12,0.58351
6,21,0.216445
7,22,0.120387


0.502754    11
0.519928    10
0.269756    10
0.219005     8
0.832629     6
            ..
0.614678     1
0.121832     1
0.405145     1
0.646753     1
0.241227     1
Name: target, Length: 3039, dtype: int64

In [25]:
df_test["target"] = df_test["target"].apply(lambda x: 1 if x > 0.5 else 0)

In [26]:
df_test.to_csv("nlp-getting-started/answer.csv", index=False)

## Build a Pipeline

In [None]:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

text_clf = Pipeline([('tfidf', TfidfVectorizer(stop_words=None)),
                     ('clf', LinearSVC()),
])


# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer(stop_words=None)),
                     ('clf', MultinomialNB()),
])

text_clf_logistic_regression = Pipeline([('tfidf', TfidfVectorizer(stop_words=None)),
                                         ('clf', LogisticRegression()),
])


## Test the classifier and display results

In [None]:
text_clf.fit(X_train, y_train)  
text_clf_nb.fit(X_train, y_train)  
text_clf_logistic_regression.fit(X_train, y_train)  

models_dict = {"SVC": text_clf, "Naïve Bayes":text_clf_nb, 
               "Logistic_Reg.(threshold=0.5)":text_clf_logistic_regression}
for key, model in models_dict.items():
    print(key)
    metric_score(model)
    print("-"*55)

## Predict values on test dataset & save results

In [None]:
df_test.head()

In [None]:
df_test['text']

In [None]:
test_predictions = text_clf.predict(df_test['text'])
df_test["target"] = test_predictions

In [None]:
df_test.head()

In [None]:
df_test.drop(["text"], axis=1,inplace=True)

In [None]:
df_test.to_csv("nlp-getting-started/answer.csv", index=False)

## Чтобы попробывать  
* EDA:
    1. Dict of word frequency
    2. Based on 1, calculate p-value for top-N the most/last frequent 
    


* Logistic regression -> useing ROC to determine threshold value (not 0.5 maybe)  


* советы с https://machinelearningmastery.com?

### Вывод:  
Из Dict of word frequency понял, что следует убрать все ссылки  
Пример:  `http://t.co/zLvEbEoavG`.  
*Ссылки не всегда в конце текста!*