**Real or Not? NLP with Disaster Tweets**<br>
https://www.kaggle.com/c/nlp-getting-started

In [29]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import numba
import os

**1. loading data**

In [34]:
train = pd.read_csv('rawdata/train.csv')
test = pd.read_csv('rawdata/test.csv')


df = train
print("===== training data =====")
print(df)
print(df.isna().sum()/len(df)*100)
print()

df = test
print("===== test data =====")
print(df)
print(df.isna().sum()/len(df)*100)

===== training data =====
         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  
0     Our Deeds are the Reason of this #earthquake M...       1  
1                Forest fire near La Ronge Sask. Canada       1  
2     All residents asked to 'shelter in place' are ...       1  
3     13,000 people receive #wildfires evacuation or...       1  
4     Just got sent this photo from Ruby #Alaska as ...       1  
...                                                 ...     ...  
7608  Two giant cranes holding a bridge collapse int...       1  
7609  @aria_ahrary @TheTawniest The out of control w...      

欠損値が多すぎて, locationが現時点で使えなさそう.

**TF-IDF and Logistic Regression**

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

features = "text"
target = "target"

# prepare vectorizer
vectorizer = TfidfVectorizer() #text => vector
vectorizer.fit(train[features])

X_train = vectorizer.transform(train[features])
X_train = X_train.toarray()
Y_train = train[target]

# train model
model = LogisticRegression()
model.fit(X_train,Y_train)


# predict
data = pd.concat([train,test],axis=0)
X = vectorizer.transform(data[features]).toarray()
pred = model.predict(X)

# concat to submit file
data["target"] = pred

# load submission file
submission = pd.read_csv(os.path.join("rawdata","sample_submission.csv"))

# my submission file
data = data.set_index(data["id"])
data = data.sort_index()


mysub = data.iloc[submission["id"]]
mysub[["id","target"]].to_csv(os.path.join("rawdata","my_submission.csv"),index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [42]:
data.iloc[submission["id"]]

Unnamed: 0_level_0,id,keyword,location,target,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,,,1,Just happened a terrible car crash
2,2,,,0,"Heard about #earthquake is different cities, s..."
3,3,,,1,"there is a forest fire at spot pond, geese are..."
9,9,,,0,Apocalypse lighting. #Spokane #wildfires
11,11,,,1,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...,...
10861,10861,,,1,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10865,10865,,,1,Storm in RI worse than last hurricane. My city...
10868,10868,,,1,Green Line derailment in Chicago http://t.co/U...
10874,10874,,,1,MEG issues Hazardous Weather Outlook (HWO) htt...


**3. Clean Data**  - So we have a baseline score of 79% to work with , let's get to clean data and see if we can improve the score

As first step in cleaning - let us replace some commonly occuring shorthands 

In [21]:

def clean_text(text):
    import re
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"i'll", "i will", text)
    text = re.sub(r"she'll", "she will", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"here's", "here is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    return text


df_train['clean_text'] = df_train['text'].apply(clean_text)
df_test['clean_text'] = df_test['text'].apply(clean_text)

In the next step we are going to do some further massaging which would make Job of Prediction Algorithm easy

* Let us remove any characters other then alphabets
* Convert all dictionary to lower case - for consistency 
* Lemmatize - More details on Stemming and Lemmatization [here](https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)


Also we are going to store this text in a seperate column as we want to keep the orignal text in case we want to do some feature engineering down the line.

In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download("wordnet")


def massage_text(text):
    
    
    ## remove anything other then characters and put everything in lowercase
    tweet = re.sub("[^a-zA-Z]", ' ', text)
    tweet = tweet.lower()
    tweet = tweet.split()

    
    
    lem = WordNetLemmatizer()
    tweet = [lem.lemmatize(word) for word in tweet
             if word not in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    return tweet
    print('--here goes nothing')
    print(text)
    print(tweet)

df_train['clean_text'] = df_train['text'].apply(massage_text)
df_test['clean_text'] = df_test['text'].apply(massage_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yonezu.t/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yonezu.t/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Let's take a look at the data now 

In [None]:
df_train.iloc[0:10][['text','clean_text']]

**4. Creation of more Models**

4.1 Start by creating a Logistic Regression model again , this time we will use Grid Seach for hyper-parameter optimization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

vector = TfidfVectorizer().fit(df_train['clean_text'])
df_train_vector = vector.transform(df_train['clean_text'])
df_test_vector = vector.transform(df_test['clean_text'])
lr_model = LogisticRegression()
grid_values =  {'penalty':['l1', 'l2'],'C':[0.01, 0.1, 1, 10, 100]}
grid_search_model = GridSearchCV(lr_model,param_grid=grid_values,cv=3)
grid_search_model.fit(df_train_vector,df_train['target'])

print(grid_search_model.best_estimator_)
print(grid_search_model.best_score_)
print(grid_search_model.best_params_)

## dumping the output to a file 
predict_df = pd.DataFrame()
predict = grid_search_model.predict(df_test_vector)
predict_df['id'] = df_test['id']
predict_df['target'] = predict
predict_df.to_csv('sample_submission_2.csv', index=False)
score_df = score_df.append({'Model Description':'LR Model - with data cleaning and Grid Search',
                           'Score':grid_search_model.best_score_}
                           ,ignore_index=True)


### let's have another model with some ngram's though 
X_train,X_test,y_train,y_test = train_test_split(df_train['clean_text'],df_train['target'])
vector = TfidfVectorizer(ngram_range=(1,3)).fit(X_train)
X_train_vector = vector.transform(X_train)
X_test_vector = vector.transform(X_test)

lr_model = LogisticRegression(C=1,penalty='l2').fit(X_train_vector,y_train)
predict = lr_model.predict(X_test_vector)
score = roc_auc_score(y_test,predict)
print('Roc AUC curve for LR and TFIDF with ngrams  - %3f'%score)

score_df = score_df.append({'Model Description':'LR Model - with ngram range',
                           'Score':score}
                           ,ignore_index=True)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

### let's have another model with some ngram's though 
X_train,X_test,y_train,y_test = train_test_split(df_train['clean_text'],df_train['target'])
vector = TfidfVectorizer(ngram_range=(1,3)).fit(X_train)
X_train_vector = vector.transform(X_train)
X_test_vector = vector.transform(X_test)

lr_model = LogisticRegression(C=1,penalty='l2').fit(X_train_vector,y_train)
predict = lr_model.predict(X_test_vector)
score = roc_auc_score(y_test,predict)
print('Roc AUC curve for LR and TFIDF with ngrams  - %3f'%score)

score_df = score_df.append({'Model Description':'LR Model - with ngram range',
                           'Score':grid_search_model.score}
                           ,ignore_index=True)

vector = TfidfVectorizer(ngram_range=(1,3)).fit(df_train['clean_text'])
X_train_vector = vector.transform(df_train['clean_text'])
X_test_vector = vector.transform(df_test['clean_text'])
lr_model = LogisticRegression(C=1,penalty='l2').fit(X_train_vector,df_train['target'])
predict = lr_model.predict(X_test_vector)


## dumping the output to a file 
predict_df = pd.DataFrame()
predict_df['id'] = df_test['id']
predict_df['target'] = predict
predict_df.to_csv('sample_submission_001.csv', index=False)


In [None]:
pd.concat([df_test,predict_df['target']],axis=1)

### you could dump this in a csv and do further analysis to check what
### misclassifications are there manually ,observations could then be used 
### to further tweak stuff


4.2 Let's apply Gaussian NB to the data 

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = \
        train_test_split(df_train['clean_text'], df_train['target'], random_state=20)
## Apply Tfidf tranformation
vector = TfidfVectorizer().fit(X_train)
X_train_vector = vector.transform(X_train)
X_test_vector  = vector.transform(X_test)
df_test_vector = vector.transform(df_test['clean_text'])

gb_model= GaussianNB().fit(X_train_vector.todense(),y_train)
predict = gb_model.predict(X_test_vector.todense())

print('Roc AUC score - %3f'%(roc_auc_score(y_test,predict)))
score_df = score_df.append({'Model Description':'Naive Bayes',
                           'Score':roc_auc_score(y_test,predict)}
                           ,ignore_index=True)


4.3 Support Vector Classifier - with Grid search to Optimize parameters

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

vector = TfidfVectorizer().fit(df_train['clean_text'])
df_train_vector = vector.transform(df_train['clean_text'])
df_test_vector = vector.transform(df_test['clean_text'])

svc_model = SVC()
grid_values={'kernel':['linear', 'poly', 'rbf'],'C':[0.001,0.01,1,10]}
grid_search_model= GridSearchCV(svc_model,param_grid=grid_values,cv=3)
grid_search_model.fit(df_train_vector,df_train['target'])

print(grid_search_model.best_estimator_)
print(grid_search_model.best_score_)
print(grid_search_model.best_params_)

score_df = score_df.append({'Model Description':'SVC - with Grid Search',
                           'Score':grid_search_model.best_score_}
                           ,ignore_index=True)

predict = grid_search_model.predict(df_test_vector)
predict_df = pd.DataFrame()
predict_df['id'] = df_test['id']
predict_df['target'] = predict

# # print(predict_df.head(5))
predict_df.to_csv('sample_submission_4.csv', index=False)


Let's look at score_df which has scores of all models till now and let's sort the output in ascending based on the Score

In [None]:
score_df[['Model Description','Score']]

**Please Upvote if you found the notebook usefull.**

4. More data cleaning/ Feature Engineering

In [None]:
#########  Word Count

import seaborn as sns
from matplotlib import pyplot as plt


# df_train



df_train['word_count'] = df_train['text'].apply(lambda x : len(x.lower().split()))
df_test['word_count'] = df_test['text'].apply(lambda x : len(x.lower().split()))


print('Average word count for non-disaster tweet - %.3f \
      '%(df_train[df_train['target']==0]['word_count'].mean()))

      
print('Average word count for disaster tweet - %.3f \
      '%(df_train[df_train['target']==1]['word_count'].mean()))

plt.figure(figsize=(12,6))
sns.boxplot(x='target',y='word_count',data=df_train)



In [None]:
#### Unique Word Count

df_train['u_word_count'] = df_train['text'].apply(lambda x : len(set(x.lower().split())))
df_test['u_word_count'] = df_test['text'].apply(lambda x : len(set(x.lower().split())))

print('Average word count for non disaster tweet - %.3f \
      '%(df_train[df_train['target']==0]['u_word_count'].mean()))

      
print('Average word count for disaster tweet - %.3f \
      '%(df_train[df_train['target']==1]['u_word_count'].mean()))


In [None]:
#### Stop word count

from nltk.corpus import stopwords

stop_words = set (stopwords.words("english"))
temp ='here you go then'
df_train['stop_word_count']=df_train['text'].apply(lambda x : \
                                                   len([w for w in x.split() if w in stop_words] ))
df_test['stop_word_count']=df_test['text'].apply(lambda x : \
                                                   len([w for w in x.split() if w in stop_words] ))


print('Average word count for non-disaster tweet - %.3f \
      '%(df_train[df_train['target']==0]['stop_word_count'].mean()))

      
print('Average word count for disaster tweet - %.3f \
      '%(df_train[df_train['target']==1]['stop_word_count'].mean()))


In [None]:
##### URL count
import re


df_train['url_count'] = df_train['text'].apply(lambda x: len(re.findall('http[s]*:',x)))
df_test['url_count'] = df_test['text'].apply(lambda x: len(re.findall('http[s]*:',x)))


In [None]:
## mean word length count 

df_train['mean_word_l_count'] = df_train.apply(lambda r : len(r['text'])/r['word_count'],axis=1)
df_test['mean_word_l_count'] = df_test.apply(lambda r : len(r['text'])/r['word_count'],axis=1)


print(df_test[['mean_word_l_count','text']])

In [None]:
## hash_count  count

df_train['hash_count'] = df_train['text'].apply(lambda x: len(re.findall('#',x)))
df_test['hash_count'] = df_test['text'].apply(lambda x: len(re.findall('#',x)))


In [None]:
### mention count @


## hash_count  count

df_train['mention_count'] = df_train['text'].apply(lambda x: len(re.findall('@',x)))
df_test['mention_count'] = df_test['text'].apply(lambda x: len(re.findall('@',x)))


In [None]:
import multiprocessing

from gensim.models import Word2Vec

cores= multiprocessing.cpu_count()

w2v_model = Word2Vec(min_count=5,
                     window = 2,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     workers=3
                     )



w2v_model.build_vocab(df_train['text'],progress_per=50)


w2v_model.train(df_train['text'],total_examples=w2v_model.corpus_count,epochs=30,report_delay=1)

w2v_model.init_sims(replace=True)

print(w2v_model.wv.most_similar(positive=["closed"]))