# Importing All the necessary Libraries

In [None]:
#Importing all the necessary libraries
import numpy as np 
import pandas as pd 

# text processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

**Reading the Dataset**

In [None]:
# Load the tweets CSV into a DataFrame, report its dimensions, and preview it.
df = pd.read_csv('../input/disaster-tweets/tweets.csv')
print('The shape of the dataset=', df.shape)
df.head()

In [None]:
# Basic EDA: number of missing values in each column
df.isna().sum()

The columns denote the following:

1. The text of a tweet

2. A keyword from that tweet

3. The location the tweet was sent from

4. target: whether the tweet is about a real disaster (1) or not (0)

Exploring the Target Column

- Distribution of the target column
- We have to predict whether a given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

In [None]:
# Number of tweets in each target class
df.target.value_counts()

In [None]:
# Plot the class balance of the target column.
# seaborn >= 0.12 no longer accepts x/y vectors positionally (the first
# positional argument is `data`), so they must be passed by keyword.
target_counts = df['target'].value_counts()
sns.barplot(x=target_counts.index, y=target_counts, palette='rocket')

Exploring the Target Column

Let's look at what the disaster and the non-disaster tweets look like.

In [None]:
# Text of every tweet labelled as a disaster (target == 1); show the second one
disaster_tweets = df.loc[df['target'] == 1, 'text']
disaster_tweets.iloc[1]

In [None]:
# Text of every tweet labelled as non-disaster (target == 0); show the second one
nondisaster_tweets = df.loc[df['target'] == 0, 'text']
nondisaster_tweets.iloc[1]

Exploring the 'keyword' column

The keyword column denotes a keyword from the tweet. Let's look at the 30 most frequent keywords in the dataset.

In [None]:
# Horizontal bar chart of the 30 most frequent keywords (counts computed once)
keyword_counts = df['keyword'].value_counts()[:30]
sns.barplot(y=keyword_counts.index, x=keyword_counts)

Let's see how often the word 'disaster' appears in the dataset and whether this helps us in determining
whether a tweet belongs to the disaster category or not.

In [None]:
# Target distribution among tweets whose text mentions 'disaster' (case-insensitive)
mentions_disaster = df['text'].str.contains('disaster', case=False, na=False)
df.loc[mentions_disaster, 'target'].value_counts()

Exploring the 'location' column

Even though the location column has a number of missing values, let's see the top 10 locations present in the dataset. Since some of the locations are repeated under different names, this will require a bit of cleaning.

In [None]:
# Ten most frequent raw location strings
df['location'].value_counts().iloc[:10]

In [None]:
# Replacing the ambiguous location names with standard names.
# Keys are raw values seen in the 'location' column; values are the label they collapse to.
# NOTE(review): "Ireland" is mapped to 'UK' — confirm this is intended.
LOCATION_ALIASES = {
    'United States': 'USA',
    'New York': 'USA',
    "London": 'UK',
    "Los Angeles, CA": 'USA',
    "Washington, D.C.": 'USA',
    "California": 'USA',
    "Chicago, IL": 'USA',
    "Chicago": 'USA',
    "New York, NY": 'USA',
    "California, USA": 'USA',
    "FLorida": 'USA',
    "Nigeria": 'Africa',
    "Kenya": 'Africa',
    "Everywhere": 'Worldwide',
    "San Francisco": 'USA',
    "Florida": 'USA',
    "United Kingdom": 'UK',
    "Los Angeles": 'USA',
    "Toronto": 'Canada',
    "San Francisco, CA": 'USA',
    "NYC": 'USA',
    "Seattle": 'USA',
    "Earth": 'Worldwide',
    "Ireland": 'UK',
    "London, England": 'UK',
    "New York City": 'USA',
    "Texas": 'USA',
    "London, UK": 'UK',
    "Atlanta, GA": 'USA',
    "England, United Kingdom": 'UK',
    "Mumbai, India": 'India',
    "Melbourne,Victoria": 'Australia',
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form acts on a temporary object and
# leaves df unchanged when pandas Copy-on-Write is active.
df['location'] = df['location'].replace(LOCATION_ALIASES)

# Top 10 locations after normalisation
location_counts = df['location'].value_counts()[:10]
sns.barplot(y=location_counts.index, x=location_counts)

In [None]:
# Peek at the first five raw tweets
df['text'].head()

In [None]:
# Applying a first round of text cleaning techniques

def clean_text(text):
    '''Normalise a raw tweet string.

    Steps, in order: lowercase, drop text in square brackets, drop links,
    drop HTML-like tags, drop punctuation, turn newlines into spaces, and
    drop any word that contains a digit.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by removed pieces is kept.
    '''
    text = text.lower()
    # Raw strings keep the regex backslashes intact and avoid Python's
    # "invalid escape sequence" warnings.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space (not ''), so words on adjacent lines are
    # not glued into one token.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Run the cleaning function over every tweet in the text column
df['text'] = df['text'].apply(clean_text)

# Let's take a look at the updated text
df['text'].head()

Just for fun let's create a wordcloud of the clean text to see the most dominating words in the tweets.

In [None]:
from wordcloud import WordCloud

# Re-select the tweets from df here. disaster_tweets / nondisaster_tweets were
# captured before the text column was cleaned, so they still hold the raw text
# and would not give a word cloud of the *clean* text.
clean_disaster_tweets = df.loc[df['target'] == 1, 'text']
clean_nondisaster_tweets = df.loc[df['target'] == 0, 'text']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[26, 8])

# Left panel: disaster tweets
wordcloud1 = WordCloud(background_color='white',
                       width=600,
                       height=400).generate(" ".join(clean_disaster_tweets))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Disaster Tweets', fontsize=40);

# Right panel: non-disaster tweets
wordcloud2 = WordCloud(background_color='white',
                       width=600,
                       height=400).generate(" ".join(clean_nondisaster_tweets))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Non Disaster Tweets', fontsize=40);

In [None]:
# Compare four NLTK tokenizers on the same example sentence.
text = "Are you coming , aren't you"
tokenizer1 = nltk.tokenize.WhitespaceTokenizer()
tokenizer2 = nltk.tokenize.TreebankWordTokenizer()
tokenizer3 = nltk.tokenize.WordPunctTokenizer()
tokenizer4 = nltk.tokenize.RegexpTokenizer(r'\w+')

print("Example Text: ", text)
print("------------------------------------------------------------------------------------------------")

# Print each tokenizer's output next to its label.
for label, demo_tokenizer in (
    ("Tokenization by whitespace:- ", tokenizer1),
    ("Tokenization by words using Treebank Word Tokenizer:- ", tokenizer2),
    ("Tokenization by punctuation:- ", tokenizer3),
    ("Tokenization by regular expression:- ", tokenizer4),
):
    print(label, demo_tokenizer.tokenize(text))

In [None]:
# Split every cleaned tweet into a list of word tokens (runs of \w characters)
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
df['text'] = df['text'].apply(tokenizer.tokenize)
df['text'].head()

3. Stopwords Removal

Now, let's get rid of the stopwords i.e words which occur very frequently but have no possible value like a, an, the, are etc.

In [None]:
# Lazily filled cache so the NLTK stopword list is read at most once.
_STOPWORD_CACHE = {}


def remove_stopwords(text, stop_words=None):
    """
    Remove stopwords from a list of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stopword list, which is
        loaded once and cached as a frozenset.

    Returns
    -------
    list of str
        The tokens that are not stopwords, in their original order.
    """
    if stop_words is None:
        # The original called stopwords.words('english') for every token,
        # reloading the list and scanning it linearly each time.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    return [w for w in text if w not in stop_words]


# Filter the stopwords out of every tokenized tweet
df['text'] = df['text'].apply(remove_stopwords)
df.head()

Token normalization

Token normalisation means converting different tokens to their base forms. This can be done either by:

Stemming: removing and replacing suffixes to get to the root form of the word, which is called the stem; for instance cats -> cat, wolves -> wolv

Lemmatization : Returns the base or dictionary form of a word, which is known as the lemma

In [None]:
# Stemming and Lemmatization examples
text = "feet cats wolves talked"

tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

# Stemmer: apply the Porter stemmer to each token
stemmer = nltk.stem.PorterStemmer()
print("Stemming the sentence: ", " ".join(map(stemmer.stem, tokens)))

# Lemmatizer: apply the WordNet lemmatizer to each token
lemmatizer = nltk.stem.WordNetLemmatizer()
print("Lemmatizing the sentence: ", " ".join(map(lemmatizer.lemmatize, tokens)))

It is important to note here that stemming and lemmatization do not necessarily improve results, as at times we do not want to trim words but rather preserve their original form. Hence their usage differs from problem to problem. For this problem, I will not use these techniques.

In [None]:
# After preprocessing, the text format
def combine_text(list_of_text):
    '''Join a sequence of strings into one space-separated string.'''
    separator = ' '
    return separator.join(list_of_text)

# Turn each token list back into a single string per tweet
df['text'] = df['text'].apply(combine_text)
df.head()

Getting it all together- A Text Preprocessing Function

This concludes the pre-processing part. It will be prudent to convert all the steps undertaken into a function for better reusability.

In [None]:
# text preprocessing function
def text_preprocessing(text, stop_words=None):
    """
    Clean, tokenize and stopword-filter one piece of text.

    Parameters
    ----------
    text : str
        Raw text to process.
    stop_words : collection of str, optional
        Words to drop after tokenization. Defaults to NLTK's English
        stopword list.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        # Build the lookup set once per call; the original re-read the NLTK
        # list for every token.
        stop_words = set(stopwords.words('english'))

    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    # Named kept_tokens so it does not shadow the remove_stopwords function.
    kept_tokens = [w for w in tokenized_text if w not in stop_words]
    combined_text = ' '.join(kept_tokens)
    return combined_text

In [None]:
# Separate the model input (preprocessed text) from the label to predict
X, y = df['text'], df['target']

In [None]:
# First five feature rows
X.iloc[:5]

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Bag-of-words counts: fit the vocabulary on the training split only,
# then reuse it to transform the test split.
vectorizer = CountVectorizer()
x_train_vectors = vectorizer.fit_transform(X_train)
x_test_vectors = vectorizer.transform(X_test)

In [None]:
# First five training rows
X_train.iloc[:5]

In [None]:
# Preview only the first five rows in dense form. Densifying the whole sparse
# document-term matrix just to display it can exhaust memory.
x_train_vectors[:5].todense()

In [None]:
# TF-IDF over unigrams and bigrams. min_df=2 ignores terms found in fewer than
# two documents; max_df=0.5 ignores terms found in more than half of them.
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.5,
    ngram_range=(1, 2),
)
train_tfidf = tfidf.fit_transform(X_train)
test_tfidf = tfidf.transform(X_test)

In [None]:
# Display the repr of the TF-IDF matrix built from X_train
train_tfidf

In [None]:
# Display the repr of the TF-IDF matrix built from X_test
test_tfidf

In [None]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C=0.5, random_state=1, class_weight={0: 1, 1: 5})
clf.fit(x_train_vectors, y_train)

# Predict once per split and reuse the labels for every label-based metric.
pred_train = clf.predict(x_train_vectors)
pred = clf.predict(x_test_vectors)

accuracy_score_train = metrics.accuracy_score(y_train, pred_train)
print(accuracy_score_train)
accuracy_score_test = metrics.accuracy_score(y_test, pred)
print(accuracy_score_test)

classification_report_train = metrics.classification_report(y_train, pred_train)
print(classification_report_train)
classification_report_test = metrics.classification_report(y_test, pred)
print(classification_report_test)

# ROC AUC measures ranking quality, so score it on the predicted probability
# of the positive class (column 1), not on hard 0/1 labels, which reduce the
# curve to a single operating point.
roc_auc_score_train = metrics.roc_auc_score(y_train, clf.predict_proba(x_train_vectors)[:, 1])
print(roc_auc_score_train)
roc_auc_score_test = metrics.roc_auc_score(y_test, clf.predict_proba(x_test_vectors)[:, 1])
print(roc_auc_score_test)

confusion_matrix_train = metrics.confusion_matrix(y_train, pred_train)
print(confusion_matrix_train)
confusion_matrix_test = metrics.confusion_matrix(y_test, pred)
print(confusion_matrix_test)

In [None]:
# Fitting A Logistic Regression model on TF-IDF
clf_tfidf = LogisticRegression(C=1.0, random_state=1, class_weight={0: 1, 1: 5})
clf_tfidf.fit(train_tfidf, y_train)

# Predict once per split and reuse the labels for every label-based metric.
pred_train = clf_tfidf.predict(train_tfidf)
pred = clf_tfidf.predict(test_tfidf)

accuracy_score_train = metrics.accuracy_score(y_train, pred_train)
print(accuracy_score_train)
accuracy_score_test = metrics.accuracy_score(y_test, pred)
print(accuracy_score_test)

classification_report_train = metrics.classification_report(y_train, pred_train)
print(classification_report_train)
classification_report_test = metrics.classification_report(y_test, pred)
print(classification_report_test)

confusion_matrix_train = metrics.confusion_matrix(y_train, pred_train)
print(confusion_matrix_train)
confusion_matrix_test = metrics.confusion_matrix(y_test, pred)
print(confusion_matrix_test)

# ROC AUC measures ranking quality, so score it on the predicted probability
# of the positive class (column 1), not on hard 0/1 labels, which reduce the
# curve to a single operating point.
roc_auc_score_train = metrics.roc_auc_score(y_train, clf_tfidf.predict_proba(train_tfidf)[:, 1])
print(roc_auc_score_train)
roc_auc_score_test = metrics.roc_auc_score(y_test, clf_tfidf.predict_proba(test_tfidf)[:, 1])
print(roc_auc_score_test)

# Naive Bayes Classifier

Well, this is a decent score. Let's try with another model that is said to work well with text data : Naive Bayes.

In [None]:
# Multinomial Naive Bayes on raw token counts
clf_naive = MultinomialNB(alpha=0.2, fit_prior=False)
clf_naive.fit(x_train_vectors, y_train)

# Predict once per split and reuse the labels for every label-based metric.
pred_train = clf_naive.predict(x_train_vectors)
pred = clf_naive.predict(x_test_vectors)

accuracy_score_train = metrics.accuracy_score(y_train, pred_train)
print(accuracy_score_train)
accuracy_score_test = metrics.accuracy_score(y_test, pred)
print(accuracy_score_test)

classification_report_train = metrics.classification_report(y_train, pred_train)
print(classification_report_train)
classification_report_test = metrics.classification_report(y_test, pred)
print(classification_report_test)

# ROC AUC measures ranking quality, so score it on the predicted probability
# of the positive class (column 1), not on hard 0/1 labels, which reduce the
# curve to a single operating point.
roc_auc_score_train = metrics.roc_auc_score(y_train, clf_naive.predict_proba(x_train_vectors)[:, 1])
print(roc_auc_score_train)
roc_auc_score_test = metrics.roc_auc_score(y_test, clf_naive.predict_proba(x_test_vectors)[:, 1])
print(roc_auc_score_test)

confusion_matrix_train = metrics.confusion_matrix(y_train, pred_train)
print(confusion_matrix_train)
confusion_matrix_test = metrics.confusion_matrix(y_test, pred)
print(confusion_matrix_test)

In [None]:
# Multinomial Naive Bayes on TF-IDF features
clf_naive_tf = MultinomialNB(alpha=0.2, fit_prior=False)
clf_naive_tf.fit(train_tfidf, y_train)

# Predict once per split and reuse the labels for every label-based metric.
pred_train = clf_naive_tf.predict(train_tfidf)
pred = clf_naive_tf.predict(test_tfidf)

accuracy_score_train = metrics.accuracy_score(y_train, pred_train)
print(accuracy_score_train)
accuracy_score_test = metrics.accuracy_score(y_test, pred)
print(accuracy_score_test)

classification_report_train = metrics.classification_report(y_train, pred_train)
print(classification_report_train)
classification_report_test = metrics.classification_report(y_test, pred)
print(classification_report_test)

# ROC AUC measures ranking quality, so score it on the predicted probability
# of the positive class (column 1), not on hard 0/1 labels, which reduce the
# curve to a single operating point.
roc_auc_score_train = metrics.roc_auc_score(y_train, clf_naive_tf.predict_proba(train_tfidf)[:, 1])
print(roc_auc_score_train)
roc_auc_score_test = metrics.roc_auc_score(y_test, clf_naive_tf.predict_proba(test_tfidf)[:, 1])
print(roc_auc_score_test)

confusion_matrix_train = metrics.confusion_matrix(y_train, pred_train)
print(confusion_matrix_train)
confusion_matrix_test = metrics.confusion_matrix(y_test, pred)
print(confusion_matrix_test)