In [None]:
import numpy as np 
import pandas as pd
import os

'''
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''

In [None]:
# load datasets
# train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
# test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

train = pd.read_csv('Data/disaster_tweets_kaggle/train.csv')
test = pd.read_csv('Data/disaster_tweets_kaggle/test.csv')

In [None]:
train.head()

In [None]:
train.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
train.isnull().sum()

In [None]:
# let's check if the training data is balanced
sns.countplot(x = 'target', data = train)

In [None]:
# visualizing null values
sns.heatmap(train.isnull(),yticklabels = False)

There are too many null values in the location column, so it will be dropped.

In [None]:
train.drop(['id', 'location'], axis = 1, inplace = True)
train.info()

In [None]:
train.groupby('keyword').count()

Keywords can be helpful in classification, we will drop the rows with missing keywords as there are very few of them.

In [None]:
train.dropna(axis = 0, inplace = True)
train.reset_index(drop = True, inplace = True)
train.info()

In [None]:
train.tail(10)

In [None]:
# let's check the test data
test.info()

## Training Data Analysis

To decide what strategy to use for data processing and what type of a model to use, the training data must be thoroughly analyzed. Firstly, we will check if all keywords are related to disaster and formulate a method to identify which ones are more relevant with the help of the associated target value.

In [None]:
# create a series of unique keywords
pd.Series(train.groupby('keyword').count().index, name = 'keyword')

As most keywords seem to be associated with disasters, we need to perform feature engineering to extract useful information from the distribution of these keywords with respect to the target class.

### Text preprocessing using NLTK
Each keyword will be converted into its root form so that keywords like 'blood' and 'bloody' will be considered the same keyword. The same goes for all keywords that are closely related in the same manner.

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# create a stemmer
stemmer = PorterStemmer()

In [None]:
# create a column with the stemmed keyword; Series.apply keeps the result on
# train's own index, so rows stay aligned even if the index is not 0..n-1
train['stemmed_keyword'] = train['keyword'].apply(stemmer.stem)

In [None]:
# split a sentence into its tokens
def get_tokens(text):
    """Return the tokens that NLTK's word_tokenize produces for `text`."""
    tokens = word_tokenize(text)
    return tokens

# calculates the disaster association coefficient for a keyword
def get_disaster_association(keyword, data=None):
    """Return the share of tweets carrying `keyword` that are real disasters.

    Parameters
    ----------
    keyword : str
        Stemmed keyword, compared against the 'stemmed_keyword' column.
    data : pandas.DataFrame, optional
        Frame with 'stemmed_keyword' and 'target' columns. Defaults to the
        notebook-level `train` frame.

    Returns
    -------
    float
        (rows with target == 1) / (rows with target == 0 or 1) among the rows
        whose stemmed keyword equals `keyword`; NaN when no such row exists.
    """
    if data is None:
        data = train

    # select the targets of the matching rows once instead of filtering twice
    targets = data.loc[data['stemmed_keyword'] == keyword, 'target']

    # number of times the keyword is associated with a true disaster tweet
    target_1_count = (targets == 1).sum()

    # number of times the keyword is not associated with a true disaster tweet
    target_0_count = (targets == 0).sum()

    total = target_0_count + target_1_count
    if total == 0:
        # unseen keyword: no evidence either way
        return float('nan')

    # association value = true association count / total
    return target_1_count / total

In [None]:
# create a column with each keyword's association to actual disaster tweets.
# It is named 'association_coeff' because the later groupby and KDE-plot cells
# read the column under that name.
# The coefficient only depends on the keyword, so compute it once per unique
# keyword and map it back onto the rows instead of recomputing it for every row.
coeff_by_keyword = {
    keyword: get_disaster_association(keyword)
    for keyword in train['stemmed_keyword'].unique()
}
train['association_coeff'] = train['stemmed_keyword'].map(coeff_by_keyword)

In [None]:
train.head(20)

In [None]:
# save association coefficients as a dictionary (stemmed keyword -> mean
# coefficient, highest first) for transformation
mean_coeff_by_keyword = train.groupby('stemmed_keyword')['association_coeff'].mean()
association_dict = mean_coeff_by_keyword.sort_values(ascending = False).to_dict()

### Identifying Location mentions in Tweet
Mentions of a certain area, city or country in the tweet can be useful in identifying whether the tweet is related to a disaster, as tweets with a location mentioned are more likely to be linked to a disaster. To implement this we will use the <b>geotext</b> package.

In [None]:
# install geotext 
# !pip install geotext

In [None]:
import geotext
from geotext import GeoText

### Text processing
Next, each text will be processed such that only important information is retained. For example, hyperlinks and hashtag symbols will be removed.

In [None]:
print(train['text'][7])
print(train['text'][8])
print(train['text'][9])
print(train['text'][10])
print(train['text'][11])

In [None]:
# scratch check of the cleaning steps on a single tweet (row 5)
c1 = train['text'][5].split('http')[0].strip()  # keep only the text before the first 'http'
c2 = ''.join(c1.split('#'))  # delete every '#' character
' '.join([word for word in c2.split() if '@' not in word])  # drop words containing '@'

In [None]:
def get_clean_text(raw_text):
    """Return a lowercase copy of a tweet without links, '#' symbols or mentions.

    Parameters
    ----------
    raw_text : str
        The original tweet text.

    Returns
    -------
    str
        The cleaned text. Everything from the first occurrence of 'http'
        onwards is discarded, every '#' character is removed, words containing
        '@' are dropped, whitespace runs collapse to single spaces and the
        result is lowercased.
    """
    # remove hyperlink: keep only what precedes the first 'http'
    text_c1 = raw_text.split('http')[0].strip()

    # remove hashtag symbols (the tagged word itself is kept)
    text_c2 = text_c1.replace('#', '')

    # remove usernames: any whitespace-separated word containing '@'
    text_c3 = ' '.join(word for word in text_c2.split() if '@' not in word)

    # convert text to lowercase
    return text_c3.lower()

## Sentiment Analysis (NLTK)
Although keywords can be an important factor in this type of classification, the overall sentiment of the tweet can also provide crucial information regarding the nature of the tweet. Although a custom model could be trained to predict sentiment, for this project I am using the <b>NLTK VADER</b> library, which relies on a rule-based sentiment analyzer.

In [None]:
# NLTK resources fetched for this notebook, one per line
nltk_resources = [
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]
nltk.download(nltk_resources)

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

In [None]:
# returns the negative, neutral and positive scores of a text as a Series
def get_sentiment_val(text):
    """Score `text` with the notebook-level `sia` analyzer.

    Returns a pandas Series holding the 'neg', 'neu' and 'pos' entries of
    `sia.polarity_scores(text)`, in that order.
    """
    scores = sia.polarity_scores(text)
    ordered = [scores[key] for key in ('neg', 'neu', 'pos')]
    return pd.Series(ordered)

In [None]:
train[['neg', 'neu', 'pos']] = train['text'].apply(get_sentiment_val)

In [None]:
train.head()

In [None]:
# save processed training data to avoid preprocessing time;
# the raw and intermediate text columns are left out of the saved file
columns_to_drop = ['keyword', 'text', 'stemmed_keyword']
train_features = train.drop(columns = columns_to_drop)
train_features.to_csv('Data/disaster_tweets_kaggle/cleaned_train.csv', index = False)

In [None]:
# load data
df = pd.read_csv('Data/disaster_tweets_kaggle/cleaned_train.csv')

In [None]:
df.info()

Analyze distribution with respect to different attributes

### Distribution of keyword association coefficient

In [None]:
plt.figure(figsize = (8,4))
# pass the frame as data= so the call does not depend on which parameter
# comes first in the installed seaborn version
sns.kdeplot(data = df, x = 'association_coeff', hue = 'target')
plt.title('Distribution of keyword association coefficient w.r.t. target labels')
plt.grid()

The above KDE plot shows that distribution of keyword association coefficients for non disaster tweets is more towards the left as opposed to that of disaster related tweets.

### Distribution of different sentiment tags

In [None]:
plt.figure(figsize = (8,4))
# pass the frame as data= so the call does not depend on which parameter
# comes first in the installed seaborn version
sns.kdeplot(data = df, x = 'pos', hue = 'target')
plt.title('Distribution of tweets with positive sentiment w.r.t. target labels')
plt.grid()

In [None]:
plt.figure(figsize = (8,4))
# pass the frame as data= so the call does not depend on which parameter
# comes first in the installed seaborn version
sns.kdeplot(data = df, x = 'neu', hue = 'target')
plt.title('Distribution of tweets with neutral sentiment w.r.t. target labels')
plt.grid()

In [None]:
plt.figure(figsize = (8,4))
# pass the frame as data= so the call does not depend on which parameter
# comes first in the installed seaborn version
sns.kdeplot(data = df, x = 'neg', hue = 'target')
plt.title('Distribution of tweets with negative sentiment w.r.t. target labels')
plt.grid()

As we can clearly see, both targets have similar distributions for all sentiment tags given by NLTK's VADER sentiment analysis package, so using these attributes for model training will not be too useful.

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# separate the features from the label. `df` is the cleaned frame loaded from
# cleaned_train.csv; every column except 'target' is kept as a feature
X = df.drop(columns = 'target')
y = df['target']

In [None]:
# fix the seed so the split (and every score computed from it) is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
X_train.head()

### Model Selection
As this is a classification problem, the simplest model that can be used is logistic regression. However, logistic regression only performs well if the classes are linearly separable. For data that is not linearly separable we can use KNN, SVC, or Random Forests. All these models will be implemented and the results for each will be compared.