In [None]:
#Disaster tweet binary classification

In [None]:
#import libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

import xgboost as xgb
import spacy

In [None]:
# Show every file that Kaggle mounted under the input directory
for file_path in (os.path.join(root, name)
                  for root, _, names in os.walk('/kaggle/input')
                  for name in names):
    print(file_path)

# Competition data: the labelled training tweets and the test tweets
path = "/kaggle/input/nlp-getting-started/"
train_data, test_data = (pd.read_csv(os.path.join(path, csv_name))
                         for csv_name in ('train.csv', 'test.csv'))

train_data.head()

# **EDA**

In [None]:
# Number of missing values in each column
train_data.isnull().sum()

In [None]:
# EDA: keep only the first occurrence of each (text, target) pair
dedup_keys = ['text', 'target']
train_data = train_data.drop_duplicates(subset=dedup_keys, keep='first')

train_data.shape

In [None]:
# Character-length distribution of the tweets, one figure per split
# (alpha=None is matplotlib's default, i.e. fully opaque bars for train)
for frame, split_name, opacity in ((train_data, 'train', None),
                                   (test_data, 'test', 0.5)):
    plt.hist(frame.text.str.len(), bins=20, label=split_name, alpha=opacity)
    plt.show()

In [None]:
# Class balance of the binary target
plt.hist(train_data.target)
plt.xticks(ticks=[0, 1])
plt.show()

In [None]:
# Keyword column: the 30 most frequent keywords and how often each occurs
top_keywords = train_data['keyword'].value_counts()[:30]

plt.figure(figsize=(20, 10))
sns.barplot(y=top_keywords.index, x=top_keywords, orient='h')


In [None]:
# Keyword frequencies among the tweets labelled as non-disaster (target == 0)
df = train_data.loc[train_data['target'] == 0, 'keyword'].value_counts()

df.head(20)

In [None]:
# Non-disaster tweets carrying the keyword 'explode'
train_data.loc[train_data['target'].eq(0) & train_data['keyword'].eq('explode')].head()

In [None]:
# Non-disaster tweets carrying the keyword 'deluge'
train_data.loc[train_data['target'].eq(0) & train_data['keyword'].eq('deluge')].head()

##Non-disaster keywords also look similar to disaster keywords even though the tweet might be benign.

##So this won't be a useful feature for the model and might lead to false positives.

##Dropping id, keyword and location (due to large no of NaNs)

In [None]:
# Model input is the tweet text only (kept as a one-column frame); the label is `target`
train_x = train_data.loc[:, ['text']].copy()
train_y = train_data.loc[:, 'target'].copy()


In [None]:
# Text cleaning and extraction checklist
#   - change to lower case
#   - remove html
#   - extract hashtags
#   - remove special char
#   - remove numbers/words with numbers
#   - remove stop words
#   - stemming
#   - lemmatization

# English stopwords from NLTK, minus the negations listed below so that
# they are not filtered out as stopwords
stopw = set(stopwords.words('english')) - {
    "won't", "aren't", 'nor', 'not', 'no', "isn't", "couldn't",
    "hasn't", "hadn't", "haven't", "didn't", "doesn't", "wouldn't", "can't",
}
    
def text_process(text, stopw=None):
    """Normalise a raw tweet into a space-separated string of root tokens.

    Steps, in order: lowercase, strip URLs, drop every character that is
    not an ASCII letter or whitespace, remove stopwords, Porter-stem, then
    WordNet-lemmatise each remaining token.

    Parameters
    ----------
    text : str
        Raw tweet text.
    stopw : collection of str, optional
        Words to discard after cleaning. ``None`` (the default) removes
        nothing.

    Returns
    -------
    str
        The processed tokens joined by single spaces; an empty string when
        no token survives.
    """
    if stopw is None:
        stopw = ()

    text = text.lower()

    # Remove URLs. The "s" of the scheme is optional so plain http:// links
    # are stripped as well, and "www" has to start a token and be followed
    # by a dot so that words merely containing "www" are left alone.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)

    # Remove numbers, punctuation and any other non-letter character
    text = re.sub(r'[^A-Za-z\s]+', '', text)

    # Stopword removal (runs after punctuation was stripped, so apostrophes
    # are already gone from contractions at this point)
    text_list = [word for word in text.split() if word not in stopw]

    # Stemming the words
    port_stem = PorterStemmer()
    text_list = [port_stem.stem(word) for word in text_list]

    # Lemmatisation to get the root word
    lem = WordNetLemmatizer()
    text_list = [lem.lemmatize(word) for word in text_list]

    # Back to a single string
    return " ".join(text_list)

# Clean every training tweet; Series.apply forwards `stopw` to text_process
train_x['text'] = train_x['text'].apply(text_process, stopw=stopw)


# **Create numerical features based on word counts in the tweet**

In [None]:
class ExtraAttributes(BaseEstimator, TransformerMixin):
    """Derive simple word-count features from the ``text`` column.

    Parameters
    ----------
    add_frac_nonstop : bool, default True
        Emit the fraction of whitespace-separated tokens that are not in
        ``stopw``.
    add_avg_word_len : bool, default True
        Emit the mean character length of the tokens that are not in
        ``stopw``.
    stopw : collection of str
        Stopword list; ``None`` is treated as an empty list.

    Notes
    -----
    On text whose stopwords were already removed with the same list, the
    fraction is 1.0 for every non-empty row, so feed this transformer text
    that still contains its stopwords if that feature should be informative.
    """

    def __init__(self, add_frac_nonstop=True, add_avg_word_len=True, stopw=stopw):
        self.add_frac_nonstop = add_frac_nonstop
        self.add_avg_word_len = add_avg_word_len
        self.stopw = stopw

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the enabled features as a 2-D array, one row per row of X.

        Columns appear in the order ``frac_nonstop``, ``avg_word_len``
        (each only if enabled). ``X`` itself is left untouched. Rows with no
        usable token get 0.0 instead of raising or producing NaN.
        """
        # Use the instance's stopwords rather than the notebook-level global
        stop = self.stopw if self.stopw is not None else ()

        def frac_nonstop(text):
            tokens = text.split()
            if not tokens:
                return 0.0
            return sum(token not in stop for token in tokens) / len(tokens)

        def avg_word_len(text):
            lengths = [len(token) for token in text.split() if token not in stop]
            return float(np.mean(lengths)) if lengths else 0.0

        # Build the features on the side so the caller's frame is not mutated
        columns = []
        if self.add_frac_nonstop:
            columns.append(X['text'].apply(frac_nonstop))
        if self.add_avg_word_len:
            columns.append(X['text'].apply(avg_word_len))

        if not columns:
            return np.empty((len(X), 0))
        return np.column_stack(columns)
                                                

In [None]:
# Sanity check: shape of the feature matrix built from the training text
ExtraAttributes().fit(train_x).transform(train_x).shape

# **Using Spacy's pretrained large word 2 vec model for creating word embeddings**

Other approaches can be explored as well:

 - TFIDF (simpler approach of using normalized word frequencies, context of the tweet might not be captured)
 - Word embedding approaches like Word2Vec, Glove and BERT preserve the context and hence perform better

In [None]:
# Load spaCy's large English model; it supplies the pretrained vectors used for the tweet embeddings
nlp = spacy.load("en_core_web_lg")

class w2v_spacy(BaseEstimator, TransformerMixin):
    """Embed each document of the ``text`` column as its spaCy ``Doc.vector``.

    Parameters
    ----------
    nlp : spaCy language pipeline
        Loaded model used to process the texts.
    """

    def __init__(self, nlp):
        self.nlp = nlp
        # Kept for compatibility with existing readers of this attribute.
        # NOTE(review): 300 presumably matches the loaded model's vector width — confirm
        self.dim = 300

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return one row per document, holding that document's vector."""
        # nlp.pipe processes the texts as a batched stream, which avoids the
        # per-call overhead of invoking nlp(text) once for every row
        docs = self.nlp.pipe(X['text'])
        return np.array([doc.vector for doc in docs])

In [None]:

# Hold out 10% as a validation set, stratified on the target variable
split_options = dict(test_size=0.1, stratify=train_y, random_state=42)
X, val_x, y, val_y = train_test_split(train_x, train_y, **split_options)

In [None]:
# Share of non-disaster tweets in the validation split
int((val_y == 0).sum()) / len(val_y)

In [None]:
# Share of non-disaster tweets in the training split
int((y == 0).sum()) / len(y)

# Pipeline for feature preparation and classification

In [None]:
# Embedding branch: spaCy document vectors reduced to 50 SVD components
embeddings_pipeline = Pipeline(steps=[
    ("word2vec", w2v_spacy(nlp)),
    ("dim_reduce", TruncatedSVD(n_components=50)),
])

# Both branches read the 'text' column; their outputs are concatenated.
# The step names are referenced by parameter-search keys, so they stay as is.
feature_prep = ColumnTransformer(transformers=[
    ('numerical', ExtraAttributes(), ['text']),
    ('categorical', embeddings_pipeline, ['text']),
])

# Feature preparation followed by a random forest classifier
full_pipeline = Pipeline(steps=[
    ('feature_prep', feature_prep),
    ("classifier", RandomForestClassifier(random_state=42)),
])

In [None]:
# full_pipeline.fit(X, y)
# y_pred = full_pipeline.predict(val_x)
# cr = classification_report(val_y, y_pred)

In [None]:
# Embeddings-only pipeline: word embeddings for the tweets, dimensionality
# reduction, then a Random Forest classifier
embedding_rf_steps = [
    ("word2vec", w2v_spacy(nlp)),
    ("dim_reduce", TruncatedSVD(n_components=50)),
    ("classifier", RandomForestClassifier(random_state=42)),
]
# fit() returns the fitted pipeline itself, so it can be assigned directly
embeddings_pipeline_test = Pipeline(steps=embedding_rf_steps).fit(X, y)

# Score on the held-out validation split
y_pred = embeddings_pipeline_test.predict(val_x)
cr = classification_report(y_true=val_y, y_pred=y_pred)

In [None]:
# Same report on the data the model was fitted on, to compare against validation
y_pred = embeddings_pipeline_test.predict(X)
train_cr = classification_report(y_true=y, y_pred=y_pred)

train_cr

In [None]:
# Training-set classification report (printed so its line breaks render)
print(train_cr)

In [None]:
# Validation-set classification report (printed so its line breaks render)
print(cr)

# Random search of hyper parameters with Xgboost classifier

In [None]:

params = {
    "feature_prep__categorical__dim_reduce": ["passthrough", TruncatedSVD(20), TruncatedSVD(50)],
    "classifier__max_depth": [7], #, 11],
    "classifier__learning_rate": [0.1], # [1, 0.1, 0.01],
    "classifier__n_estimators": [50], #, 100],
}


full_pipeline = Pipeline([
        ('feature_prep', feature_prep),
        ("classifier", xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc')),
    ])


%timeit
print("Searching..")
model = RandomizedSearchCV(full_pipeline, param_distributions=params, 
                           scoring='f1', n_iter=5, cv=3, verbose=8)
model.fit(X, y)

In [None]:
# Best cross-validated score found by the search (scoring='f1')
model.best_score_

In [None]:
# The search object predicts with its best pipeline
# (presumably refit on all of X, y, as is RandomizedSearchCV's default — confirm via model.refit)
y_pred = model.predict(val_x)

# Per-class precision / recall / F1 on the held-out validation split
cr = classification_report(y_true=val_y, y_pred=y_pred)
print(cr)

In [None]:
# Parameter combination selected by the search
model.best_params_

In [None]:
# Confusion matrix on the validation split (rows: true class, columns: predicted)
confusion_matrix(y_true=val_y, y_pred=y_pred)

# Summary:

 - Designed and implemented a pipeline to predict if a tweet is on a disaster
 - Data Cleaning:
     - Removed hyperlinks, punctuations, special chars and numbers from the tweet
     - Dropped id, location and keyword columns; as they were found to be not useful
 - Features used:
      - Embeddings based on pretrained word 2 vec for the tweets. Experimented with small, medium and large word2vec models of spacy, where large provides the best results, however it is very memory intensive
      - Truncated SVD used to reduce the dimensionality of the data
      - Numerical features based on the average word length in a tweet and fraction of non stopwords
 - Pipelines created for numerical features and text based features
 - Columntransformer used to concatenate the features
 - A random forest classifier was trained on the training data
 - Also, an XGBoost classifier was trained with a random hyperparameter search, where feature engineering params can be tuned via the params grid
 - Performance of the best classifier from random search is evaluated using the validation set (created with stratified split)
 - Recall of the positive class (disaster) is found to be low, especially in the case of the small word 2 vec model
 - Random forest model trained without cross validation is overfitting the train set, and performs poorly on the validation set
 - Xgboost model trained with cross validation with the parameter grid selected is underfit on the training set and hence the performance can be further improved by increasing the tree depth, max leaves or n_estimators.
 
# Further improvements:

- More features can be created by extracting hashtags, url text, emojis and sentiments from the tweet
- Further analysis of the keyword column could allow for the creation of a more useful feature based on it.
- Character embeddings can be generated for special characters
- Text can be vectorized with BERT, which could possibly improve the performance by better capturing the contextual meaning in the tweet. The importance of embeddings is clear from the difference between the small and large spacy word2vec models.
- Scope for better tuning of the Xgboost model
- A sequence-based model like an LSTM can be used, which would learn the contextual meaning in the tweet more accurately