In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file below it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [14]:
!pip install contractions
import re, unicodedata, contractions, collections
import numpy as np 
import pandas as pd 
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from nltk import FreqDist
import seaborn as sns

In [15]:
# Load the training CSV from the Kaggle input mount into `df`.
# NOTE(review): the path contains a "kaggle nlp" sub-folder — confirm it matches the attached dataset layout.
df = pd.read_csv('/kaggle/input/natural-language-processing-with-disaster-tweets/kaggle nlp/train.csv')
# Preview the first rows.
df.head()

# Preliminary Analysis of Data

In [None]:
# Checking for null values: number of missing entries in each column
df.isnull().sum()

In [None]:
# For figuring out how many null values are in which category
# Count, per target class, the rows whose keyword / location is missing.
# Grouping on the target label keeps every count attached to its own class;
# taking `.value_counts().values` of separately filtered series would order each
# array by frequency, so the numbers could land on the wrong class label
# (and the construction would fail if one class had no missing rows at all).
temp_df = (
    df[['keyword', 'location']]
    .isnull()
    .groupby(df['target'])
    .sum()
    # keep the bars in the same class order as the overall target frequency
    .reindex(df.target.value_counts().index)
)
temp_df.plot.bar()

In [None]:
# getting the non null values and joining them into one space-separated string
text = " ".join(df.loc[df.location.notnull(), 'location'].values)

# The word-cloud stop words get their own name: rebinding `stopwords` here would
# shadow the `nltk.corpus.stopwords` module imported at the top of the notebook
# and break the `stopwords.words('english')` call used during preprocessing.
wordcloud_stopwords = set(STOPWORDS)

# creating a wordcloud for visualization
wordcloud = WordCloud(stopwords=wordcloud_stopwords, background_color="white").generate(text)
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Creating a word cloud from frequencies 

# keyword -> number of rows carrying it, taken from a single value_counts() result
keyword_counts = df['keyword'].value_counts()
values = keyword_counts.keys().tolist()
counts = keyword_counts.tolist()
freq = {keyword: count for keyword, count in zip(values, counts)}

# Feeding it to the WordCloud object already bound to `wordcloud`
wordcloud = wordcloud.generate_from_frequencies(freq)
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Function to figure out if a keyword actually exist in the text column
def match(x, y):
    """Return 1 if keyword ``y`` occurs in text ``x`` (case-insensitive), else 0.

    Parameters
    ----------
    x : str
        The text to search in.
    y : str or missing value
        The keyword to look for. A missing keyword (``None`` / ``NaN``) never
        matches; it is answered directly instead of being replaced by a
        placeholder string that could itself occur in the text.

    Returns
    -------
    int
        1 when the lower-cased keyword is a substring of the lower-cased text,
        otherwise 0.
    """
    if pd.isnull(y):
        return 0
    return int(y.lower() in x.lower())
    
# making a new match column in the df
# zip pairs the two columns positionally, so this does not depend on df having
# a default 0..n-1 index (label lookups like df.text[i] would).
df['match'] = [match(text_value, keyword) for text_value, keyword in zip(df.text, df.keyword)]

# Cases where keyword exist but there is no match, replacing keywords with nan values
# A single .loc assignment writes into df itself; the chained form
# df['keyword'][mask] = ... may assign to a temporary copy and leave df unchanged.
df.loc[(df.match == 0) & (df.keyword.notnull()), 'keyword'] = np.nan

# dropping match column 
df = df.drop(columns='match')

In [None]:
def _split_words(value):
    """Whitespace-split the string form of ``value``."""
    return str(value).split()

# number of words in each text
df['word_count'] = df['text'].apply(lambda value: len(_split_words(value)))

# number of distinct words in each text
df['unique_words'] = df['text'].apply(lambda value: len(set(_split_words(value))))

# number of words found in the wordcloud STOPWORDS set (exact, case-sensitive membership)
df['stopword_count'] = df['text'].apply(
    lambda value: sum(1 for word in _split_words(value) if word in STOPWORDS)
)

# number of characters in each text
df['char_len'] = df['text'].apply(lambda value: len(str(value)))

# this can also be done for punctuations, mentions etc. 

In [None]:
# Names of the engineered per-text columns (word/unique/stopword counts and character length)
features = ['word_count','unique_words', 'stopword_count', 'char_len']

In [None]:
# One subplot per feature, each overlaying the two target classes.
fig, axes = plt.subplots(ncols=1, nrows=len(features), figsize=(8, 10), dpi=100)

# (legend label, target value, colour) for every class drawn on each axis
target_styles = [('Disaster', 1, 'red'), ('Not Disaster', 0, 'blue')]

# np.atleast_1d keeps the loop valid even if `features` holds a single name
# (plt.subplots then returns one Axes instead of an array).
for ax, fea in zip(np.atleast_1d(axes), features):
    for label, target_value, colour in target_styles:
        # sns.distplot is deprecated; histplot with a KDE overlay on a density
        # scale is the documented replacement for its default output.
        sns.histplot(
            df.loc[df.target == target_value, fea],
            ax=ax,
            label=label,
            color=colour,
            kde=True,
            stat='density',
            kde_kws=dict(cut=3),
        )
    ax.legend()
    ax.tick_params(axis='x', labelsize=8)
    ax.tick_params(axis='y', labelsize=8)

    ax.set_title(f'{fea} Target Distribution in Training Set', fontsize=8)

fig.tight_layout()
plt.show()

    


# Using pretrained GloVe word embeddings to find words missing from the embedding vocabulary

In [None]:
# used to delete output zip files
import os

glove_zip = "/kaggle/working/glove.6B.zip"
# Only delete the archive when it is actually there: on a fresh session nothing
# has been downloaded yet and an unconditional os.remove raises FileNotFoundError,
# which stops a top-to-bottom run of the notebook.
if os.path.exists(glove_zip):
    os.remove(glove_zip)


In [43]:
# Using a pretrained glove embedding

# Download the GloVe 6B archive.
# NOTE(review): wget saves into the current directory; the unzip path below assumes that is /kaggle/working — confirm.
!wget http://nlp.stanford.edu/data/glove.6B.zip
# Extract the archive into /content/, the directory the embedding file is read from afterwards.
!unzip "/kaggle/working/glove.6B.zip" -d "/content/"

In [44]:
# taking out the embedding dictionary for glove 
# Each non-empty line is "<word> <coef> <coef> ..."; the result maps word -> float32 vector.

emmbed_dict = {}
# Name the encoding explicitly: without it open() falls back to the platform
# default, which can fail on (or mis-decode) non-ASCII tokens in the file.
with open('/content/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # tolerate blank lines instead of crashing on fields[0]
            continue
        word, coefficients = fields[0], fields[1:]
        emmbed_dict[word] = np.asarray(coefficients, 'float32')


In [45]:
# making a list of all words that dont exist in the glove corpus.
# Count every lower-cased, whitespace-separated token that has no entry in emmbed_dict.
oov_counts = Counter(
    token
    for tweet in df.text
    for token in (raw.lower() for raw in tweet.split())
    if token not in emmbed_dict
)

# sorting by count (descending); ties keep first-seen order
wrong_words = sorted(oov_counts.items(), key=lambda item: item[1], reverse=True)

In [46]:
# Report how many distinct tokens (entries of wrong_words) were not found in the GloVe vocabulary
print('Total length of the words not in corpus of GLOVE pretrained model %d.'%(len(wrong_words)))

In [127]:
# plotting the top 100 wrong words with their count
top_wrong = wrong_words[:100]
x = [word for word, _count in top_wrong]
y = [count for _word, count in top_wrong]

# horizontal bars: counts along the x axis, one word per row on the y axis
fig, axes = plt.subplots(figsize=(8, 20), dpi=100)
sns.barplot(x=y, y=x, ax=axes)
axes.tick_params(axis='y', labelsize=8)
  

# Text Preprocessing

In [47]:
# lower-casing every word of the text -- "Some TEXT" becomes "some text"
def small_word(x):
    """Lower-case each whitespace-separated word of ``x`` and re-join with single spaces."""
    return ' '.join(map(str.lower, x.split()))

# Overwrite the text column with the output of small_word for every row
df.text = df.text.apply(small_word)

In [48]:
# Removing URLs (http://, https:// or www. prefixed) from the text
def remove_URL(x):
    """Strip URL matches from every whitespace-separated word of ``x``.

    Words are re-joined with single spaces; a word that was entirely a URL
    becomes an empty string, so its surrounding spaces remain.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    cleaned_words = (url_pattern.sub("", token) for token in x.split())
    return ' '.join(cleaned_words)

# Overwrite the text column with the output of remove_URL for every row
df.text = df.text.apply(remove_URL)

In [49]:
# replace accented letters with their plain ASCII base letters -- "Sómě" becomes "Some"
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII.

    The text is NFKD-normalised, encoded to ASCII with unencodable characters
    ignored, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Overwrite the text column with the output of remove_accented_chars for every row
df.text = df.text.apply(remove_accented_chars)

In [50]:
# removing @mentions, all special characters (non english letters also), emoji etc
def rem_ch(x):
    """Clean every whitespace-separated word of ``x`` and re-join with single spaces.

    Removed from each word, tried in this order:
      * ``@`` followed by letters/digits (a mention) -- the character class is
        written unescaped; with ``\\[`` the bracket was a literal and mentions
        were never matched, so only the ``@`` sign got stripped;
      * any character that is not an ASCII letter, digit, space or tab;
      * ``scheme://...`` style links;
      * the standalone token ``rt`` -- anchored with ``\\b`` so that words which
        merely start with "rt" are left intact;
      * ``http`` plus the one character following it.

    A word that is removed entirely leaves an empty string, so its surrounding
    spaces remain in the result.
    """
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt\b|http.+?")
    return ' '.join(noise.sub("", word) for word in x.split())

# Overwrite the text column with the output of rem_ch for every row
df.text = df.text.apply(rem_ch)

#function to tokenize 
def tok(x):
    """Tokenize ``x`` with NLTK's ``word_tokenize`` and return its result."""
    return word_tokenize(x)

#tokenizing the column: each entry of df['text'] is replaced by tok's output for that row
df['text'] = df['text'].apply(tok)

In [51]:
# Getting rid of the stopwords 
def stopword(x):
    """Lower-case the words of ``x``, drop English stop words, return one string.

    Parameters
    ----------
    x : str or sequence of str
        Either a text (split on whitespace) or an already tokenized list, which
        is what the text column holds after ``tok`` has been applied to it.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces.
    """
    tokens = x.split() if isinstance(x, str) else x
    lowered = (token.lower() for token in tokens)
    # `stop_words` is the set built once next to the imports; looking words up
    # there avoids rebuilding the NLTK list for every single word and does not
    # depend on the name `stopwords` still pointing at the NLTK corpus module.
    return ' '.join(token for token in lowered if token not in stop_words)

# Applying it to data 
df.text = df.text.apply(stopword)

#### function to stem 
def stem(x):
    """Stem every word of ``x`` with NLTK's ``LancasterStemmer``.

    Parameters
    ----------
    x : str or sequence of str
        A text (split on whitespace) or a list of words. A plain string must be
        split first: iterating it directly walks over single characters, so the
        stemmer would be applied to letters instead of words.

    Returns
    -------
    list of str
        One stemmed token per input word.
    """
    words = x.split() if isinstance(x, str) else x
    stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in words]

#### applying it to data
df.text = df.text.apply(stem)

In [52]:
# function to lemmatize
def lemm(x):
    """Lemmatize every word of ``x`` with NLTK's ``WordNetLemmatizer``.

    Parameters
    ----------
    x : str or sequence of str
        A text (split on whitespace) or a list of words, which is what ``stem``
        returns; calling ``.split()`` on such a list would raise.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    words = x.split() if isinstance(x, str) else x
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# applying it to data
df.text = df.text.apply(lemm)

## Incorrect words analysis after text preprocessing

In [53]:
# making a list of all words that dont exist in the glove corpus.
# Count every whitespace-separated token of the current text column that has no entry in emmbed_dict.
oov_counts_2 = Counter(
    token
    for document in df.text
    for token in document.split()
    if token not in emmbed_dict
)

# sorting by count (descending); ties keep first-seen order
wrong_words_2 = sorted(oov_counts_2.items(), key=lambda item: item[1], reverse=True)

In [69]:
# Print the number of entries in wrong_words_2 next to the number of entries in wrong_words
print('Total length of the words not in corpus of GLOVE pretrained model after changes made ' + str(len(wrong_words_2)) +
     '. Previously, it was ' + str(len(wrong_words)) + '.')

In [None]:
# Word Cloud for the text 
# every entry of the text column, joined into one space-separated string
n_text = " ".join(df.text)

text_cloud = WordCloud(background_color="white")
wordcloud = text_cloud.generate(n_text)
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Making a dictionary with word frequency
# W: the whitespace-separated tokens of the joined text n_text
W = n_text.split()
# fdist: NLTK FreqDist built from that token list
fdist = FreqDist(W)

In [73]:
# Tfidf vectorization 
# X: TF-IDF matrix fitted on and computed from the whole text column; y: the target values.
# NOTE(review): the vectorizer is fitted on every row before the train/test split, so the
# vocabulary and IDF weights also see the rows later used for evaluation — consider fitting on the training rows only.

from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
X = v.fit_transform(df['text'])
y = df.target.values

In [74]:
# splitting the data: 80% train / 20% test, with a fixed random_state so the split is repeatable
# NOTE(review): X.toarray() materialises the full matrix as a dense array before splitting — watch memory use.
X_train, X_test, y_train, y_test = train_test_split(X.toarray(),y, test_size=0.2, random_state= 42)

In [75]:
# training on XGBoost 
# An XGBClassifier with its default constructor arguments, fitted on the training split.

model = XGBClassifier()
model.fit(X_train, y_train)

In [76]:
# Predictions of the fitted model for the held-out test split
y_pred = model.predict(X_test)

In [77]:
# This version leads to f1 score of 0.77
# Print the classification report for the test labels versus the predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
