# NLP with Disaster Tweets

Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies (e.g. disaster relief organizations and news agencies) are interested in programmatically monitoring Twitter.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from pylab import rcParams
rcParams['figure.figsize'] = 12,8
#sns.color_palette("hls", 8)

In [None]:
# Load the labelled training tweets and the unlabelled test tweets from the
# Kaggle input directory (paths are relative to the notebook's working dir).
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Keep the test ids for the submission file: `test` is later reindexed to the
# model's feature columns, which do not include `id`.
Id = test.id

In [None]:
train.shape  # (rows, columns); 7613 rows and 5 columns when this notebook was originally run

In [None]:
train.head()  # target: 1 = tweet about a real disaster, 0 = not a disaster

In [None]:
train.info() 

In [None]:
train.isnull().sum()

In [None]:
# Number of distinct values in the `id` column (7613 in the original run, i.e.
# one per row, so ids are unique).
# NOTE(review): the column is named `id`, not a user field; it presumably
# identifies the tweet rather than its author — confirm against the dataset docs.
len(set(train['id']))

In [None]:
# Class balance of the label (1 = real disaster tweet, 0 = not a disaster).
# The column is passed as `x=` because current seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=train['target'])
plt.show()
print(train['target'].value_counts())

# Keywords

In [None]:
# Bar chart of the 20 most frequently used keywords.
keyword_counts = train['keyword'].value_counts().sort_values(ascending=False)
top_keywords = keyword_counts.iloc[0:20].index
sns.countplot(y=train['keyword'], order=top_keywords)
plt.title("Count of Keywords")
plt.show()

In [None]:
# Share of disaster tweets per keyword (mean of the 0/1 target), computed once
# and reused for both the highest- and the lowest-ranked keywords.
keyword_disaster_rate = train.groupby('keyword')['target'].mean()
disastered_tweet = keyword_disaster_rate.sort_values(ascending=False).head(15)
non_disasterd = keyword_disaster_rate.sort_values().head(15)

plt.figure(figsize=(8,5))
# x and y are passed by keyword: current seaborn no longer accepts them positionally.
sns.barplot(x=disastered_tweet, y=disastered_tweet.index, color='red')
plt.title('Keywords with highest % of disaster tweets')
plt.show()

In [None]:
# Keywords whose tweets are least often real disasters (lowest mean target).
plt.figure(figsize=(8,5))
# x and y are passed by keyword: current seaborn no longer accepts them positionally.
sns.barplot(x=non_disasterd, y=non_disasterd.index, color='blue')
plt.title('Keywords with lowest % of disaster tweets')
plt.show()

# Locations

In [None]:
# Bar chart of the 15 most frequent raw location strings.
location_counts = train['location'].value_counts().sort_values(ascending=False)
sns.countplot(y=train['location'], order=location_counts.index[:15])

* As you can see, the data is not clean and needs cleansing. For example, US, USA and United States are the same location, so we need to merge them into a single value.

* Let's see which location has the most disaster tweets

In [None]:
# Disaster rate per location, restricted to locations that occur at least
# 10 times so that rarely used values do not dominate the ranking.
raw_loc = train['location'].value_counts()
top_loc_disaster = raw_loc[raw_loc.ge(10)].index.tolist()
top_only_disaster = train.loc[train['location'].isin(top_loc_disaster)]

# `top_location` is also read later by clean_location() to decide which raw
# values to keep unchanged.
location_rate = top_only_disaster.groupby('location')['target'].mean()
top_location = location_rate.sort_values(ascending=False)
sns.barplot(x=top_location.index, y=top_location)
plt.xticks(rotation=90)
plt.show()

* Mumbai and India have the highest proportion of disaster tweets. But, as we can see, there is a lot of noise in the location feature. We need to fix that.

In [None]:
# Replace missing keywords/locations with the literal string 'None' in both
# frames so the later string handling never sees NaN.
text_cols = ['keyword', 'location']
train[text_cols] = train[text_cols].fillna('None')
test[text_cols] = test[text_cols].fillna('None')
    

In [None]:
train.info()  # keyword and location should show no null values after the fill

In [None]:
len(set(train['location']))  # 3342 unique location values in the original run; the cleaning step reduces this number

In [None]:
import re


def _word(*names):
    """Compile a pattern that matches any of *names* as a whole word/phrase."""
    return re.compile(r'\b(?:' + '|'.join(map(re.escape, names)) + r')\b')


# Values that all mean "no specific place". 'World' itself is included so that
# running the cleaner on already-cleaned data leaves it unchanged.
_WORLD_ALIASES = frozenset({'Earth', 'Worldwide', 'Everywhere', 'World'})

# Ordered (pattern, label) rules; the first match wins, so cities come before
# the states and countries that contain them. Whole-word matching prevents
# e.g. "Indiana" being labelled India or "MILWAUKEE" being labelled UK.
_LOCATION_RULES = (
    (_word('New York', 'NYC'), 'New York'),
    (_word('London'), 'London'),
    (_word('Mumbai'), 'Mumbai'),
    # Needs both "Washington" and an actual "DC" / "D.C." / "D C" token, in any order.
    (re.compile(r'(?s)^(?=.*\bWashington\b)(?=.*\bD\.?\s?C\b)'), 'Washington DC'),
    (_word('San Francisco'), 'San Francisco'),
    (_word('Los Angeles'), 'Los Angeles'),
    (_word('Seattle'), 'Seattle'),
    (_word('Chicago'), 'Chicago'),
    (_word('Toronto'), 'Toronto'),
    (_word('Sacramento'), 'Sacramento'),
    (_word('Atlanta'), 'Atlanta'),
    (_word('California'), 'California'),
    (_word('Florida'), 'Florida'),
    (_word('Texas'), 'Texas'),
    (_word('United States', 'USA'), 'USA'),
    (_word('United Kingdom', 'UK', 'Britain'), 'UK'),
    (_word('Canada'), 'Canada'),
    (_word('India'), 'India'),
    (_word('Kenya'), 'Kenya'),
    (_word('Nigeria'), 'Nigeria'),
    (_word('Australia'), 'Australia'),
    (_word('Indonesia'), 'Indonesia'),
)


def clean_location(x, keep=None):
    """Map a raw, free-text location string to a canonical label.

    Parameters
    ----------
    x : str
        Raw location value; the placeholder ``'None'`` is returned unchanged.
    keep : container of str, optional
        Raw values that should be returned as-is when no rule matches.
        Defaults to the module-level ``top_location`` (for a pandas Series the
        ``in`` test checks its index, i.e. the location names).

    Returns
    -------
    str
        ``'None'``, ``'World'``, one of the rule labels, ``x`` itself when it
        is in ``keep``, or ``'Others'``.
    """
    if x == 'None':
        return 'None'
    if x in _WORLD_ALIASES:
        return 'World'
    for pattern, label in _LOCATION_RULES:
        if pattern.search(x):
            return label
    if keep is None:
        # Looked up at call time so the notebook's current global is used.
        keep = top_location
    return x if x in keep else 'Others'
    
# Normalise the location column of both frames; each value is converted to
# str first so clean_location() always receives a string.
train['location'] = train['location'].map(str).map(clean_location)
test['location'] = test['location'].map(str).map(clean_location)

In [None]:
# Disaster rate per cleaned location, highest first.
# Note: this rebinds `top_location`, the global that clean_location() reads.
location_rate = train.groupby('location')['target'].mean()
top_location = location_rate.sort_values(ascending=False)
plt.figure(figsize=(14, 6))
sns.barplot(x=top_location.index, y=top_location)
plt.xticks(rotation=90)
plt.show()

* The data looks cleaner now. Mumbai and Nigeria now have the highest proportion of disaster tweets.

In [None]:
len(set(train['location']))  # the number of unique values has decreased; it was 27 in the original run

# Text

In [None]:
# Let's look at a few sample tweets.
train['text'][0]

# As we can see, this tweet contains a hashtag (#). We can extract hashtags and use them as a new feature.
# Let's look at another sample tweet.

In [None]:
train['text'][789] 

# This tweet tags (mentions) another account. We can extract mentions as well and use them as a new feature.


In [None]:
train['text'][417] # This tweet contains a link. We are going to extract and then remove links from all tweets.

In [None]:
import re

# We are going to split the hashtag,link and tagged
def created_feature(train):
    """Add ``hashtags``, ``tagged`` and ``link`` columns derived from ``text``.

    Each new column holds the space-joined matches found in the tweet
    (hashtags and mentions without their leading ``#`` / ``@``), or a
    placeholder (``no_hashtag`` / ``no_tagged`` / ``no_link``) when the tweet
    has none. The frame is modified in place and also returned.
    """
    def _extractor(pattern, skip, placeholder):
        # Build a per-tweet function: join all matches, dropping the first
        # `skip` characters of each, and fall back to the placeholder.
        def extract(tweet):
            found = (hit.group(0)[skip:] for hit in re.finditer(pattern, tweet))
            return " ".join(found) or placeholder
        return extract

    tweets = train['text']
    train['hashtags'] = tweets.apply(_extractor(r"#\w+", 1, 'no_hashtag'))
    train['tagged'] = tweets.apply(_extractor(r"@\w+", 1, 'no_tagged'))
    train['link'] = tweets.apply(_extractor(r"https?://\S+", 0, 'no_link'))
    return train

In [None]:
# Add the hashtag / mention / link columns to both frames.
train, test = created_feature(train), created_feature(test)

In [None]:
train # As we can see, we have new features now. Great!

In [None]:
# Ten most common values of the hashtags column.
train['hashtags'].value_counts().sort_values(ascending=False).head(10)

In [None]:
# Ten most common values of the tagged (mentions) column.
train['tagged'].value_counts().sort_values(ascending=False).head(10)

In [None]:
# Ten most common values of the link column.
train['link'].value_counts().sort_values(ascending=False).head(10)

In [None]:
def clean_text(text):
    """Remove links from a tweet and collapse its whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text without ``http://`` / ``https://`` URLs, with every run of
        whitespace (including line breaks) replaced by a single space and
        no leading or trailing whitespace.
    """
    text = re.sub(r'https?://\S+', '', text)  # remove links
    # A raw string is required for \s; the pattern also covers line breaks,
    # so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
train['text'][417] # Let's look at that sample

In [None]:
clean_text(train['text'][417]) # As we can see, the link has been removed from the tweet.
# We are going to apply this function to every tweet.

In [None]:
# Strip links and normalise whitespace in every tweet of both frames.
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [None]:
train.head()

In [None]:
## Text Mining
import nltk
#nltk.download("stopwords")
#!pip install textblob
#nltk.download("wordnet")
from nltk.corpus import stopwords
from textblob import Word

sw = stopwords.words('english')
_stop_words = set(sw)  # set membership keeps the per-token lookup O(1)


def _clean_tokens(tweet):
    """Drop stop words, lemmatize the remaining tokens and remove 'rt' markers."""
    lemmas = (Word(token).lemmatize() for token in tweet.split()
              if token not in _stop_words)
    # Only the standalone retweet marker is removed; a plain substring
    # replacement would also mangle words such as "earthquake" or "report".
    return " ".join(lemma for lemma in lemmas if lemma != 'rt')


def _normalize_text(texts):
    """Lower-case a Series of tweets, strip punctuation and digits, then clean tokens.

    `regex=True` is passed explicitly because the default of `Series.str.replace`
    differs between pandas versions; without it the patterns may be treated as
    literal strings and nothing would be removed.
    """
    texts = texts.str.lower()
    texts = texts.str.replace(r'[^\w\s]', '', regex=True)  # punctuation marks
    texts = texts.str.replace(r'\d', '', regex=True)       # numbers
    return texts.apply(_clean_tokens)


# The same normalisation is applied to both frames so that the vectorizers
# fitted on `train` see identically prepared text in `test`.
train['text'] = _normalize_text(train['text'])
test['text'] = _normalize_text(test['text'])

In [None]:
train.text # we did some cleaning to text 

In [None]:
# Corpus-wide word frequencies. Splitting and exploding the column counts all
# tokens in one pass instead of building one value_counts row per tweet
# (a rows x vocabulary frame) and summing it; empty tweets contribute nothing.
freq_df = train['text'].str.split().explode().value_counts().reset_index()
freq_df.columns = ['words', 'frequences']
freq_df.sort_values('frequences',ascending=False) # It shows the frequency of each word

In [None]:
# Bar chart of the 15 most frequent words across all tweets.
ranked_words = freq_df.sort_values('frequences', ascending=False)
top_freq = ranked_words.iloc[:15].set_index('words')
top_freq.plot.bar(color=(0.2, 0.4, 0.6, 0.6))

In [None]:
# Most used words in disaster tweets (target == 1).
# Tokens are counted in one pass with split/explode rather than one
# value_counts row per tweet.
disaster_text = train.loc[train['target'] == 1, 'text']
freq_df = disaster_text.str.split().explode().value_counts().reset_index()
freq_df.columns = ['words', 'frequences']

top_freq_disaster = freq_df.sort_values('frequences', ascending=False).head(15).set_index('words')
top_freq_disaster.plot.bar(color ='g')
plt.title("Disaster Tweets")
plt.show()  # In the original run, "fire" and "news" were the most used words in disaster tweets.

In [None]:
# Most used words in non-disaster tweets (target == 0).
# Tokens are counted in one pass with split/explode rather than one
# value_counts row per tweet.
non_disaster_text = train.loc[train['target'] == 0, 'text']
freq_df = non_disaster_text.str.split().explode().value_counts().reset_index()
freq_df.columns = ['words', 'frequences']

top_freq_non_disaster = freq_df.sort_values('frequences', ascending=False).head(15).set_index('words')
top_freq_non_disaster.plot.bar(color ='orange')
plt.title("Non-Disaster Tweets")
plt.show()

In [None]:
import sys
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud , STOPWORDS, ImageColorGenerator

In [None]:
# Concatenate all tweets into one string to feed the word cloud.

text = " ".join(train['text'])

In [None]:
text[0:1000] 

In [None]:
# Render a word cloud of the whole corpus on a white background.
wc = WordCloud(background_color='white')
wc = wc.generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# Feature Engineering

* Count vectors
* TF-IDF vectors (words, characters, n-grams)


$$\mathrm{TF}(t, d) = \frac{\text{number of times term } t \text{ appears in document } d}{\text{total number of terms in document } d}$$

$$\mathrm{IDF}(t) = \log_e\!\left(\frac{\text{total number of documents}}{\text{number of documents containing term } t}\right)$$

In [None]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

# Count Vectors

In [None]:
import category_encoders as ce

# Target encoding: fit the encoder on the training keywords/locations and
# their labels, then add the encoded values to both frames as
# `keyword_target` / `location_target`.
# NOTE(review): the encoder is fitted on every training row before the
# hold-out split made later, so hold-out scores are computed on rows whose
# labels already influenced these features — consider fitting after the split.
features = ['keyword', 'location']
encoder = ce.TargetEncoder(cols=features)
encoder.fit(train[features], train['target'])

encoded_train = encoder.transform(train[features]).add_suffix('_target')
encoded_test = encoder.transform(test[features]).add_suffix('_target')
train = train.join(encoded_train)
test = test.join(encoded_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer
#
# Token counts for the link / mention / hashtag columns. Each vectorizer is
# fitted on the training rows only and reused to transform the test rows.
# Column names come from `get_feature_names_out()`; the older
# `get_feature_names()` is no longer available in current scikit-learn.
# Each frame carries the index of the rows it was built from, because the
# frames are later attached with an index-based join.

# Links
vec_links = CountVectorizer(min_df = 5, analyzer = 'word', token_pattern = r'https?://\S+') # Only include those >=5 occurrences
link_vec = vec_links.fit_transform(train['link'])
link_vec_test = vec_links.transform(test['link'])
X_train_link = pd.DataFrame(link_vec.toarray(), columns=vec_links.get_feature_names_out(), index=train.index)
X_test_link = pd.DataFrame(link_vec_test.toarray(), columns=vec_links.get_feature_names_out(), index=test.index)

# Tagged
vec_tag = CountVectorizer(min_df = 5)
tag_vec = vec_tag.fit_transform(train['tagged'])
tag_vec_test = vec_tag.transform(test['tagged'])
X_train_tag = pd.DataFrame(tag_vec.toarray(), columns=vec_tag.get_feature_names_out(), index=train.index)
X_test_tag = pd.DataFrame(tag_vec_test.toarray(), columns=vec_tag.get_feature_names_out(), index=test.index)

# Hashtags
vec_hash = CountVectorizer(min_df = 5)
hash_vec = vec_hash.fit_transform(train['hashtags'])
hash_vec_test = vec_hash.transform(test['hashtags'])
X_train_hash = pd.DataFrame(hash_vec.toarray(), columns=vec_hash.get_feature_names_out(), index=train.index)
X_test_hash = pd.DataFrame(hash_vec_test.toarray(), columns=vec_hash.get_feature_names_out(), index=test.index)

# TF-IDF

In [None]:
# Tf-idf for text
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams that pass the min_df=10 threshold, fitted on the
# training tweets only and reused for the test tweets.
vec_text = TfidfVectorizer(min_df = 10, ngram_range = (1,2), stop_words='english') 
text_vec = vec_text.fit_transform(train['text'])
text_vec_test = vec_text.transform(test['text'])
# `get_feature_names_out()` replaces `get_feature_names()`, which current
# scikit-learn no longer provides. The source index is kept because these
# frames are later attached with an index-based join.
X_train_text = pd.DataFrame(text_vec.toarray(), columns=vec_text.get_feature_names_out(), index=train.index)
X_test_text = pd.DataFrame(text_vec_test.toarray(), columns=vec_text.get_feature_names_out(), index=test.index)
print (X_train_text.shape)

In [None]:
# Attach every feature block to both frames.
#
# The suffix is added to ALL columns of a block rather than via `rsuffix`,
# which only renames columns on a name clash. Clashes differ between the two
# frames (only `train` has a `target` column, for example), so clash-only
# renaming can give train and test different column names; the later reindex
# of `test` to the training columns would then silently fill those features
# with zeros. The same suffix is used for train and test in every block.
feature_blocks = [
    ('_link', X_train_link, X_test_link),
    ('_tagged', X_train_tag, X_test_tag),
    ('_hashtag', X_train_hash, X_test_hash),
    ('_text', X_train_text, X_test_text),
]
for suffix, train_block, test_block in feature_blocks:
    train = train.join(train_block.add_suffix(suffix))
    test = test.join(test_block.add_suffix(suffix))

print (train.shape)

In [None]:
train.head() # As we can see, the data now has many more features (1708 in the original run)

# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Identifier, raw-text and label columns are not model inputs.
non_feature_cols = ['id', 'keyword', 'location', 'text',
                    'target', 'hashtags', 'tagged', 'link']

# Hold out 30% of the labelled rows for evaluation. The seed is fixed so the
# split, and therefore every score reported afterwards, is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    train.drop(columns=non_feature_cols),
    train['target'],
    test_size=0.3,
    random_state=777,
)


In [None]:
train_x.shape

In [None]:
test_x.shape

# Logistic Regression

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import linear_model

log = linear_model.LogisticRegression(solver='liblinear', random_state=777)

# Cross-validate on the training split only: cross_val_score refits clones of
# the estimator on each fold, so running it on the hold-out rows would neither
# use the training data nor score the model fitted below.
cv_accuracy = model_selection.cross_val_score(log,
                                              train_x,
                                              train_y,
                                              cv=10).mean()

log_model = log.fit(train_x, train_y)
log_pred = log_model.predict(test_x)
# Accuracy of the fitted model on the untouched hold-out rows.
accuracy = metrics.accuracy_score(test_y, log_pred)

print('Cross-validated accuracy of Logistic Regression (training split): ', cv_accuracy)
print('Accuracy of Logistic Regression: ', accuracy)

In [None]:
confusion_matrix(test_y,log_pred)

# Naive Bayes

In [None]:
nb = naive_bayes.MultinomialNB()

# Cross-validate on the training split only: cross_val_score refits clones of
# the estimator on each fold, so running it on the hold-out rows would neither
# use the training data nor score the model fitted below.
cv_accuracy = model_selection.cross_val_score(nb,
                                              train_x,
                                              train_y,
                                              cv=10).mean()

nb_model = nb.fit(train_x, train_y)
nb_pred = nb_model.predict(test_x)
# Accuracy of the fitted model on the untouched hold-out rows.
accuracy = metrics.accuracy_score(test_y, nb_pred)

print('Cross-validated accuracy of Naive-Bayes (training split): ', cv_accuracy)
print('Accuracy of Naive-Bayes: ', accuracy)

In [None]:
confusion_matrix(test_y,nb_pred)

# Random Forest Classifier

In [None]:
# Seeded so the forest, and the scores printed below, are reproducible.
rf = ensemble.RandomForestClassifier(random_state=777)

# Cross-validate on the training split only: cross_val_score refits clones of
# the estimator on each fold, so running it on the hold-out rows would neither
# use the training data nor score the model fitted below.
cv_accuracy = model_selection.cross_val_score(rf,
                                              train_x,
                                              train_y,
                                              cv=10).mean()

rf_model = rf.fit(train_x,train_y)
rf_pred = rf_model.predict(test_x)
# Accuracy of the fitted model on the untouched hold-out rows.
accuracy = metrics.accuracy_score(test_y, rf_pred)

print('Cross-validated accuracy of Random Forest (training split): ', cv_accuracy)
print('Accuracy of Random Forest: ', accuracy)

In [None]:
confusion_matrix(test_y,rf_pred) 

# XGBoost

In [None]:
import xgboost

xgb = xgboost.XGBClassifier()

# Cross-validate on the training split only: cross_val_score refits clones of
# the estimator on each fold, so running it on the hold-out rows would neither
# use the training data nor score the model fitted below.
cv_accuracy = model_selection.cross_val_score(xgb,
                                              train_x,
                                              train_y,
                                              cv=10).mean()

xgb_model = xgb.fit(train_x,train_y)
xgb_pred = xgb_model.predict(test_x)
# Accuracy of the fitted model on the untouched hold-out rows.
accuracy = metrics.accuracy_score(test_y, xgb_pred)

print('Cross-validated accuracy of XGBoost (training split): ', cv_accuracy)
print('Accuracy of XGBoost: ', accuracy)

In [None]:
confusion_matrix(test_y,xgb_pred)

# Submission

In [None]:
# Feature columns the models were trained on, in training order.
columns = train_x.columns

In [None]:
columns

In [None]:
# Align the test frame with the training features: keeps only `columns`, in
# that order. Any column missing from `test` is created and filled with 0, so
# a naming mismatch between the two frames would pass silently here.
test = test.reindex(columns = columns, fill_value=0)

In [None]:
#nb_model = nb.fit(train.drop(columns = ['id', 'keyword', 'location', 'text','target', 'hashtags', 'tagged','link']),train['target'])

In [None]:
# Predict with the Naive Bayes model and write the Kaggle submission file,
# pairing each saved test id with its predicted label.
pred = nb_model.predict(test)
submission = pd.DataFrame({"id": Id}).assign(target=pred)
submission.to_csv("submission.csv", index=False)



If you like it please vote !