In [1]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import nltk
import re
import gc
import re
import string
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import STOPWORDS, WordCloud
from collections import defaultdict

from plotly import tools
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

# Raise pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Seed constant. NOTE(review): the visible cells hard-code random_state=42 and
# random_state=4 instead of reading SEED — confirm whether they should use it.
SEED = 42

%matplotlib inline

In [3]:
!ls

data  disaster_or_not.ipynb  sentiment_analysis.ipynb


In [4]:
!ls data/

disaster_or_not  sentiment_analysis


In [5]:
!ls data/disaster_or_not/

sample_submission.csv  test.csv  train.csv


<ul>
    <li>train.csv - the training set</li>
    <li>test.csv - the test set</li>
    <li>sample_submission.csv - A sample submission in the correct format</li>
</ul>

In [None]:
# Read both splits, storing `id` as int16 and `target` as int8.
df_train = pd.read_csv(
    'data/disaster_or_not/train.csv',
    dtype={'id': np.int16, 'target': np.int8},
)
df_test = pd.read_csv(
    'data/disaster_or_not/test.csv',
    dtype={'id': np.int16},
)

# Report the shape and the in-memory size (in MB) of each split.
for label, frame in (('Training', df_train), ('Test', df_test)):
    print(f'{label} Set Shape = {frame.shape}')
    print(f'{label} Set Memory Usage = {frame.memory_usage().sum() / 1024**2:.2f} MB')

In [None]:
# Preview the first rows of the training set.
df_train.head()

# <b>Target Distribution:</b>

First, let us look at the distribution of the target variable to see how balanced the two classes are.

In [None]:
## target count ##
# Number of training rows per target value; `cnt_srs` is read again by the
# target-distribution cell.
cnt_srs = df_train['target'].value_counts()

fig = go.Figure(
    data=[
        go.Bar(
            x=cnt_srs.index,
            y=cnt_srs.values,
            marker=dict(color=cnt_srs.values, colorscale='Picnic', reversescale=True),
        )
    ],
    layout=go.Layout(title='Target Count', font=dict(size=18)),
)
py.iplot(fig, filename="TargetCount")

In [None]:
## target distribution ##
# Pie chart of each target value's share of the rows, in percent.
labels = np.array(cnt_srs.index)
sizes = np.array(cnt_srs / cnt_srs.sum() * 100)

fig = go.Figure(
    data=[go.Pie(labels=labels, values=sizes)],
    layout=go.Layout(
        title='Target distribution',
        font=dict(size=18),
        width=600,
        height=600,
    ),
)
py.iplot(fig, filename="usertype")

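The pie chart above shows what share of the training tweets are disaster tweets (target=1) and what share are non-disaster tweets (target=0). (The earlier wording here, "about 6% ... insincere questions", described a different dataset; read the actual percentages off the chart.)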

In [None]:
def plot_wordcloud(text, mask=None, max_words=200, max_font_size=100, figure_size=(24.0,16.0),
                   title = None, title_size=40, image_color=False):
    """Draw a word cloud for ``text`` on a new matplotlib figure.

    Parameters
    ----------
    text : str or iterable of str
        The corpus. A plain string is used as is; any other iterable (for
        example a pandas Series of documents) is joined with spaces so that
        every document contributes to the cloud.
    mask : array-like, optional
        Shape mask forwarded to ``WordCloud``; also the color source when
        ``image_color`` is True.
    max_words, max_font_size : int
        Forwarded to ``WordCloud``.
    figure_size : tuple of float
        Size of the matplotlib figure.
    title : str, optional
        Figure title.
    title_size : int
        Font size of the title.
    image_color : bool
        When True, recolor the cloud with the colors of ``mask``.

    Raises
    ------
    ValueError
        If ``image_color`` is True but no ``mask`` is given.
    """
    if image_color and mask is None:
        raise ValueError('image_color=True requires a mask image to take the colors from')

    stopwords = set(STOPWORDS) | {'one', 'br', 'Po', 'th', 'sayi', 'fo', 'Unknown'}

    # str() of a pandas Series is its truncated printed repr (index labels,
    # "...", dtype footer), not the documents it holds, so join the documents
    # explicitly instead of calling str() on the container.
    if isinstance(text, str):
        corpus = text
    else:
        try:
            corpus = ' '.join(str(doc) for doc in text)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            corpus = str(text)

    wordcloud = WordCloud(background_color='black',
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,
                          width=800,
                          height=400,
                          mask=mask)
    wordcloud.generate(corpus)

    plt.figure(figsize=figure_size)
    title_font = {'size': title_size, 'verticalalignment': 'bottom'}
    if image_color:
        # Imported here because the notebook's import cell only pulls in
        # STOPWORDS and WordCloud from the wordcloud package.
        from wordcloud import ImageColorGenerator
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    else:
        plt.imshow(wordcloud)
        title_font['color'] = 'black'
    plt.title(title, fontdict=title_font)
    plt.axis('off')
    plt.tight_layout()
    
# The corpus is the tweet text of the training set, so label the figure accordingly.
plot_wordcloud(df_train["text"], title="Word Cloud of Tweets")

# Keyword & Location

In [None]:
# Number of missing values in each column of the training set.
df_train.isnull().sum()

In [None]:
sns.set_style('whitegrid')
# Missing-value counts for keyword and location in the training set.
# A bar plot draws the counts themselves; countplot would instead tally how
# often each count value occurs (one bar of height 1 per distinct count).
missing_train = df_train[['keyword', 'location']].isnull().sum()
ax = sns.barplot(x=missing_train.index, y=missing_train.values)
ax.set(title='Missing Values in Training Set', xlabel='column', ylabel='missing rows');

In [None]:
sns.set_style('whitegrid')
# Missing-value counts for keyword and location in the test set.
# A bar plot draws the counts themselves; countplot would instead tally how
# often each count value occurs, and the old call also passed data=df_train
# while plotting test-set numbers.
missing_test = df_test[['keyword', 'location']].isnull().sum()
ax = sns.barplot(x=missing_test.index, y=missing_test.values)
ax.set(title='Missing Values in Test Set', xlabel='column', ylabel='missing rows');

In [None]:
# Compare the number of distinct keyword / location values between the two splits.
for column in ('keyword', 'location'):
    n_train = df_train[column].nunique()
    n_test = df_test[column].nunique()
    print(f'Number of unique values in {column} = {n_train} (Training) - {n_test} (Test)')

In [None]:
# Sorted view of the training rows, ordered by the mean target of each row's
# keyword (highest first). The helper column lives only on this temporary
# frame, so df_train itself ends up unchanged.
keyword_view = (
    df_train
    .assign(target_mean=df_train.groupby('keyword')['target'].transform('mean'))
    .sort_values(by='target_mean', ascending=False)
)

fig = plt.figure(figsize=(8, 72), dpi=100)

# One pair of bars per keyword, split by target value.
sns.countplot(y=keyword_view['keyword'], hue=keyword_view['target'])

plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=12)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')

plt.show()

# Meta Feature

Distributions of meta features in classes and sets can be helpful to identify disaster tweets. It looks like disaster tweets are written in a more formal way, with longer words, compared to non-disaster tweets, because most of them come from news agencies. Non-disaster tweets have more typos than disaster tweets because they come from individual users. The meta features used for the analysis are:

- `word_count`: number of words in text
- `unique_word_count`: number of unique words in text
- `stop_word_count`: number of stop words in text
- `url_count`: number of URLs in text
- `mean_word_length`: average character count in words
- `char_count`: number of characters in text
- `punctuation_count`: number of punctuation characters in text
- `hashtag_count`: number of hashtags (#) in text
- `mention_count`: number of mentions (@) in text

In [None]:
def add_meta_features(frame):
    """Add the text meta-feature columns to ``frame`` in place and return it.

    Every feature is derived from ``str(value)`` of the ``text`` column:

    - word_count: whitespace-separated tokens
    - unique_word_count: distinct tokens
    - stop_word_count: lower-cased tokens found in STOPWORDS
    - url_count: lower-cased tokens containing 'http' (this also covers 'https')
    - mean_word_length: mean token length; NaN when the text has no tokens
    - char_count: characters in the text
    - punctuation_count: characters found in string.punctuation
    - hashtag_count / mention_count: number of '#' / '@' characters
    """
    text = frame['text'].map(str)
    tokens = text.map(str.split)
    lowered_tokens = text.map(lambda t: t.lower().split())

    frame['word_count'] = tokens.map(len)
    frame['unique_word_count'] = tokens.map(lambda ws: len(set(ws)))
    frame['stop_word_count'] = lowered_tokens.map(lambda ws: sum(w in STOPWORDS for w in ws))
    frame['url_count'] = lowered_tokens.map(lambda ws: sum('http' in w for w in ws))
    # np.mean([]) is NaN but emits a RuntimeWarning; return NaN directly instead.
    frame['mean_word_length'] = tokens.map(
        lambda ws: np.mean([len(w) for w in ws]) if ws else np.nan
    )
    frame['char_count'] = text.map(len)
    frame['punctuation_count'] = text.map(lambda t: sum(c in string.punctuation for c in t))
    frame['hashtag_count'] = text.map(lambda t: t.count('#'))
    frame['mention_count'] = text.map(lambda t: t.count('@'))
    return frame


# Same feature set for both splits, computed by one shared implementation.
for frame in (df_train, df_test):
    add_meta_features(frame)

In [None]:
# Preview the training set with the meta-feature columns added.
df_train.head()

In [None]:
# Preview the test set with the meta-feature columns added.
df_test.head()

In [None]:
# `df` is used by the cells from here on but is never assigned in the visible
# notebook, so a fresh "Restart & Run All" stops here with a NameError.
# NOTE(review): binding it to the training frame is an assumption — the later
# cells read its 'keyword', 'location', 'text' and 'target' columns, which
# df_train provides. It is an alias rather than a copy so that the in-place
# clean-up of df_train in the following cell is visible through `df` too.
df = df_train
df.shape

In [None]:
# Drop the identifier column and lower-case the tweet text, modifying df_train in place.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because 'id' has already been removed.
df_train.drop(columns='id', errors='ignore', inplace=True)
df_train['text'] = df_train['text'].str.lower()
df_train.head()

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Boolean mask: True for rows whose keyword is missing.
df.keyword.isna()

In [None]:
# Distinct keyword values (including NaN, if any rows lack a keyword).
df.keyword.unique()

In [None]:
# Fold the 'flooding' keyword into 'flood'.
df.loc[df['keyword'].eq('flooding'), 'keyword'] = 'flood'

In [None]:
# Distinct keyword values after the 'flooding' -> 'flood' recode.
df.keyword.unique()

In [None]:
# Rows that have a keyword.
df.loc[~df.keyword.isna()]

In [None]:
# Text of the first five rows that have no location.
df.loc[df.location.isna(), 'text'][0:5].values

In [None]:
# Location values of the first five rows that have no location (all missing by construction).
df.loc[df.location.isna(), 'location'][0:5].values

### Important note 
If we can identify a city, state, or country name in a tweet, we generate the location from it. If no location is found, we replace it with the word "unknown" in the `location` column.

In [None]:
# Encode `location` as a presence flag: 1 where a location was given, 0 where it is missing.
# The mask is computed once, before anything is written. Writing 0 into the
# missing rows first and then testing isna() again finds no missing rows any
# more, which set every row to 1.
has_location = df['location'].notna()
df['location'] = has_location.astype(int)

df.head()

In [None]:
# Number of missing values in each column of the working frame.
df.isnull().sum()

In [None]:
# CSV files listing the known place names, and a cache so they are read only once
# instead of on every call.
_LOCATION_FILES = ('data/cities.csv', 'data/states.csv', 'data/countries.csv')
_KNOWN_LOCATIONS = None


def _load_known_locations():
    """Read the location CSVs once and return their names as a lower-cased set."""
    global _KNOWN_LOCATIONS
    if _KNOWN_LOCATIONS is None:
        names = set()
        for path in _LOCATION_FILES:
            table = pd.read_csv(path)
            # NOTE(review): assumes the place names are in the first column of
            # each file — the CSV schema is not visible here, confirm it.
            names.update(table.iloc[:, 0].dropna().astype(str).str.strip().str.lower())
        _KNOWN_LOCATIONS = names
    return _KNOWN_LOCATIONS


def check_location(query, known_locations=None):
    """Return the first word of ``query`` that is a known place name, else 'unknown'.

    Parameters
    ----------
    query : str
        Text to scan. Non-string values (e.g. NaN) yield 'unknown'.
    known_locations : collection of str, optional
        Lower-cased place names to match against. When omitted, the names are
        loaded from the cities / states / countries CSV files and cached.

    Returns
    -------
    str
        The matched word, lower-cased and with surrounding punctuation removed,
        or 'unknown' when no word matches.

    Notes
    -----
    Matching is word by word, so multi-word names are not detected.
    """
    if not isinstance(query, str):
        return 'unknown'
    if known_locations is None:
        known_locations = _load_known_locations()

    # Scan every word; the verdict is 'unknown' only after all of them failed.
    for word in query.split():
        candidate = word.strip(string.punctuation).lower()
        if candidate and candidate in known_locations:
            return candidate
    return 'unknown'

# Overwrite `location` with the result of check_location applied to each row's text.
df['location'] = df['text'].apply(check_location)

In [None]:
# Preview the frame after the location column has been rewritten.
df.head()

In [None]:
# One of the symbols . # " ' ? ; : directly preceded by a lower-case letter or digit.
_ATTACHED_SYMBOL = re.compile(r'(?<=[a-z0-9])([.#"\'?;:])')


def sep_word_symbol(query):
    """Put spaces around symbols that are glued to the end of a word.

    A symbol from ``. # " ' ? ; :`` that directly follows a lower-case letter
    or digit is separated from its neighbours, e.g. ``'hello.world'`` becomes
    ``'hello . world'``. Symbols that do not follow a letter or digit (such as
    the leading ``#`` of a hashtag) are left attached.

    Every occurrence in a token is handled and no characters are discarded.
    The previous implementation kept only the first regex match of each token
    and silently dropped the rest of it (e.g. the remainder of a URL after
    ``http:``).

    Parameters
    ----------
    query : str
        Lower-cased text; tokens are taken by splitting on single spaces.

    Returns
    -------
    str
        The text with the separated tokens re-joined by single spaces.
    """
    separated = []
    for token in query.split(' '):
        spaced = _ATTACHED_SYMBOL.sub(r' \1 ', token)
        # Trim only the padding added above when the symbol ended the token.
        separated.append(spaced.strip(' '))
    return ' '.join(separated)

In [None]:
# Run every text through sep_word_symbol, then preview the result.
df['text'] = df['text'].map(sep_word_symbol)
df.head()

In [None]:
# Strip commas from the text. The result has to be assigned back: .str.replace
# returns a new Series, so the bare expression only displayed it and left
# df['text'] unchanged. regex=False treats ',' as a literal string.
df['text'] = df['text'].str.replace(',', '', regex=False)
df['text']

In [None]:
# Corpus-wide token frequencies: concatenate all texts, split on whitespace, count.
all_tokens = ' '.join(df['text']).split()
freq = pd.Series(all_tokens).value_counts()
freq

In [None]:
# Keep only tokens that occur more than MIN_TOKEN_COUNT times in the corpus.
# The frequent tokens are collected into a set once, which replaces a Series
# lookup per token and also avoids comparing None with an int when a token is
# not present in `freq`.
MIN_TOKEN_COUNT = 5
frequent_tokens = set(freq.index[freq > MIN_TOKEN_COUNT])
df['text'] = df['text'].apply(lambda x: " ".join(w for w in x.split() if w in frequent_tokens))

In [None]:
# Recount the token frequencies on the current contents of df['text'].
remaining_tokens = ' '.join(df['text']).split()
freq = pd.Series(remaining_tokens).value_counts()
freq

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a validation split (fixed random_state for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=4,
)

In [None]:
# Bag-of-words over unigrams and bigrams, with English stop words removed.
# The vocabulary is fitted on the training split only and then applied to
# the validation split.
cv = CountVectorizer(ngram_range=(1, 2), stop_words='english')
X_traincv = cv.fit_transform(X_train)
X_validcv = cv.transform(X_valid)

In [None]:
# Vocabulary learned by the vectorizer. Newer scikit-learn releases provide
# get_feature_names_out() in place of get_feature_names(); fall back to the
# old name only when the new one is unavailable.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
feature_names

In [None]:
# Fit a multinomial Naive Bayes classifier on the training count matrix.
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(X_traincv, y_train)

In [None]:
# Validation accuracy: scored on the held-out split of the training data, not on test.csv
naive_bayes.score(X_validcv, y_valid)

In [None]:
# Training accuracy: scored on the same rows the model was fitted on
naive_bayes.score(X_traincv, y_train)