# Twitter Disaster Tweets check

Twitter has become an important communication channel in times of emergency. The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies (e.g. disaster relief organizations and news agencies) are interested in programmatically monitoring Twitter. This notebook is a basic demonstration of the process used to classify fake versus real disaster tweets.

**About Data:**

Files

* train.csv - the training set
* test.csv - the test set
* sample_submission.csv - a sample submission file in the correct format
* Columns

Columns

* id - a unique identifier for each tweet
* text - the text of the tweet
* location - the location the tweet was sent from (may be blank)
* keyword - a particular keyword from the tweet (may be blank)
* target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)


**Target** :

To Predict whether a given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd

#Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import plotly.offline
import plotly.express as px
import plotly.graph_objects as go

#Natural Language Processing
#Data Manipulation and Cleaning
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import Counter
stop = set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string

#Modeling
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Loading Data

In [None]:
Train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')


Creating Train copy to perform Exploratory Data Analysis

In [None]:
train = Train.copy()

In [None]:
train.head()

In the data the target column shows 1 for real disaster tweet and 0 for fake disaster tweet.

In [None]:
# Report the size of the training frame.
n_rows, n_cols = train.shape[0], train.shape[1]
print(f'There are {n_rows} rows and {n_cols} columns in train')

# Exploratory Data Analysis and Visualisation 

In [None]:
# Category counts for type of tweets.
# Labels and values are taken from the same index-sorted Series so that each
# pie slice is guaranteed to carry the label it belongs to (value_counts()
# alone orders by frequency, not by label).
target_counts = train['target'].value_counts().sort_index()
Tweet_type = target_counts.index.tolist()
Category_count = target_counts.to_numpy()

In [None]:
fig = go.Figure(data=[go.Pie(labels=Tweet_type, values=Category_count, hole=.3)])
fig.show()

The Donut chart shows that 57 % of the Disaster Tweets are Fake. Let's check the actual counts.

In [None]:
train['target'].value_counts()

So the actual counts show that more than 4,000 disaster tweets are fake and more than 3,000 disaster tweets are real.

# Exploratory Data Analysis

Analysis on character level, word level and sentence level.

# **Number of characters in tweets**

In [None]:
# Add a column holding the character length of each tweet
tweet_lengths = train['text'].map(len)
train['length'] = tweet_lengths
train.head()

In [None]:
#checking length distribution
import plotly.express as px
fig = px.histogram(train, x="length", color="target")
fig.show()

# **Number of words in tweets**

In [None]:
# Number of whitespace-separated tokens in each tweet
train['word_count'] = train['text'].str.split().map(len)

In [None]:
import plotly.express as px
fig = px.histogram(train, x="word_count", color="target")
fig.show()

In [None]:
train.head()

In [None]:
train.describe()

In [None]:
# Tweet with max length.
# idxmax() finds the first row holding the maximum, so no length value has to
# be hard-coded from the describe() output.
train.loc[train['length'].idxmax(), 'text']

In [None]:
# Tweet with min length (first row holding the minimum; no hard-coded value).
train.loc[train['length'].idxmin(), 'text']

In [None]:
# Tweet with max word count (first row holding the maximum; no hard-coded value).
train.loc[train['word_count'].idxmax(), 'text']

In [None]:
# Tweet with min word count (first row holding the minimum; no hard-coded value).
train.loc[train['word_count'].idxmin(), 'text']

# **Average word length in a tweet**

In [None]:
# For every tweet: list the length of each whitespace-separated word, then
# store the mean of that list as the tweet's average word length.
avg_word_length = train['text'].str.split().map(
    lambda words: [len(word) for word in words]
)
train['avg_word_length'] = avg_word_length.map(lambda lengths: np.mean(lengths))

In [None]:
train.head()

In [None]:
import plotly.express as px
fig = px.histogram(train, x="avg_word_length", color="target")
fig.show()

In [None]:
# Creating Tweet Corpus function
def create_corpus(target):
    """Return a flat list of all tokens in tweets labelled ``target``.

    Reads the module-level ``train`` frame, keeps the rows whose ``target``
    column equals the argument, splits each tweet's text on whitespace and
    concatenates the resulting tokens in row order.
    """
    tweets = train.loc[train['target'] == target, 'text']
    return [token for tokens in tweets.str.split() for token in tokens]

# **Common stopwords in tweets**

First we will analyze stopwords in real tweets

In [None]:
# Count how often each stopword occurs among the tokens of tweets with target == 1.
corpus = create_corpus(1)
dic = Counter(token for token in corpus if token in stop)

# Ten most frequent stopwords as (word, count) pairs, highest count first.
top = dic.most_common(10)

In [None]:
x,y = zip(*top)
plt.bar(x,y, color = 'pink')

Analyzing stopwords in Fake Tweets.

In [None]:
# Count how often each stopword occurs among the tokens of tweets with target == 0.
corpus = create_corpus(0)
dic = Counter(token for token in corpus if token in stop)

# Ten most frequent stopwords as (word, count) pairs, highest count first.
top = dic.most_common(10)

In [None]:
x,y = zip(*top)
plt.bar(x,y, color = 'pink')

# **Analyzing punctuations.**

First let's check tweets indicating real disaster tweets.

In [None]:
plt.figure(figsize=(10, 5))
corpus = create_corpus(1)

import string

# Count punctuation per character. Testing a whole token with
# `token in string.punctuation` is a substring test: it only matches tokens
# that happen to be a contiguous slice of the punctuation string and ignores
# punctuation attached to words (e.g. "fire!").
special = set(string.punctuation)
dic = Counter(ch for token in corpus for ch in token if ch in special)

# zip(*...) cannot be unpacked into x, y when nothing was counted.
if dic:
    x, y = zip(*dic.items())
    plt.bar(x, y, color='purple')
        

Now we'll look at punctuations of fake tweets 

In [None]:
plt.figure(figsize=(10, 5))
corpus = create_corpus(0)

import string

# Count punctuation per character. Testing a whole token with
# `token in string.punctuation` is a substring test: it only matches tokens
# that happen to be a contiguous slice of the punctuation string and ignores
# punctuation attached to words (e.g. "fire!").
special = set(string.punctuation)
dic = Counter(ch for token in corpus for ch in token if ch in special)

# zip(*...) cannot be unpacked into x, y when nothing was counted.
if dic:
    x, y = zip(*dic.items())
    plt.bar(x, y, color='purple')
        

# **Analyzing Common words**

**Real Disaster Tweets**

In [None]:
# Token frequencies for tweets with target == 1.
corpus = create_corpus(1)
counter = Counter(corpus)
most = counter.most_common()

# Of the 40 most frequent tokens, keep those that are not stopwords.
kept = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _ in kept]
y = [count for _, count in kept]

In [None]:
sns.barplot(x=y,y=x)

**Fake Disaster Tweets**

In [None]:
# Token frequencies for tweets with target == 0.
corpus = create_corpus(0)
counter = Counter(corpus)
most = counter.most_common()

# Of the 40 most frequent tokens, keep those that are not stopwords.
kept = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _ in kept]
y = [count for _, count in kept]

In [None]:
sns.barplot(x=y,y=x)

# **Ngram Analysis**

We will do a bigram (n=2) analysis over the tweets. Let's check the most common bigrams in tweets.

In [None]:
def get_top_tweet_bigrams(corpus, n=None):
    """Return the most frequent bigrams in ``corpus`` as (bigram, count) pairs.

    A ``CountVectorizer`` restricted to 2-grams is fitted on ``corpus``; the
    per-document counts are summed per bigram and the pairs are sorted by
    total count, highest first. ``n`` limits the number of pairs returned;
    with the default ``None`` every bigram is returned.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # One total per vocabulary column, summed over all documents.
    totals = doc_term.sum(axis=0)
    ranked = sorted(
        ((bigram, totals[0, col]) for bigram, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
    

In [None]:
# Horizontal bar chart of the ten most frequent bigrams in the training tweets.
plt.figure(figsize=(10, 5))
top_tweet_bigrams = get_top_tweet_bigrams(train['text'], n=10)
x, y = (list(column) for column in zip(*top_tweet_bigrams))
sns.barplot(x=y, y=x)

# Data Cleaning

As we know, tweets always have to be cleaned before we move on to modelling. So we will do some basic cleaning: removing URLs, HTML tags, emojis and punctuation. So let's start.

In [None]:
# Stack train and test so every cleaning step below is applied to both.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of each input frame.
df = pd.concat([Train, test], ignore_index=True)
df.shape

# **Removing URLs**

In [None]:
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` run removed.

    A link extends up to the next whitespace character; the surrounding text
    is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
remove_URL(example)

In [None]:
df['text'] = df['text'].apply(lambda x : remove_URL(x))

# Removing HTML tags

In [None]:
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy and does not cross line breaks, so each tag must
    open and close on the same line to be removed.
    """
    return re.sub(r'<.*?>', r'', text)
print(remove_html(example))

In [None]:
df['text']=df['text'].apply(lambda x : remove_html(x))

# Removing Emojis

In [None]:
# Compiled once; the character class lists emoji/pictograph code-point ranges.
# The former catch-all range U+24C2..U+1F251 is split into the blocks it was
# meant to cover, because as a single range it also matched CJK ideographs,
# Hangul, kana and other ordinary text lying between those two code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled M
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are stripped;
    letters of any script (including CJK and Hangul) are preserved.
    """
    return _EMOJI_PATTERN.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

In [None]:
df['text'] = df['text'].apply(lambda x : remove_emoji(x))

# Removing punctuations

In [None]:
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` dropped."""
    return ''.join(ch for ch in text if ch not in string.punctuation)

example = "I am a #king"
print(remove_punct(example))

In [None]:
df['text']=df['text'].apply(lambda x : remove_punct(x))

# Glove for Vectorization

Here we will use the GloVe pretrained word vectors to represent our words. They are available in 3 varieties: 50-, 100- and 200-dimensional. We will try the 100-dimensional vectors here.

In [None]:
def create_corpus(df):
    """Tokenise ``df['text']`` into one list of cleaned tokens per tweet.

    Each tweet is split with ``word_tokenize``; tokens are lower-cased and
    kept only if they are purely alphabetic and not in ``stop``. Lower-casing
    happens *before* the stopword test so that capitalised stopwords
    ("The", "And", ...) are filtered out as well.

    Note: this redefines the ``create_corpus(target)`` helper defined earlier
    in the notebook, which took a label instead of a frame.

    Returns a list with one list of tokens per row of ``df``, in row order.
    """
    corpus = []
    for tweet in tqdm(df['text']):
        lowered = (token.lower() for token in word_tokenize(tweet))
        corpus.append(
            [token for token in lowered if token.isalpha() and token not in stop]
        )
    return corpus

In [None]:
corpus = create_corpus(df)

In [None]:
# Load the pretrained GloVe vectors into a word -> float32 vector mapping.
# Each line of the file is "<word> <v1> <v2> ...".
embedding_dict = {}
# The encoding is given explicitly so the read does not depend on the
# platform's default; the `with` block closes the file on exit.
with open('/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors

In [None]:
# Every tweet is represented by exactly MAX_LEN token indices.
MAX_LEN = 50
# Build the vocabulary from the tokenised tweets in `corpus`.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
# Replace every token with its integer index from the fitted vocabulary.
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Cut longer sequences and pad shorter ones at the end ('post') to MAX_LEN.
tweet_pad = pad_sequences(sequences, maxlen = MAX_LEN, truncating = 'post', padding = 'post')

In [None]:
word_index = tokenizer_obj.word_index
print('Number of unique words:', len(word_index))

In [None]:
# Build the (vocabulary size + 1) x 100 matrix of pretrained vectors.
# Row i holds the GloVe vector of the word with tokenizer index i; words that
# are missing from GloVe keep an all-zero row. The extra row accounts for
# index 0, which the Keras Tokenizer never assigns to a word.
EMBEDDING_DIM = 100
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
# No bounds check is needed: the indices in word_index run from 1 to
# len(word_index), all of which are valid rows of the matrix.
for word, i in tqdm(word_index.items()):
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

# Baseline Model using LSTM

In [None]:
# Frozen embedding layer initialised from the pretrained embedding matrix.
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Embedding -> spatial dropout -> single LSTM -> sigmoid output unit.
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

optimzer = Adam(learning_rate=1e-5)

model.compile(
    loss='binary_crossentropy',
    optimizer=optimzer,
    metrics=['accuracy'],
)

In [None]:
model.summary()

In [None]:
# tweet_pad was built from the concatenation of Train followed by test, so the
# first train.shape[0] rows are the training tweets and the rest the test tweets.
train_ = tweet_pad[:train.shape[0]]
# NOTE: this rebinds `test`, replacing the DataFrame read from test.csv with the
# padded array; re-running the pd.concat([Train,test]) cell after this point
# will no longer see that DataFrame.
test = tweet_pad[train.shape[0]:]

In [None]:
# Hold out 15% of the labelled tweets for validation.
# random_state makes the split (and therefore the reported accuracy)
# reproducible between runs; stratify keeps the real/fake ratio the same in
# both parts.
labels = train['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    train_,
    labels,
    test_size=0.15,
    random_state=42,
    stratify=labels,
)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
# Train for 15 epochs in mini-batches of 4, evaluating on the held-out
# (X_test, y_test) split; the returned object is kept in `history`.
history = model.fit(X_train,y_train, batch_size = 4, epochs =15, validation_data = (X_test, y_test), verbose = 2)

So, we have got 78% accuracy using the LSTM baseline model.

# Making our submission

In [None]:
sample_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Predict a probability for each test tweet and round it to a 0/1 label.
y_pre = model.predict(test)
# reshape(-1) flattens the (n, 1) predictions whatever n is, instead of
# hard-coding the number of rows in the test set.
y_pre = np.round(y_pre).astype(int).reshape(-1)
# Pair the labels with the ids from the sample submission and write the file.
sub = pd.DataFrame({'id': sample_sub['id'].values.tolist(), 'target': y_pre})
sub.to_csv('submission.csv', index=False)

In [None]:
sub.head()

Please upvote my work if it could help! Thank you!