In [1]:
import tensorflow as tf 
import matplotlib.pyplot as plt 
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

##### Reading dataset

In [6]:
train_df = pd.read_csv('./dataset/train.csv')

In [7]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


##### Making pipeline for cleaning data

In [8]:
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.columns_to_drop)
    
def process_text_column(text_column):
    return text_column.replace('[^a-zA-Z0-9" "]', '', regex=True)


pipeline = Pipeline([
    ('drop_columns', DropColumns(columns_to_drop=['keyword', 'location', 'id'])),
    ('process_text', FunctionTransformer(func=process_text_column, validate=False))
])


##### Using pipeline to clean data

In [9]:
train_df = pipeline.transform(train_df)
train_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this earthquake Ma...,1
1,Forest fire near La Ronge Sask Canada,1
2,All residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,Just got sent this photo from Ruby Alaska as s...,1


##### Converting text into lower case

In [10]:
train_df['text'] = train_df['text'].apply(lambda x: " ".join(word.lower() for word in x.split()))

##### Counting number of unique words

In [11]:
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

counter = counter_word(train_df.text)
num_of_unique_words = len(counter)
num_of_unique_words

22546

##### Tokenize text

In [12]:
X_train = train_df['text']
y_train = train_df['target']

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_of_unique_words)
tokenizer.fit_on_texts(X_train)
word_index  = tokenizer.word_index
word_index

{'the': 1,
 'a': 2,
 'in': 3,
 'to': 4,
 'of': 5,
 'and': 6,
 'i': 7,
 'is': 8,
 'for': 9,
 'on': 10,
 'you': 11,
 'my': 12,
 'with': 13,
 'it': 14,
 'that': 15,
 'at': 16,
 'by': 17,
 'this': 18,
 'from': 19,
 'be': 20,
 'are': 21,
 'have': 22,
 'was': 23,
 'like': 24,
 'as': 25,
 'up': 26,
 'just': 27,
 'so': 28,
 'me': 29,
 'but': 30,
 'im': 31,
 'amp': 32,
 'not': 33,
 'your': 34,
 'its': 35,
 'out': 36,
 'after': 37,
 'will': 38,
 'all': 39,
 'when': 40,
 'no': 41,
 'an': 42,
 'fire': 43,
 'has': 44,
 'if': 45,
 'we': 46,
 'get': 47,
 'new': 48,
 'via': 49,
 'now': 50,
 'more': 51,
 'about': 52,
 'dont': 53,
 'or': 54,
 'what': 55,
 'people': 56,
 'he': 57,
 'they': 58,
 'been': 59,
 'one': 60,
 'how': 61,
 'over': 62,
 'news': 63,
 'who': 64,
 'into': 65,
 'do': 66,
 'were': 67,
 'video': 68,
 'us': 69,
 '2': 70,
 'can': 71,
 'emergency': 72,
 'disaster': 73,
 'there': 74,
 'police': 75,
 'than': 76,
 'her': 77,
 'would': 78,
 'some': 79,
 'still': 80,
 'his': 81,
 'body': 82,
 '

In [13]:
train_sequence = tokenizer.texts_to_sequences(X_train)

print(X_train[10:15])
print(train_sequence[10:15])

10          three people died from the heat wave so far
11    haha south tampa is getting flooded hah wait a...
12    raining flooding florida tampabay tampa 18 or ...
13                flood in bago myanmar we arrived bago
14    damage to school bus on 80 in multi car crash ...
Name: text, dtype: object
[[578, 56, 597, 19, 1, 275, 430, 28, 598], [824, 599, 2800, 8, 209, 2801, 3441, 650, 2, 630, 7, 196, 3, 599, 2800, 55, 155, 7, 291, 66, 55, 155, 7, 291, 66, 6741, 241], [2802, 241, 1695, 6742, 2800, 1444, 54, 1169, 600, 260, 728, 3442], [217, 3, 4452, 905, 46, 1696, 4452], [225, 4, 178, 382, 10, 3443, 3, 4453, 122, 86, 349]]


##### Applying padding

In [14]:
max_length = 20
train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequence, maxlen=max_length, padding='post', truncating='post')

train_padded.shape ,train_padded[10]

((7613, 20),
 array([578,  56, 597,  19,   1, 275, 430,  28, 598,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]))

##### Decoding token into text

In [29]:
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

def decode(sequence):
    return " ".join([reverse_word_index.get(idx, '?') for idx in sequence])

decode(train_sequence[10])

'three people died from the heat wave so far'

##### Train Model

In [None]:
tf.random.set_seed(42)
tf.keras.backend.clear_session()

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(num_of_unique_words, 32, input_length=max_length),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.GRU(128, dropout=0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss="binary_crossentropy", optimizer="adam", 
    metrics=["accuracy"]) 

history = model.fit(train_padded, y_train,
                validation_split=0.2, epochs=10)

##### Testing Model

In [None]:
plt.plot(history.history['loss'], c='b', label='train')
plt.plot(history.history['val_loss'], c='r', label='test')

plt.legend()
plt.show()

In [None]:
plt.plot(history.history['accuracy'], c='b', label='train')
plt.plot(history.history['val_accuracy'], c='r', label='test')

plt.legend()
plt.show()

In [4]:
model = tf.keras.models.load_model('tweets.h5')

In [31]:
pred = model.predict(train_padded)
pred = [1 if i > 0.5 else 0 for i in pred]
pred[:5], y_train[:5]



([1, 1, 1, 1, 1],
 0    1
 1    1
 2    1
 3    1
 4    1
 Name: target, dtype: int64)

In [28]:
accuracy_score(y_train, pred)

ValueError: Found input variables with inconsistent numbers of samples: [7613, 1]

In [None]:
X_test = ['my house is on fire due to blast']
test_sequence = tokenizer.texts_to_sequences(X_test)

In [None]:
test_padded = tf.keras.preprocessing.sequence.pad_sequences(test_sequence, maxlen=max_length, padding='post', truncating='post')
test_padded.shape

In [None]:
pred = model.predict(test_padded)
pred = [1 if i > 0.5 else 0 for i in pred]
pred