In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [5]:
train = pd.read_csv("../../../data/real_or_not_nlp/train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
import re
from string import punctuation
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

english_stopwords = stopwords.words('english')
stemmer = PorterStemmer()

def transform_tweet(text):
    text = re.sub("#", "", text)
    text = re.sub("\@[a-zA-Z_-]+", "", text)
    text = re.sub("^\s+", "", text)
    text = re.sub("\s+$", "", text)
    text = re.sub(f'[{punctuation}]', "", text)
    
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(text)
    words = [word for word in words if word not in english_stopwords]
    words = [stemmer.stem(word) for word in words]
    
    return words

In [7]:
transformed_train = [transform_tweet(text) for text in tqdm(train.text)]

100%|██████████| 7613/7613 [00:02<00:00, 2773.81it/s]


In [8]:
transformed_train[0]

['our', 'deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'us']

In [9]:
vocab = {}
for train_sentence in transformed_train:
    for word in train_sentence:
        if word not in vocab:
            vocab[word] = len(vocab)

In [10]:
vocab

{'our': 0,
 'deed': 1,
 'reason': 2,
 'earthquak': 3,
 'may': 4,
 'allah': 5,
 'forgiv': 6,
 'us': 7,
 'forest': 8,
 'fire': 9,
 'near': 10,
 'La': 11,
 'rong': 12,
 'sask': 13,
 'canada': 14,
 'all': 15,
 'resid': 16,
 'ask': 17,
 'shelter': 18,
 'place': 19,
 'notifi': 20,
 'offic': 21,
 'No': 22,
 'evacu': 23,
 'order': 24,
 'expect': 25,
 '13000': 26,
 'peopl': 27,
 'receiv': 28,
 'wildfir': 29,
 'california': 30,
 'just': 31,
 'got': 32,
 'sent': 33,
 'photo': 34,
 'rubi': 35,
 'alaska': 36,
 'smoke': 37,
 'pour': 38,
 'school': 39,
 'rockyfir': 40,
 'updat': 41,
 'hwi': 42,
 '20': 43,
 'close': 44,
 'direct': 45,
 'due': 46,
 'lake': 47,
 'counti': 48,
 'cafir': 49,
 'flood': 50,
 'disast': 51,
 'heavi': 52,
 'rain': 53,
 'caus': 54,
 'flash': 55,
 'street': 56,
 'manit': 57,
 'colorado': 58,
 'spring': 59,
 'area': 60,
 'Im': 61,
 'top': 62,
 'hill': 63,
 'I': 64,
 'see': 65,
 'wood': 66,
 'there': 67,
 'emerg': 68,
 'happen': 69,
 'build': 70,
 'across': 71,
 'afraid': 72,
 'to

In [11]:
len(vocab)

19589

In [74]:
vocab_size = len(vocab)
embeddings_size = 256
bs = 64

model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(vocab_size,)),
        tf.keras.layers.Embedding(embeddings_size, bs),
        tf.keras.layers.Dense(1, activation='softmax'),
    ]
)

In [75]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 19589, 64)         16384     
_________________________________________________________________
dense_3 (Dense)              (None, 19589, 1)          65        
Total params: 16,449
Trainable params: 16,449
Non-trainable params: 0
_________________________________________________________________


In [68]:
def vectorize_tweet(transformed_tweet):
    vector = np.zeros(vocab_size)
    
    for word in transformed_tweet:
        if word in vocab:
            vector[vocab.get(word)] = 1.
            
    return vector / sum(vector)

In [69]:
vectorized_train = [vectorize_tweet(tweet) for tweet in tqdm(transformed_train)]

100%|██████████| 7613/7613 [00:24<00:00, 317.06it/s]


In [76]:
model(vectorized_train[0])

<tf.Tensor: shape=(19589, 1), dtype=float32, numpy=
array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)>

In [77]:
model.compile(optimizer='adam', loss=tf.keras.losses.binary_crossentropy, metrics=['acc'])

In [78]:
X = np.array(vectorized_train)
y = train.target.values.astype(np.float)

In [79]:
model.fit(x=X, y=y, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7ff55acc21d0>

In [80]:
test = pd.read_csv("../../../data/real_or_not_nlp/test.csv")
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [81]:
transformed_test = [transform_tweet(text) for text in tqdm(test.text)]
vectorized_ttes = [vectorize_tweet(tweet) for tweet in tqdm(transformed_test)]

100%|██████████| 3263/3263 [00:01<00:00, 2811.57it/s]
  
100%|██████████| 3263/3263 [00:10<00:00, 321.81it/s]


In [82]:
res = model.predict(x=vectorized_ttes, batch_size=64)



In [87]:
res = res.reshape(19589,)

In [91]:
pd.Series(res).value_counts()

1.0    19589
dtype: int64

In [93]:
test.shape

(3263, 4)

In [94]:
res.shape

(19589,)