In [1]:
import pandas as pd
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [3]:
train_sentences = train_df['text'].to_list()
train_labels = train_df['target'].to_numpy()
train_sentences[:10], train_labels[:30]

(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
  'Forest fire near La Ronge Sask. Canada',
  "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
  '13,000 people receive #wildfires evacuation orders in California ',
  'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
  '#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires',
  '#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas',
  "I'm on top of the hill and I can see a fire in the woods...",
  "There's an emergency evacuation happening now in the building across the street",
  "I'm afraid that the tornado is coming to our area..."],
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]))

In [4]:
len(train_sentences), len(train_labels)

(7613, 7613)

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
tf_hub_embedding_layer = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
                                       trainable = False,
                                       name = 'universal_sentence_encoder_layer')



In [6]:
def char_split(text):
    return " ".join(list(text))

In [7]:
train_char = [char_split(sentence) for sentence in train_sentences]
train_char[5]

'# R o c k y F i r e   U p d a t e   = >   C a l i f o r n i a   H w y .   2 0   c l o s e d   i n   b o t h   d i r e c t i o n s   d u e   t o   L a k e   C o u n t y   f i r e   -   # C A f i r e   # w i l d f i r e s'

In [8]:
import numpy as np
char_len = [len(char) for char in train_char]
avg_char_len = np.mean(char_len)
avg_char_len

201.0748719295941

In [9]:
output_seq_char_len = int(np.percentile(char_len, 95))
output_seq_char_len

279

In [10]:
import string
alphabets = string.ascii_lowercase + string.punctuation + string.digits
NUM_CHAR_TOKENS = len(alphabets) + 2
char_vectorization = tf.keras.layers.TextVectorization(max_tokens = NUM_CHAR_TOKENS,
                                                      output_sequence_length = output_seq_char_len)
char_vectorization.adapt(train_char)

In [11]:
char_vocab = char_vectorization.get_vocabulary()

In [13]:
char_embedding = tf.keras.layers.Embedding(input_dim = NUM_CHAR_TOKENS,
                                          output_dim = 25,
                                          mask_zero = False,
                                          name = 'char_embed_layer')

In [88]:
# token model:
token_input = tf.keras.layers.Input(shape = [], dtype = tf.string, name = 'token_model_input')
pretrained_embedding_layer = tf_hub_embedding_layer(token_input)
token_output = tf.keras.layers.Dense(128, activation = 'relu')(pretrained_embedding_layer)
token_model = tf.keras.Model(inputs = token_input,
                            outputs = token_output)

# char model:
char_input = tf.keras.layers.Input(shape = (1,), dtype = tf.string, name = 'char_model_input')
char_vectors = char_vectorization(char_input)
char_embed = char_embedding(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(25))(char_embed)
char_model = tf.keras.Model(inputs = char_input,
                            outputs = char_bi_lstm)
# hybrid model:
combined_model = tf.keras.layers.Concatenate(name = 'token_char_hybrid')([token_model.output, char_model.output])

# adding_dropout and dense layer:
combined_dropout = tf.keras.layers.Dropout(0.5)(combined_model)
combined_dense = tf.keras.layers.Dense(200, activation = 'relu')(combined_dropout)
final_dropout = tf.keras.layers.Dropout(0.5)(combined_dense)
output_layer = tf.keras.layers.Dense(1, activation = 'sigmoid')(final_dropout)

# final model:
model = tf.keras.Model(inputs = [token_model.input, char_model.input], outputs = output_layer)

In [89]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
train_labels_one_hot = one_hot_encoder.fit_transform(train_df['target'].to_numpy().reshape(-1, 1))

In [90]:
train_token_char_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_char))
train_token_char_labels = tf.data.Dataset.from_tensor_slices(train_labels)
train_token_char_datasets = tf.data.Dataset.zip((train_token_char_data, train_token_char_labels))
train_token_char_datasets = train_token_char_datasets.batch(32).prefetch(tf.data.AUTOTUNE)
train_token_char_datasets

<_PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None)), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [91]:
model.compile(loss = 'binary_crossentropy', optimizer = tf.keras.optimizers.Adam(), metrics = ['accuracy'])

In [92]:
early_callback = tf.keras.callbacks.EarlyStopping(monitor = 'accuracy', patience = 3, restore_best_weights = True)

In [93]:
model.fit(train_token_char_datasets,
         epochs = 32,
         steps_per_epoch = len(train_token_char_datasets),
         callbacks = early_callback)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32


<keras.src.callbacks.History at 0x79df312c7c10>

In [94]:
test_sentences = test_df['text'].to_list()
test_sentences[:10]

['Just happened a terrible car crash',
 'Heard about #earthquake is different cities, stay safe everyone.',
 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
 'Apocalypse lighting. #Spokane #wildfires',
 'Typhoon Soudelor kills 28 in China and Taiwan',
 "We're shaking...It's an earthquake",
 "They'd probably still show more life than Arsenal did yesterday, eh? EH?",
 'Hey! How are you?',
 'What a nice hat?',
 'Fuck off!']

In [95]:
test_char = [char_split(sentence) for sentence in test_sentences]
test_char[:5]

['J u s t   h a p p e n e d   a   t e r r i b l e   c a r   c r a s h',
 'H e a r d   a b o u t   # e a r t h q u a k e   i s   d i f f e r e n t   c i t i e s ,   s t a y   s a f e   e v e r y o n e .',
 't h e r e   i s   a   f o r e s t   f i r e   a t   s p o t   p o n d ,   g e e s e   a r e   f l e e i n g   a c r o s s   t h e   s t r e e t ,   I   c a n n o t   s a v e   t h e m   a l l',
 'A p o c a l y p s e   l i g h t i n g .   # S p o k a n e   # w i l d f i r e s',
 'T y p h o o n   S o u d e l o r   k i l l s   2 8   i n   C h i n a   a n d   T a i w a n']

In [96]:
self_sentence = ['Yash is a good boy. He study in class 10th.', 'Yesterday a man died due to cyclone.', 'There is no disaster yesterday, all were happy.']
self_sentence


['Yash is a good boy. He study in class 10th.',
 'Yesterday a man died due to cyclone.',
 'There is no disaster yesterday, all were happy.']

In [97]:
self_char = [char_split(sentence) for sentence in self_sentence]
self_char

['Y a s h   i s   a   g o o d   b o y .   H e   s t u d y   i n   c l a s s   1 0 t h .',
 'Y e s t e r d a y   a   m a n   d i e d   d u e   t o   c y c l o n e .',
 'T h e r e   i s   n o   d i s a s t e r   y e s t e r d a y ,   a l l   w e r e   h a p p y .']

In [98]:
tf.round(model.predict(x = (tf.constant(self_sentence),
                  tf.constant(self_char))))



<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[0.],
       [1.],
       [0.]], dtype=float32)>

In [99]:
test_sentences = test_df['text'].to_list()
test_sentences[:5]

['Just happened a terrible car crash',
 'Heard about #earthquake is different cities, stay safe everyone.',
 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
 'Apocalypse lighting. #Spokane #wildfires',
 'Typhoon Soudelor kills 28 in China and Taiwan']

In [100]:
test_char = [char_split(sentence) for sentence in test_sentences]
test_char[:5]

['J u s t   h a p p e n e d   a   t e r r i b l e   c a r   c r a s h',
 'H e a r d   a b o u t   # e a r t h q u a k e   i s   d i f f e r e n t   c i t i e s ,   s t a y   s a f e   e v e r y o n e .',
 't h e r e   i s   a   f o r e s t   f i r e   a t   s p o t   p o n d ,   g e e s e   a r e   f l e e i n g   a c r o s s   t h e   s t r e e t ,   I   c a n n o t   s a v e   t h e m   a l l',
 'A p o c a l y p s e   l i g h t i n g .   # S p o k a n e   # w i l d f i r e s',
 'T y p h o o n   S o u d e l o r   k i l l s   2 8   i n   C h i n a   a n d   T a i w a n']

In [101]:
model_pred_probs = model.predict(x = (tf.constant(test_sentences),
                  tf.constant(test_char)))
model_preds = tf.round(model_pred_probs)
model_preds[:10]



<tf.Tensor: shape=(10, 1), dtype=float32, numpy=
array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)>

In [103]:
my_submission = pd.DataFrame({'Id': test_df.id, 'target': tf.squeeze(model_preds)})
submission = my_submission.to_csv('submission.csv', index = False)
