## Getting the datasets from kaggle by putting kaggle.json into .kaggle file in colab

In [2]:
from google.colab import files
files.upload()  # This will prompt you to upload the kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c nlp-getting-started

Saving kaggle.json to kaggle.json
Downloading nlp-getting-started.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 118MB/s]


In [3]:
!unzip nlp-getting-started.zip

Archive:  nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [4]:
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train_df_shuffled = train_df.sample(frac=1) # shuffle
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
963,1393,body%20bag,,new summer long thin body bag hip A word skirt...,0
6341,9064,structural%20failure,Asia,Rightways: Building structural integrity &amp;...,1
5648,8058,refugees,"Geneva, Switzerland",CHPSRE: RT: Refugees: For our followers in Par...,1
3527,5041,eyewitness,Los Angeles... CA... USA,Aug. 06 2015 Radio Show articles ÛÒ \n1] Eye...,0
1055,1524,body%20bags,Menlo Park. SFO. The World.,@asymbina @tithenai I'm hampered by only likin...,0


In [6]:
# The test data doesn't have a target (that's what we'd try to predict)
test_df.head()
train_df.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [56]:
from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
train_df_shuffled["target"].to_numpy(),
test_size=0.1,
random_state=42)

In [8]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [9]:
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))
# Now that is the average number of tokens in the tweets basically how long a tweet is
# Now let us also set maximum number of words to have in our vocabulary to 10000

15

In [10]:
# this customizes the entire process!!
max_vocab_length = 10000
max_length = 15 # model won't see beyond 15 words now

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [11]:
# fitting model
text_vectorizer.adapt(train_sentences)

In [12]:
sample_sentence = "There is an Earthquake"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 75,   9,  40, 289,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [13]:
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N      

Vectorized version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  11, 1796,  706,  147,   17,    2, 1030,  142,  236,   15,   23,
         554,    1,    0,    0]])>

In [14]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 least common words: ['paratroopers', 'paranormal', 'paramore', 'paramedics', 'paraguay']


## Now let us do embedding of text

In [15]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1")



In [16]:
# Get a random sentence from training set
random_sentence = random.choice(train_sentences)


# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed
# Each word is represented by a vector of size 128 for each 15 words

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.0298683 ,  0.02182151,  0.03981072, ..., -0.04243722,
          0.01807315, -0.04856914],
        [-0.01591773,  0.00319887,  0.00484834, ..., -0.01847974,
          0.02159813,  0.02071244],
        [-0.00896937, -0.03462081,  0.04334352, ..., -0.0130701 ,
          0.03184524, -0.0465586 ],
        ...,
        [ 0.00608571, -0.01694386,  0.02956127, ...,  0.02532858,
          0.02819366, -0.00179474],
        [-0.01456895, -0.04998534,  0.02617929, ..., -0.02708454,
         -0.03446424,  0.02480647],
        [ 0.04647377, -0.00638048, -0.00371443, ...,  0.03432273,
          0.02702049,  0.04860585]]], dtype=float32)>

## Now it is time to train our model

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

model_0.fit(train_sentences, train_labels)

In [127]:
print(val_sentences.shape)
base_pred = model_0.predict(val_sentences)

score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {score*100:.2f}%")

(762,)
Our baseline model achieves an accuracy of: 78.61%


In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [20]:
base_results = calculate_results(y_true=val_labels,
                                    y_pred=base_pred)
base_results

{'accuracy': 78.60892388451444,
 'precision': 0.799586842990503,
 'recall': 0.7860892388451444,
 'f1': 0.7778465339305505}

## Now let us start training deep learning models to get better results

In [21]:
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model

# this is as basic as it gets also without pooling error occurs because dense layer does not deal with 3D so it can cause an issue

In [22]:
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

array(['å_? New Ladies Shoulder Tote #Handbag Faux Leather Hobo Purse Cross Body Bag #Womens http://t.co/zujwUiomb3 http://t.co/YklTFj1FnC',
       'Toronto going crazy for the blue jays. Can you imagine if the leafs get good? The city might literally explode.',
       'News Alerts - Glimpses: Hyderabad deluged by heavy rainfall', ...,
       '@morehouse64 It appears our #Govt has lost an #Ethical and or moral relevance. This means the whole #USA population is in danger from them.',
       "The horrific story of being a hostage - The horrific story of being a hostage It's 1974 and on a British... http://t.co/XcQ48OuRvL",
       'Lose bus card.\nPanic.\nKind bus driver.\nReplace bus card.\nFind bus card.\nHeaddesk.'],
      dtype=object)

In [23]:
model_1_history = model_1.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.6498 - loss: 0.6500 - val_accuracy: 0.7612 - val_loss: 0.5391
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.8062 - loss: 0.4674 - val_accuracy: 0.7861 - val_loss: 0.4806
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.8529 - loss: 0.3638 - val_accuracy: 0.7900 - val_loss: 0.4687
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.8815 - loss: 0.2974 - val_accuracy: 0.7795 - val_loss: 0.4762
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9056 - loss: 0.2479 - val_accuracy: 0.7861 - val_loss: 0.4948


In [24]:
model_1.evaluate(val_sentences, val_labels)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7953 - loss: 0.4797


[0.49483004212379456, 0.7860892415046692]

In [124]:
print(val_sentences.dtype)
model_1_preds = model_1.predict(val_sentences)
model_1_preds = tf.squeeze(tf.round(model_1_preds))
model_1_results = calculate_results(y_true=val_labels,
                                    y_pred=model_1_preds)
model_1_results

object
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


{'accuracy': 78.60892388451444,
 'precision': 0.7864584398336184,
 'recall': 0.7860892388451444,
 'f1': 0.7834087202874683}

##Now let us use the hero LSTM

In [26]:
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")


# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the Tweet (we can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x)
print(x.shape)
x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")



(None, 15, 128)
(None, 64)


In [27]:
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [28]:
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 80ms/step - accuracy: 0.6704 - loss: 0.5822 - val_accuracy: 0.7808 - val_loss: 0.4758
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.8519 - loss: 0.3479 - val_accuracy: 0.7480 - val_loss: 0.5976
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 32ms/step - accuracy: 0.9207 - loss: 0.2144 - val_accuracy: 0.7375 - val_loss: 0.6957
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 41ms/step - accuracy: 0.9523 - loss: 0.1359 - val_accuracy: 0.7047 - val_loss: 0.7425
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.9592 - loss: 0.1125 - val_accuracy: 0.7152 - val_loss: 0.7145


In [29]:
model_2_pred_probs = model_2.predict(val_sentences)
# Round out predictions and reduce to 1-dimensional array
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_results = calculate_results(y_true=val_labels,
                                    y_pred=model_2_preds)
model_2_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step


{'accuracy': 71.5223097112861,
 'precision': 0.7166078555069777,
 'recall': 0.7152230971128609,
 'f1': 0.715766367411777}

## Let us make this LSTM bidirectional now for better results maybe?

In [30]:
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     name="embedding_4")

# Build a Bidirectional RNN in TensorFlow
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) # stacking RNN layers requires return_sequences=True
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_Bidirectional")

In [31]:
model_3.compile(loss = "binary_crossentropy",
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ["accuracy"])

In [32]:
history = model_3.fit(train_sentences,
                    train_labels,
                    epochs=5,
                    validation_data=(val_sentences, val_labels))

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 61ms/step - accuracy: 0.6781 - loss: 0.5768 - val_accuracy: 0.7756 - val_loss: 0.4735
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 63ms/step - accuracy: 0.8569 - loss: 0.3374 - val_accuracy: 0.7388 - val_loss: 0.5524
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 58ms/step - accuracy: 0.9194 - loss: 0.2118 - val_accuracy: 0.7008 - val_loss: 0.6399
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 60ms/step - accuracy: 0.9576 - loss: 0.1227 - val_accuracy: 0.7480 - val_loss: 0.7543
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 68ms/step - accuracy: 0.9674 - loss: 0.0965 - val_accuracy: 0.7375 - val_loss: 0.9025


In [33]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred  = tf.squeeze(tf.round(model_3_pred_probs))
model_3_results = calculate_results(val_labels, model_3_pred)
model_3_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 48ms/step


{'accuracy': 73.75328083989501,
 'precision': 0.7384470150140517,
 'recall': 0.7375328083989501,
 'f1': 0.7379083927640357}

## Now let us try CONV1D layers for finding relation between words

In [34]:
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_4")

# Create 1-dimensional convolutional layer to model sequences
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Conv1D")

# Compile Conv1D model
model_4.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a summary of our 1D convolution model
model_4.summary()



In [35]:
# Fit the model
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.6487 - loss: 0.6313 - val_accuracy: 0.7900 - val_loss: 0.4666
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.8339 - loss: 0.3797 - val_accuracy: 0.7874 - val_loss: 0.4898
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9095 - loss: 0.2391 - val_accuracy: 0.7717 - val_loss: 0.5737
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9488 - loss: 0.1523 - val_accuracy: 0.7730 - val_loss: 0.6468
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9647 - loss: 0.1075 - val_accuracy: 0.7717 - val_loss: 0.7119


In [36]:
model_4_pred = model_4.predict(val_sentences)
model_4_pred = tf.squeeze(tf.round(model_4_pred))

model_4_results = calculate_results(val_labels, model_4_pred)
model_4_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


{'accuracy': 77.16535433070865,
 'precision': 0.7712978692502261,
 'recall': 0.7716535433070866,
 'f1': 0.7690389079813207}

## Now let us try the big guns, transfer learning...

In [37]:
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") # load Universal Sentence Encoder
embed_samples = embed([sample_sentence,
                      "When you call the universal sentence encoder on a sentence, it turns it into numbers."])

In [38]:
print(embed_samples[:50])
print("\n\n",embed_samples[0].shape)

tf.Tensor(
[[-0.00783441  0.00661607 -0.02175899 ... -0.01374428 -0.04130764
  -0.02587626]
 [ 0.03596691 -0.08579468 -0.01152743 ... -0.03414335  0.02816024
  -0.00878945]], shape=(2, 512), dtype=float32)


 (512,)


In [39]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape = [],# No matter wht always outputs [512] outputs
                                         dtype = tf.string,
                                        trainable = False) # cause we are transfer learning not fine tuning

In [40]:
inputs = tf.keras.Input(shape=[], dtype=tf.string)

# Doing this because tensorflow hub keras layer is not considered KerasLayer by Sequential :(
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        trainable=False)

In [41]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers

def UniversalSentenceEncoderLayer(input_shape, trainable=False):
  """Creates a Keras layer that wraps the Universal Sentence Encoder from TensorFlow Hub.

  Args:
    input_shape: The shape of the input tensor.
    trainable: Whether the layer's weights should be trainable.

  Returns:
    A Keras layer that can be used to embed sentences.
  """
  # Load the Universal Sentence Encoder model from TensorFlow Hub
  sentence_encoder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

  # Create a Lambda layer that wraps the sentence encoder
  return layers.Lambda(lambda x: sentence_encoder(x), input_shape=input_shape, trainable=trainable,
                       output_shape=(512,))

inputs = tf.keras.Input(shape=(), dtype=tf.string)

# Use the custom UniversalSentenceEncoderLayer instead of hub.KerasLayer
sentence_encoder_layer = UniversalSentenceEncoderLayer(input_shape=(), trainable=False)

# Apply the USE layer to the input tensor
x = sentence_encoder_layer(inputs)

# Add Dense layers
x = layers.Dense(128, activation="relu")(x)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

# Create a Keras model using the functional API
model_6 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_6_USE")

# Compile the model
model_6.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Show the model summary
model_6.summary()

  super().__init__(**kwargs)


In [45]:
model_6_history = model_6.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9009 - loss: 0.2610 - val_accuracy: 0.7992 - val_loss: 0.5246
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9179 - loss: 0.2163 - val_accuracy: 0.7913 - val_loss: 0.5924
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9383 - loss: 0.1750 - val_accuracy: 0.7835 - val_loss: 0.6727
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9513 - loss: 0.1420 - val_accuracy: 0.7743 - val_loss: 0.7634
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9624 - loss: 0.1162 - val_accuracy: 0.7677 - val_loss: 0.8786


In [134]:
train_sentences.dtype

dtype('O')

In [53]:
model_6_prediction = model_6.predict(val_sentences)
model_6_pred = tf.squeeze(tf.round(model_6_prediction))

model_6_results = calculate_results(val_labels, model_6_pred)
model_6_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step


{'accuracy': 76.77165354330708,
 'precision': 0.770399431122115,
 'recall': 0.7677165354330708,
 'f1': 0.7684745594608743}

In [130]:
print(score)
print(model_1_results)
print(model_2_results)
print(model_3_results)
print(model_4_results)
print(model_6_results)

# Well looks like our first model is the best model eh? Given less data Deep learning would not work that well.

0.7860892388451444
{'accuracy': 78.60892388451444, 'precision': 0.7864584398336184, 'recall': 0.7860892388451444, 'f1': 0.7834087202874683}
{'accuracy': 71.5223097112861, 'precision': 0.7166078555069777, 'recall': 0.7152230971128609, 'f1': 0.715766367411777}
{'accuracy': 73.75328083989501, 'precision': 0.7384470150140517, 'recall': 0.7375328083989501, 'f1': 0.7379083927640357}
{'accuracy': 77.16535433070865, 'precision': 0.7712978692502261, 'recall': 0.7716535433070866, 'f1': 0.7690389079813207}
{'accuracy': 76.77165354330708, 'precision': 0.770399431122115, 'recall': 0.7677165354330708, 'f1': 0.7684745594608743}


## Let us visualize the most wrong predictions and the most right ones

In [60]:
print(val_sentences.shape)
val_df = pd.DataFrame({"text": val_sentences,
                       "target": val_labels,
                       "pred": model_6_pred,
                       "pred_prob": tf.squeeze(model_6_prediction)})
val_df.head()

(762,)


Unnamed: 0,text,target,pred,pred_prob
0,My portable closet has collapsed 3x and it fin...,0,0.0,0.099034
1,Sound judgement by MPC - premature rises could...,1,0.0,0.010637
2,Christian Attacked by Muslims at the Temple Mo...,1,1.0,0.999626
3,New York City Outbreak: What Is Legionnaire's ...,1,1.0,0.999988
4,8 hours of bagging groceries = an aching body,0,0.0,0.000313


In [62]:
most_wrong = val_df[val_df["target"] != val_df['pred']].sort_values("pred_prob")
most_wrong

Unnamed: 0,text,target,pred,pred_prob
21,#Nursing alert: Emergency Department Psychiatr...,1,0.0,0.000083
321,New #photo Oak in a snowstorm http://t.co/HK9Y...,1,0.0,0.000220
545,@SwellyJetEvo Disneyland! Tacos there are bomb!,1,0.0,0.000306
106,Campsite recommendations \nToilets /shower \nP...,1,0.0,0.000423
97,We happily support mydrought a project bringi...,1,0.0,0.000953
...,...,...,...,...
348,Russian customs destroyed a total of 319 tons ...,0,1.0,0.999858
435,Las Vegas in top 5 cities for red-light runnin...,0,1.0,0.999867
174,'Dangerous' property in downtown Phoenix demol...,0,1.0,0.999926
214,I hate this damn Milwaukee IndyFest. All the c...,0,1.0,0.999966


In [63]:
# Top 5 wrong predictions labelled as 0 when it was actually 1

most_wrong.head()

Unnamed: 0,text,target,pred,pred_prob
21,#Nursing alert: Emergency Department Psychiatr...,1,0.0,8.3e-05
321,New #photo Oak in a snowstorm http://t.co/HK9Y...,1,0.0,0.00022
545,@SwellyJetEvo Disneyland! Tacos there are bomb!,1,0.0,0.000306
106,Campsite recommendations \nToilets /shower \nP...,1,0.0,0.000423
97,We happily support mydrought a project bringi...,1,0.0,0.000953


In [64]:
# top 5 predictions labelled as 1 when it was actually 0
most_wrong.tail()

Unnamed: 0,text,target,pred,pred_prob
348,Russian customs destroyed a total of 319 tons ...,0,1.0,0.999858
435,Las Vegas in top 5 cities for red-light runnin...,0,1.0,0.999867
174,'Dangerous' property in downtown Phoenix demol...,0,1.0,0.999926
214,I hate this damn Milwaukee IndyFest. All the c...,0,1.0,0.999966
253,#hot C-130 specially modified to land in a st...,0,1.0,0.999986


In [140]:
# Making predictions on the test dataset
test_sentences = test_df["text"].tolist()
test_samples = random.sample(test_sentences, 10)
for test_sample in test_samples:
  test_sample_series = pd.Series([test_sample], name='text')
  pred_prob = tf.squeeze(model_1.predict([test_sample_series])) # has to be pandas series format
  pred = tf.round(pred_prob)
  print(f"Pred: {int(pred)}, Prob: {pred_prob}")
  print(f"Text:\n{test_sample}\n")
  print("----\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
Pred: 1, Prob: 0.9264178276062012
Text:
Candlelight vigil at my house tonight for the victims of the mass ant murder (by me) in my bathroom sink. There will be snacks.

----

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Pred: 0, Prob: 0.20561185479164124
Text:
and my dad is high I have a dysfunctional family

----

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
Pred: 1, Prob: 0.9924358129501343
Text:
#hot  C-130 specially modified to land in a stadium and rescue hostages in Iran in 1980 http://t.co/PtAI4zBpbI #prebreak #best

----

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Pred: 0, Prob: 0.021950004622340202
Text:
@misslyndaleigh The Original Lust  Angel her self Miss Leigh swooping down to cause mayhem &amp; pandemonium x http://t.co/BAnve2Xw4n

----

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Pred: 0, Pr

In [146]:
# Now predicting on our own stuff

sentence = "It is a major tsunami can lead to loss"
sentence = pd.Series(sentence)
print(tf.squeeze(model_6.predict([sentence])))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
tf.Tensor(0.85289246, shape=(), dtype=float32)
