<a href="https://colab.research.google.com/github/SarinaMashreghi/ML-notebooks/blob/main/NLP_tutorial_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data


In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2023-03-07 16:16:14--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.182.128, 173.194.193.128, 173.194.194.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.182.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-03-07 16:16:14 (105 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
import zipfile

# Extract the downloaded archive into the current working directory. The context
# manager closes the archive even if extraction raises.
with zipfile.ZipFile("nlp_getting_started.zip") as zip_ref:
  zip_ref.extractall()

In [None]:
import pandas as pd

# The archive is extracted into the current working directory, so read the CSVs
# relative to it instead of hardcoding Colab's /content directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
# Class balance: number of training rows for each value of `target`.
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Row counts of the training and test DataFrames.
train.shape[0], test.shape[0]

(7613, 3263)

In [None]:
import random

# Show one randomly chosen tweet from the training data.
train["text"][random.randrange(len(train))]

'@accionempresa China\x89Ûªs stock market crash this summer has sparked interest from bargain hunt... http://t.co/gO0pkrFzMF @gerenciatodos å¨'

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation; the fixed seed keeps the
# split repeatable.
all_text = train["text"].to_numpy()
all_labels = train["target"].to_numpy()
train_text, valid_text, train_labels, valid_labels = train_test_split(
    all_text,
    all_labels,
    test_size=0.1,
    random_state=42,
)

test_text = test["text"]

In [None]:
# Number of samples in the training and validation splits.
train_text.shape[0], valid_text.shape[0]

(6851, 762)

## Tokenization

In [None]:
import tensorflow as tf
# TextVectorization is exported from the stable `tensorflow.keras.layers` namespace;
# the `layers.experimental.preprocessing` path is a legacy alias that newer
# TensorFlow/Keras releases no longer provide.
from tensorflow.keras.layers import TextVectorization

# Layer built with its default settings spelled out, for reference.
text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=None)

In [None]:
# Average number of whitespace-separated tokens per training sentence, rounded
# to the nearest integer.
token_counts = [len(sentence.split()) for sentence in train_text]
avg_len = round(sum(token_counts) / len(token_counts))
avg_len

15

In [None]:
# Upper bound on the vocabulary size used by the vectorizer and embedding layers.
max_vocab_length = 10_000

In [None]:
# Vectorizer used by the models: integer token ids, vocabulary capped at
# max_vocab_length, every sequence padded/truncated to avg_len tokens.
text_vectorizer = TextVectorization(
    output_mode="int",
    max_tokens=max_vocab_length,
    output_sequence_length=avg_len,
)

In [None]:
# Build the vectorizer's vocabulary from the training sentences.
text_vectorizer.adapt(train_text)

In [None]:
# Pick a random training sentence and show its integer-encoded form.
rand_sentence = random.choice(train_text)
print(f"original sentence: {rand_sentence} \n\n vectorized: ")
text_vectorizer([rand_sentence])

NameError: ignored

## Embeddings


In [None]:
from tensorflow.keras import layers

# Embedding lookup: one 128-dimensional vector for each of the max_vocab_length
# token ids; input sequences are avg_len tokens long.
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             input_length=avg_len)

In [None]:
# Pick a random training sentence, vectorize it and pass it through the
# embedding layer to inspect the resulting tensor.
rand_sentence = random.choice(train_text)
print(rand_sentence)
embedding(text_vectorizer([rand_sentence]))

NameError: ignored

## Naive Bayes with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier.
tfidf_step = ("tfidf", TfidfVectorizer())
classifier_step = ("clf", MultinomialNB())
model_nb = Pipeline(steps=[tfidf_step, classifier_step])

model_nb.fit(train_text, train_labels)

In [None]:
# Score of the fitted pipeline on the validation split.
nb_score = model_nb.score(valid_text, valid_labels)
nb_score

0.7782152230971129

In [None]:
#evaluation function

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluation(y_true, y_pred):
  """Compute accuracy, F1, precision and recall for a set of predictions.

  Each metric is computed with the corresponding scikit-learn function using
  its default settings.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned element-wise with ``y_true``.

  Returns:
    dict with the keys "accuracy", "f1", "precision" and "recall".
  """
  # Named `scores` rather than `eval` so the built-in `eval` is not shadowed.
  scores = {
      "accuracy": accuracy_score(y_true, y_pred),
      "f1": f1_score(y_true, y_pred),
      "precision": precision_score(y_true, y_pred),
      "recall": recall_score(y_true, y_pred),
  }
  return scores


In [None]:
# Evaluate the Naive Bayes baseline on the validation split.
y_pred = model_nb.predict(valid_text)
evaluation(valid_labels, y_pred)

{'accuracy': 0.7782152230971129,
 'f1': 0.7703527809038113,
 'precision': 0.792992256322435,
 'recall': 0.7782152230971129}

## Feed-forward neural network

In [None]:
from tensorflow.keras import layers

# Raw strings -> token ids -> embeddings -> average over the sequence -> one logit.
inputs = layers.Input(shape=(1,), dtype=tf.string)
token_ids = text_vectorizer(inputs)
token_vectors = embedding(token_ids)
sentence_vector = layers.GlobalAveragePooling1D()(token_vectors)
outputs = layers.Dense(1)(sentence_vector)
model_dense = tf.keras.Model(inputs, outputs)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model_dense.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-tra

In [None]:
# The model's final Dense(1) layer has no activation, so the loss is configured
# to receive logits.
logit_bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
model_dense.compile(optimizer="adam", loss=logit_bce, metrics=["accuracy"])

model_dense.fit(
    train_text,
    train_labels,
    validation_data=(valid_text, valid_labels),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc6930fda60>

In [None]:
# Loss and accuracy on the validation split.
model_dense.evaluate(valid_text, valid_labels)



[0.6328520774841309, 0.7755905389785767]

In [None]:
# Convert the model's logits to probabilities, then to hard 0/1 integer labels.
dense_logits = model_dense.predict(valid_text)
dense_probs = tf.keras.activations.sigmoid(dense_logits)
preds = tf.squeeze(tf.cast(tf.round(dense_probs), tf.int32))
evaluation(valid_labels, preds)



{'accuracy': 0.7650918635170604,
 'f1': 0.7198748043818467,
 'precision': 0.759075907590759,
 'recall': 0.6845238095238095}

In [None]:
# Print the architecture again (used to read off the embedding layer's name).
model_dense.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d_1   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-tra

### Embedding projector

In [None]:
# Look the embedding layer up by type rather than by its auto-generated name
# ("embedding", "embedding_1", ...), which depends on how many layers have been
# created in the current session.
embedding_layer = next(layer for layer in model_dense.layers
                       if isinstance(layer, layers.Embedding))
embed_weights = embedding_layer.get_weights()
embed_weights

[array([[ 0.00888562, -0.09413256,  0.07076585, ..., -0.0352413 ,
         -0.00812659, -0.02846153],
        [-0.02164671, -0.03582695, -0.01340451, ..., -0.0601631 ,
          0.07407307, -0.0132545 ],
        [-0.03160125, -0.00958726,  0.02159344, ..., -0.0484019 ,
          0.0558673 ,  0.00854502],
        ...,
        [-0.05878653,  0.06276495, -0.09682883, ..., -0.08860903,
         -0.01544397, -0.11498877],
        [-0.04951746, -0.03317036,  0.03308517, ..., -0.00350804,
          0.04423368,  0.01025897],
        [-0.01844122,  0.01108814, -0.0166669 , ..., -0.02731975,
         -0.01029252, -0.03976574]], dtype=float32)]

In [None]:
import io

# Write one embedding vector per line to vectors.tsv and the matching word to
# metadata.tsv. The `with` block guarantees both files are closed even if a
# write fails part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(text_vectorizer.get_vocabulary()):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = embed_weights[0][index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

In [None]:
# Offer the exported vectors as a browser download when running in Colab.
# Only the "not running in Colab" case is tolerated; any other failure is
# surfaced instead of being silently swallowed.
try:
  from google.colab import files
except ImportError:
  print("google.colab is not available; skipping download of vectors.tsv")
else:
  files.download('vectors.tsv')
  # files.download('metadata.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## LSTM model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Give this model its own embedding layer. Reusing the shared `embedding` object
# would start from weights already updated by previously trained models and would
# also keep changing the weights those models rely on.
lstm_embedding = layers.Embedding(input_dim=max_vocab_length,
                                  output_dim=128,
                                  input_length=avg_len)

# Raw strings -> token ids -> embeddings -> LSTM (final state only) -> one logit.
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = lstm_embedding(x)
x = layers.LSTM(64)(x)
outputs = layers.Dense(1)(x)
model_lstm = tf.keras.Model(inputs, outputs)
model_lstm.summary()


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_6 (LSTM)               (None, 64)                49408     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
_________________________________________________

In [None]:
model_lstm.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                   optimizer="adam",
                   metrics=["accuracy"])

# Validate on the explicit hold-out split, as the dense model does, instead of
# `validation_split=0.1`, which would carve a second validation set out of the
# training data and leave fewer samples to train on.
model_lstm.fit(train_text, train_labels, epochs=10,
               validation_data=(valid_text, valid_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4da34d0310>

In [None]:
# Loss and accuracy on the validation split.
model_lstm.evaluate(valid_text, valid_labels)



[1.2426118850708008, 0.7467191815376282]

In [None]:
# Logits -> probabilities -> rounded 0/1 labels, then score against the truth.
lstm_logits = model_lstm.predict(valid_text)
preds = tf.squeeze(tf.round(tf.keras.activations.sigmoid(lstm_logits)))
evaluation(valid_labels, preds)



{'accuracy': 0.7362204724409449,
 'f1': 0.6968325791855203,
 'precision': 0.7064220183486238,
 'recall': 0.6875}

## GRU model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Give this model its own embedding layer. Reusing the shared `embedding` object
# would start from weights already updated by previously trained models and would
# also keep changing the weights those models rely on.
gru_embedding = layers.Embedding(input_dim=max_vocab_length,
                                 output_dim=128,
                                 input_length=avg_len)

# Raw strings -> token ids -> embeddings -> GRU (final state only) -> one logit.
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = gru_embedding(x)
x = layers.GRU(64)(x)
outputs = layers.Dense(1)(x)
model_gru = tf.keras.Model(inputs, outputs)
model_gru.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_________________________________________________

In [None]:
model_gru.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer="adam",
                  metrics=["accuracy"])

# Validate on the explicit hold-out split, as the dense model does, instead of
# `validation_split=0.1`, which would carve a second validation set out of the
# training data and leave fewer samples to train on.
model_gru.fit(train_text, train_labels, epochs=10,
              validation_data=(valid_text, valid_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4dac616070>

In [None]:
# Loss and accuracy on the validation split.
model_gru.evaluate(valid_text, valid_labels)



[1.3279330730438232, 0.7572178244590759]

In [None]:
# Logits -> probabilities -> rounded 0/1 labels, then score against the truth.
gru_logits = model_gru.predict(valid_text)
preds = tf.squeeze(tf.round(tf.keras.activations.sigmoid(gru_logits)))
evaluation(valid_labels, preds)



{'accuracy': 0.7519685039370079,
 'f1': 0.6936790923824959,
 'precision': 0.7615658362989324,
 'recall': 0.6369047619047619}

## Bidirectional RNN

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Give this model its own embedding layer. Reusing the shared `embedding` object
# would start from weights already updated by previously trained models and would
# also keep changing the weights those models rely on.
bi_embedding = layers.Embedding(input_dim=max_vocab_length,
                                output_dim=128,
                                input_length=avg_len)

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = bi_embedding(x)
# The first recurrent layer returns the full sequence so the second one can consume it.
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)
outputs = layers.Dense(1)(x)

model_bi = tf.keras.Model(inputs, outputs)
model_bi.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                                           

In [None]:
model_bi.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                 optimizer="adam",
                 metrics=["accuracy"])

# Validate on the explicit hold-out split, as the dense model does, instead of
# `validation_split=0.1`, which would carve a second validation set out of the
# training data and leave fewer samples to train on.
model_bi.fit(train_text, train_labels, epochs=10,
             validation_data=(valid_text, valid_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4dad5fe280>

In [None]:
# Loss and accuracy on the validation split.
model_bi.evaluate(valid_text, valid_labels)



[1.9371700286865234, 0.7112860679626465]

In [None]:
# Logits -> probabilities -> rounded 0/1 labels, then score against the truth.
bi_logits = model_bi.predict(valid_text)
preds = tf.squeeze(tf.round(tf.keras.activations.sigmoid(bi_logits)))
evaluation(valid_labels, preds)



{'accuracy': 0.7007874015748031,
 'f1': 0.6779661016949152,
 'precision': 0.6451612903225806,
 'recall': 0.7142857142857143}

## Conv1D model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Give this model its own embedding layer. Reusing the shared `embedding` object
# would start from weights already updated by previously trained models and would
# also keep changing the weights those models rely on.
conv_embedding = layers.Embedding(input_dim=max_vocab_length,
                                  output_dim=128,
                                  input_length=avg_len)

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = conv_embedding(x)
x = layers.Conv1D(filters=64, kernel_size=5, activation="relu", padding="same")(x)
x = layers.GlobalMaxPool1D()(x)
# Unlike the recurrent models, this head applies the sigmoid itself and outputs
# probabilities rather than logits.
outputs = layers.Dense(1, activation="sigmoid")(x)

model_conv1 = tf.keras.Model(inputs, outputs)
model_conv1.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 15, 64)            41024     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_9 (Dense)             (None, 1)                 65  

In [None]:
model_conv1.compile(loss="binary_crossentropy",
                    optimizer="adam",
                    metrics=["accuracy"])

# Validate on the explicit hold-out split, as the dense model does, instead of
# `validation_split=0.1`, which would carve a second validation set out of the
# training data and leave fewer samples to train on.
model_conv1.fit(train_text, train_labels, epochs=10,
                validation_data=(valid_text, valid_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f4da0178ac0>

In [None]:
# Loss and accuracy on the validation split.
model_conv1.evaluate(valid_text, valid_labels)



[1.4042891263961792, 0.7257217764854431]

In [None]:
# The model already outputs probabilities, so they are rounded directly to 0/1.
conv_probs = model_conv1.predict(valid_text)
preds = tf.squeeze(tf.round(conv_probs))
evaluation(valid_labels, preds)



{'accuracy': 0.7257217847769029,
 'f1': 0.6912850812407679,
 'precision': 0.6862170087976539,
 'recall': 0.6964285714285714}

## TF Hub feature extractor

In [None]:
import tensorflow_hub as hub

# Load the Universal Sentence Encoder (version 4) module from TF Hub.
# NOTE(review): `embed` is not referenced again in the visible cells; the
# KerasLayer created from the same `url` is what the models use — confirm
# whether this separate load is still needed.
url="https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(url)

In [None]:
# Wrap the TF Hub module at `url` as a Keras layer that takes string inputs.
# trainable=False keeps the encoder's weights frozen.
sentence_encoder_layer = hub.KerasLayer(url,
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False)



In [None]:
# Frozen sentence encoder followed by a small classification head that outputs
# a probability.
model_tl = tf.keras.Sequential()
for stacked_layer in (sentence_encoder_layer,
                      layers.Dense(64, activation="relu"),
                      layers.Dense(1, activation="sigmoid")):
  model_tl.add(stacked_layer)

model_tl.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 dense_8 (Dense)             (None, 64)                32832     
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [None]:
model_tl.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])

# Validate on the explicit hold-out split, as the dense model does, instead of
# `validation_split=0.1`, which would carve a second validation set out of the
# training data and leave fewer samples to train on.
model_tl.fit(train_text, train_labels, epochs=10,
             validation_data=(valid_text, valid_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f00ec38db50>

In [None]:
# Loss and accuracy on the validation split.
model_tl.evaluate(valid_text, valid_labels)



[0.4019123315811157, 0.8228346705436707]

In [None]:
# The model already outputs probabilities, so they are rounded directly to 0/1.
tl_probs = model_tl.predict(valid_text)
preds = tf.squeeze(tf.round(tl_probs))
evaluation(valid_labels, preds)



{'accuracy': 0.8228346456692913,
 'f1': 0.786053882725832,
 'precision': 0.8406779661016949,
 'recall': 0.7380952380952381}

### 10% of the training data

In [None]:
# Reproducible 10% sample of the text/target columns (fixed random_state).
# NOTE(review): this samples from the full `train` frame, which is also the
# source of the validation split, so sampled rows may overlap with `valid_text`.
train_10percent = train[["text", "target"]].sample(frac=0.1, random_state=42)
train_10percent

Unnamed: 0,text,target
2644,So you have a new weapon that can cause un-ima...,1
2227,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,Aftershock back to school kick off was great. ...,0
6845,in response to trauma Children of Addicts deve...,0
...,...,...
1153,Today is the day Hiroshima got Atomic bomb 70 ...,1
1034,@Deeeznvtzzz bring the body bags tho,0
3107,Fr cuz I risk being electrocuted every shower ...,0
4379,RT NotExplained: The only known image of infam...,0


In [None]:
# Split the 10% sample into its text and label columns.
train_10_text, train_10_labels = train_10percent["text"], train_10percent["target"]

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Frozen sentence encoder with a Dense/Dropout classification head.
tl2_layers = [
    sentence_encoder_layer,
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(1, activation="sigmoid"),
]
model_tl2 = tf.keras.Sequential(tl2_layers)

compile_options = dict(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
model_tl2.compile(**compile_options)

# Train on the 10% sample, validating on the hold-out split.
model_tl2.fit(
    train_10_text,
    train_10_labels,
    epochs=10,
    validation_data=(valid_text, valid_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f00ff7e6100>

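Data leakage problem: `train_10percent` was sampled from the full `train` DataFrame, which still contains the rows held out as the validation set, so the validation scores of `model_tl2` are overly optimistic.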

In [None]:
# The model already outputs probabilities, so they are rounded directly to 0/1.
tl2_probs = model_tl2.predict(valid_text)
preds = tf.squeeze(tf.round(tl2_probs))
evaluation(valid_labels, preds)



{'accuracy': 0.8713910761154856,
 'f1': 0.852852852852853,
 'precision': 0.8606060606060606,
 'recall': 0.8452380952380952}

In [None]:
# Draw a single 10% sample of (text, label) pairs from the training split.
# Sampling the texts and the labels in two independent `.sample()` calls picks
# different rows for each, so the labels would no longer belong to the texts.
train_10_sample = pd.DataFrame({"text": train_text,
                                "target": train_labels}).sample(frac=0.1, random_state=42)
train_10_text2 = train_10_sample["text"].to_numpy()
train_10_labels2 = train_10_sample["target"].to_numpy()

In [None]:
# Smaller variant: the sentence encoder feeds a single sigmoid unit directly
# (the hidden Dense/Dropout layers are left commented out).
model_tl3 = tf.keras.Sequential([
    sentence_encoder_layer,
    # layers.Dense(64, activation="relu"),
    # layers.Dropout(0.2),
    layers.Dense(1, activation="sigmoid")
])
model_tl3.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
# Train for 20 epochs on the second 10% sample, validating on the hold-out split.
model_tl3.fit(train_10_text2, train_10_labels2,
              validation_data=(valid_text, valid_labels),
              epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f00e00fb5b0>

## Finding wrong predictions

In [None]:
# Collect each validation tweet with its label, predicted probability and
# rounded prediction in one table.
probs = tf.squeeze(model_tl.predict(valid_text))
preds = tf.round(probs)
val_columns = {
    "text": valid_text,
    "target": valid_labels,
    "probs": probs,
    "prediction": preds,
}
val_df = pd.DataFrame(val_columns)

val_df



Unnamed: 0,text,target,probs,prediction
0,So you have a new weapon that can cause un-ima...,1,0.081558,0.0
1,The f$&amp;@ing things I do for #GISHWHES Just...,0,0.067107,0.0
2,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1,0.511115,1.0
3,Aftershock back to school kick off was great. ...,0,0.161729,0.0
4,in response to trauma Children of Addicts deve...,0,0.308383,0.0
...,...,...,...,...
757,@Deeeznvtzzz bring the body bags tho,0,0.071297,0.0
758,Fr cuz I risk being electrocuted every shower ...,0,0.155424,0.0
759,RT NotExplained: The only known image of infam...,0,0.410349,0.0
760,Read this already in '14 but it was and remain...,0,0.120279,0.0


In [None]:
# Flag the rows where the rounded prediction matches the true label.
val_df["correctness"] = val_df["target"].eq(val_df["prediction"])
val_df

Unnamed: 0,text,target,probs,prediction,correctness
0,So you have a new weapon that can cause un-ima...,1,0.081558,0.0,False
1,The f$&amp;@ing things I do for #GISHWHES Just...,0,0.067107,0.0,True
2,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1,0.511115,1.0,True
3,Aftershock back to school kick off was great. ...,0,0.161729,0.0,True
4,in response to trauma Children of Addicts deve...,0,0.308383,0.0,True
...,...,...,...,...,...
757,@Deeeznvtzzz bring the body bags tho,0,0.071297,0.0,True
758,Fr cuz I risk being electrocuted every shower ...,0,0.155424,0.0,True
759,RT NotExplained: The only known image of infam...,0,0.410349,0.0,True
760,Read this already in '14 but it was and remain...,0,0.120279,0.0,True


In [None]:
# The ten wrong predictions with the highest predicted probability.
wrong_rows = val_df.loc[~val_df["correctness"]]
fp = wrong_rows.sort_values(by="probs", ascending=False).head(10)
fp

Unnamed: 0,text,target,probs,prediction,correctness
656,Russian customs destroyed a total of 319 tons ...,0,0.982656,1.0,False
158,Over half of poll respondents worry nuclear di...,0,0.964279,1.0,False
608,USW: 'The damage from abandoning the deal coul...,0,0.955624,1.0,False
428,HereÛªs how media in Pakistan covered the cap...,0,0.922767,1.0,False
384,@FoxNewsInsider All Obama is doing is giving a...,0,0.906937,1.0,False
688,Upheaval high note for bush opera http://t.co/...,0,0.897514,1.0,False
455,I hate this damn Milwaukee IndyFest. All the c...,0,0.894666,1.0,False
458,Today was trauma on top of trauma on top of t...,0,0.889181,1.0,False
163,Putin's plan to destroy Western food en masse ...,0,0.875192,1.0,False
704,RT '@NASASolarSystem: Jupiter's Red Spot is a ...,0,0.872328,1.0,False


In [None]:
# The ten wrong predictions with the lowest predicted probability.
misclassified = val_df.loc[~val_df["correctness"]]
fn = misclassified.sort_values(by="probs", ascending=True).head(10)
fn

Unnamed: 0,text,target,probs,prediction,correctness
498,Businesses are deluged with invoices. Make you...,1,0.023028,0.0,False
345,burned 129 calories doing 24 minutes of Walkin...,1,0.026382,0.0,False
593,all that panicking made me tired ;__; i want t...,1,0.033068,0.0,False
187,Even if u have your weapon and your badge we g...,1,0.053592,0.0,False
599,Reddit's new content policy goes into effect m...,1,0.054613,0.0,False
516,Hellfire is surrounded by desires so be carefu...,1,0.057821,0.0,False
584,Reddit Will Now Quarantine Offensive Content h...,1,0.058818,0.0,False
159,Me watching Law &amp; Order (IB: @sauldale305)...,1,0.059646,0.0,False
7,@brianroemmele UX fail of EMV - people want to...,1,0.05993,0.0,False
292,@Kirafrog @mount_wario Did you get wrecked again?,1,0.067952,0.0,False


In [None]:
# Print the untruncated text of each selected row with its label and score.
# Each tuple is (index, text, target, probs, prediction, correctness).
for _, text, target, prob, pred, _ in fp.itertuples():
  print(f"target: {target}, pred: {pred}, prob:{prob}\n\n")
  print(text, "\n\n")

target: 0, pred: 1.0, prob:0.9826558828353882


Russian customs destroyed a total of 319 tons of food today phew! Some Italian meats were burned in an incinerator in Pulkovo airport. 


target: 0, pred: 1.0, prob:0.964278519153595


Over half of poll respondents worry nuclear disaster fading from public consciousness http://t.co/YtnnnD631z ##fukushima 


target: 0, pred: 1.0, prob:0.9556236863136292


USW: 'The damage from abandoning the deal could well create a new level of uncertainty...economic upheaval &amp; military unrest' 


target: 0, pred: 1.0, prob:0.9227669835090637


HereÛªs how media in Pakistan covered the capture of terrorist Mohammed Naved http://t.co/f7WqpCEkg2 


target: 0, pred: 1.0, prob:0.9069372415542603


@FoxNewsInsider All Obama is doing is giving a false time schedule on Iran testing there first bomb      Bomb = Nuclear Suicide Vest 


target: 0, pred: 1.0, prob:0.8975135087966919


Upheaval high note for bush opera http://t.co/aWPU0gaE0b #Sydney #News #Aus 


## Test data

In [None]:
# Predict on the test tweets and collect text, probability and rounded
# prediction in one table.
test_sentences = test["text"].tolist()
probs = tf.squeeze(model_tl.predict(test_sentences))
preds = tf.round(probs)
test_columns = {
    "text": test["text"],
    "probs": probs,
    "prediction": preds,
}
test_df = pd.DataFrame(test_columns)

test_df



Unnamed: 0,text,probs,prediction
0,Just happened a terrible car crash,0.672186,1.0
1,"Heard about #earthquake is different cities, s...",0.932818,1.0
2,"there is a forest fire at spot pond, geese are...",0.848824,1.0
3,Apocalypse lighting. #Spokane #wildfires,0.964886,1.0
4,Typhoon Soudelor kills 28 in China and Taiwan,0.982368,1.0
...,...,...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0.911789,1.0
3259,Storm in RI worse than last hurricane. My city...,0.989187,1.0
3260,Green Line derailment in Chicago http://t.co/U...,0.945420,1.0
3261,MEG issues Hazardous Weather Outlook (HWO) htt...,0.741356,1.0


In [None]:
# The ten test tweets with the highest predicted probability.
p = test_df.sort_values(by="probs", ascending=False).iloc[:10]
p

Unnamed: 0,text,probs,prediction
1684,Typhoon Soudelor approaches after 7 killed 2 m...,0.998936,1.0
1444,Family evacuated after weather bomb http://t.c...,0.998776,1.0
1912,#???? #?? #??? #??? Udhampur terror attack: Mi...,0.998558,1.0
1449,13000 evacuated as California firefighters fig...,0.998394,1.0
2820,The 390-Year-Old Tree That Survived the Bombin...,0.998389,1.0
548,#DNR\nRes. buildings shelled &amp; on fire in ...,0.998389,1.0
668,The Catastrophic Effects of Hiroshima and Naga...,0.998258,1.0
2050,Today marks 70 years since the mass murder of ...,0.998242,1.0
1443,#FortitudeValley unit damaged and residents ev...,0.998216,1.0
167,Arson suspect linked to 30 fires caught in Nor...,0.998203,1.0


In [None]:
# Print the untruncated text of each selected row with its prediction and score.
# Each tuple is (index, text, probs, prediction).
for _, text, prob, pred in p.itertuples():
  print(f"pred: {pred}, prob:{prob}\n\n")
  print(text, "\n\n")

pred: 1.0, prob:0.9989362359046936


Typhoon Soudelor approaches after 7 killed 2 missing in floods in Philippines http://t.co/hALJNnWrwi via @abc7chicago 


pred: 1.0, prob:0.9987761378288269


Family evacuated after weather bomb http://t.co/2A4z8pmvVE 


pred: 1.0, prob:0.9985575079917908


#???? #?? #??? #??? Udhampur terror attack: Militants attack police post 2 SPOs injured - Times of   http://t.co/1KxsGlsTA7 


pred: 1.0, prob:0.9983939528465271


13000 evacuated as California firefighters fight flames to save homes: CLEARLAKE OAKS Calif. ÛÓ Wildfires lik... http://t.co/xwBYeaOWMw 


pred: 1.0, prob:0.9983891248703003


The 390-Year-Old Tree That Survived the Bombing of Hiroshima http://t.co/kEirA8MA3K 


pred: 1.0, prob:0.9983887672424316


#DNR
Res. buildings shelled &amp; on fire in #Gorlovka tonight.
Civilian casualties Jan-July 2015:ÛÓ
164 killed (incl. 16 children)
501 wounded 


pred: 1.0, prob:0.9982579350471497


The Catastrophic Effects of Hiroshima and Nagasaki Atomi

In [None]:
# The ten test tweets with the lowest predicted probability.
f = test_df.sort_values(by="probs", ascending=True).iloc[:10]
f

Unnamed: 0,text,probs,prediction
832,#GoT season 5 - funniest season ever. Hilariou...,0.006405,0.0
1198,@MikeParrActor devastated your no longer in em...,0.00774,0.0
397,@ashwilliams1 continues to be the best guest o...,0.008684,0.0
399,Im so bloody excited to see Maisy and Martha,0.009332,0.0
209,To love you love you love you ... Massive Att...,0.009678,0.0
2630,want a new season of The League and Sirens on ...,0.009923,0.0
907,Please tell me Ross isn't actually dead...\nI ...,0.010228,0.0
3245,I just wanna ease your mind and make you feel ...,0.01034,0.0
1808,Would you consider yourself good at giving adv...,0.01071,0.0
2824,How would you like to be remembered? ÛÓ On ho...,0.011557,0.0


In [None]:
# Print the untruncated text of each selected row with its prediction and score.
# Each tuple is (index, text, probs, prediction).
for _, text, prob, pred in f.itertuples():
  print(f"pred: {pred}, prob:{prob}\n\n")
  print(text, "\n\n")

pred: 0.0, prob:0.0064053721725940704


#GoT season 5 - funniest season ever. Hilarious. #GameOfThrones you crushed it. OMG. #CantStopLaughing #Wow #Comedy 


pred: 0.0, prob:0.007740059867501259


@MikeParrActor devastated your no longer in emmerdale best character with so much more to give #superbactor your going to be missed 


pred: 0.0, prob:0.008684330619871616


@ashwilliams1 continues to be the best guest on @iLoveGGLetters. This week's episode is bloody outrageous. 


pred: 0.0, prob:0.009331697598099709


Im so bloody excited to see Maisy and Martha 


pred: 0.0, prob:0.00967782735824585


To love you love you love you ...  Massive Attack - Angel (HD) https://t.co/9TW34Gffox vÌ_a @YouTube 


pred: 0.0, prob:0.009922823868691921


want a new season of The League and Sirens on Netflix. 


pred: 0.0, prob:0.010228249244391918


Please tell me Ross isn't actually dead...
I hope they're just playing us until the next episode. #emmerdale 


pred: 0.0, prob:0.010340315289795399


I 

## Speed/score trade-off

In [None]:
import time

def pred_time(model, data):
  """Return the average wall-clock time, in seconds, of predicting one sample.

  Args:
    model: Any object exposing ``predict(data)``.
    data: Sized collection of samples, passed to ``model.predict`` in one call.

  Returns:
    Elapsed ``time.perf_counter()`` seconds for the single ``predict`` call,
    divided by ``len(data)``.

  Raises:
    ZeroDivisionError: If ``data`` is empty.
  """
  start = time.perf_counter()
  model.predict(data)  # only the timing matters; the predictions are discarded
  finish = time.perf_counter()
  return (finish - start) / len(data)

In [None]:
# Average per-sample prediction time of the sentence-encoder model vs. the
# Naive Bayes pipeline. (The first call was misspelled `pred_tiem`, which is
# not defined anywhere in the notebook.)
test_sentences = test["text"].to_list()
pred_time(model_tl, test_sentences), pred_time(model_nb, test_sentences)



(1.1810715198516846, 1.5588759423878597e-05)