<a href="https://colab.research.google.com/github/Pedro0504/demo-repo/blob/main/fcc_sms_text_classification_bid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
import random as rd

print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

# lets explore the dataset
# The TSV files have no header row: every line is "<label>\t<message>".
# Reading them with the default header=0 would swallow the first message of
# each file as column names (losing one sample per file), so the column names
# are supplied explicitly instead of being renamed afterwards.
SMS_COLUMNS = ['label', 'sms']
train_data = pd.read_csv(train_file_path, delimiter='\t', header=None, names=SMS_COLUMNS)
test_data = pd.read_csv(test_file_path, delimiter='\t', header=None, names=SMS_COLUMNS)

# Both files are merged into one frame; it is re-split into train/test later.
combine_data = pd.concat([train_data, test_data]).copy()

# NOTE: the experiment below is disabled. The data has a class that dominates
# the labels; the idea was to create new examples of the less represented class
# ('spam') by removing one random word from each message and appending the
# result to the original data. It is kept as comments rather than as a bare
# string, which the notebook would echo as the cell output.
#
# def transform(text):
#   words = text.split()
#   if len(words) > 1:
#     ind = rd.randint(0, len(words)-1)
#     return ' '.join(words[:ind]+words[ind+1:])
#   return text
# spam = combine_data[combine_data['label'] == 'spam'].copy()
# spam['sms'] = spam['sms'].map(transform)
# combine_data = pd.concat([combine_data, spam])
# combine_data.shape

In [None]:
# The 'ham' label dominates the data set. Split the rows by class here; the
# down-sampling of 'ham' sketched in the string below is currently disabled.
label_column = combine_data['label']
ham = combine_data[label_column == 'ham']
spam = combine_data[label_column == 'spam']
"""
ham_min = ham.sample(n=len(spam), random_state = 42)
ham_half = ham.sample(frac = 0.3, random_state = 42)
combine_data = pd.concat([ham_min, spam])
combine_data.shape
"""


In [None]:
MAXLEN = 160        # every message is padded/truncated to this many tokens
VOCAB_SIZE = 1250   # number of most frequent words kept by the tokenizer

# 70/30 split of the combined frame, with a fixed seed so it is reproducible
train_data, test_data = train_test_split(
    combine_data,
    test_size=0.3,
    random_state=42,
)

# Encode the text labels as integers (ham -> 0, spam -> 1). pop() also removes
# the 'label' column from the feature frames.
LABEL_TO_ID = {'ham': 0, 'spam': 1}
train_label = train_data.pop('label').map(LABEL_TO_ID)
test_label = test_data.pop('label').map(LABEL_TO_ID)


#vectorize
# Fit ONE tokenizer, on the training messages only, and reuse it everywhere.
# Fitting a fresh Tokenizer on every call gives each text set its own
# word -> index mapping, so the ids fed to the model for validation or for a
# single predicted message would not match the ids it was trained on.
sms_tokenizer = Tokenizer(num_words = VOCAB_SIZE)
sms_tokenizer.fit_on_texts(train_data['sms'])

def tok_data(top_info):
  """Convert raw SMS texts into fixed-length sequences of word ids.

  Args:
    top_info: iterable of message strings (e.g. a pandas Series or a list).

  Returns:
    A float32 numpy array with one row per message and MAXLEN columns,
    produced by pad_sequences from the ids of the shared `sms_tokenizer`.
  """
  sequences = sms_tokenizer.texts_to_sequences(top_info)
  return pad_sequences(sequences, maxlen=MAXLEN).astype(np.float32)

train_data = tok_data(train_data['sms'])
test_data = tok_data(test_data['sms'])

#test the distribution on the tokenize data to consider the vocabulary size
# A separate, uncapped tokenizer is fitted on all messages only to count how
# often each word occurs; it is not used to vectorize the model inputs.
tok = Tokenizer()
tok.fit_on_texts(combine_data['sms'])
word_index = tok.word_index
word_counts = tok.word_counts
word_df = pd.DataFrame(list(word_counts.items()), columns = ['word', 'count'])
# sort_values returns a new frame; the result has to be assigned, otherwise
# the statement has no effect.
word_df = word_df.sort_values(by = 'count', ascending = False)

import seaborn as sns
plt.figure(figsize = (5,5))
sns.histplot(word_df['count'], bins=50, kde =True)
plt.xlabel('frequency')
plt.ylabel('count')

In [None]:
# now we can create the model
# Create Sequential model
#model design
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, Bidirectional, LSTM,GRU
model = tf.keras.Sequential([
    # Explicit input spec: sequences of MAXLEN word ids. This replaces the
    # Embedding `input_length` argument, which is deprecated in Keras 3, and
    # still lets model.summary() report output shapes before training.
    tf.keras.Input(shape=(MAXLEN,)),
    # Learn a 64-dimensional vector for each of the VOCAB_SIZE word ids
    Embedding(VOCAB_SIZE, 64),
    # First recurrent layer returns the full sequence so it can be stacked
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(16, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: output is compared against 0 (ham) / 1 (spam)
    Dense(1, activation='sigmoid')
    ])

#model compile
model.compile(
    loss = 'binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0001),
    metrics = ['accuracy']
    )
model.summary()

In [None]:
# Early stopping watches val_loss with a patience of 6 epochs and restores the
# best weights seen; `history` keeps the per-epoch metrics returned by fit().
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=6,
    restore_best_weights=True,
)
fit_options = dict(
    epochs=10,
    validation_data=(test_data, test_label),
    callbacks=[early_stop],
)
history = model.fit(train_data, train_label, **fit_options)


In [None]:
import seaborn as sns

# Score the trained model on the held-out split and report both numbers
loss, accuracy = model.evaluate(test_data, test_label)
print(f'Loss:{loss}')
print(f'Accuracy:{accuracy}')

# Training vs. validation accuracy per epoch, drawn on the same axes
epochs_run = history.epoch
metrics_by_epoch = history.history
sns.lineplot(x=epochs_run, y=metrics_by_epoch['accuracy'], label='ACC')
sns.lineplot(x=epochs_run, y=metrics_by_epoch['val_accuracy'], label='VAL_ACC')

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify one SMS message as 'ham' or 'spam'.

  Args:
    pred_text: the message as a string, or a list of strings. When a list is
      given only its first message is scored.

  Returns:
    A two-element list [probability, label]: `probability` is the model's
    sigmoid output as a plain Python float and `label` is 'spam' when that
    value is above 0.5, otherwise 'ham'.
  """
  if isinstance(pred_text, str):
    pred_text = [pred_text]
  prep_text = tok_data(pred_text)
  # predict() yields one row per message with a single output column; unwrap
  # it to a scalar so callers get a float instead of a 1-element array.
  probability = float(model.predict(prep_text)[0][0])
  prediction = 'spam' if probability > 0.5 else 'ham'
  return [probability, prediction]

pred_text = "how are you doing today?"

# Hand-written sample messages and the label expected for each of them
label = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
test_messages = [
    "how are you doing today",
    "sale today! to stop texts call 98912460324",
    "i dont want to go. can we try it a different day? available sat",
    "our new mobile video service is live. just install on your phone to start watching.",
    "you have won £1000 cash! call to claim your prize.",
    "i'll bring it tomorrow. don't forget the milk.",
    "wow, is your arm alright. that happened to me one time too",
]
final = pd.DataFrame({'sms': test_messages, 'label': label})

# Print each prediction next to the expected label for a quick visual check
for sms_text, expected in zip(final['sms'], final['label']):
  pred = predict_message(sms_text)
  print(pred, expected)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Each message is passed to predict_message and element [1] of the result is
  compared with the expected 'ham'/'spam' answer. A success message is printed
  only if every prediction matches; otherwise a "keep trying" message is
  printed. Nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label (index 1) is checked; a single mismatch fails the run
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
