# SMS Text Classification

## Importing the necessary packages

In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt


## Getting dataset

In [None]:
# get data files
# Download the two tab-separated splits into the current working directory
# (IPython `!` shell escapes; requires `wget` and network access).
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local paths of the downloaded files. Note that the "test" path points at the
# validation file: this notebook uses that single held-out split both as
# validation data during training and for the final evaluation.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

## Loading Dataset into dataframe

In [3]:
# The files carry no header row of their own, so column names are supplied here.
col_names = ["label", "messages"]

# read_table is read_csv with a tab separator by default, which matches the
# .tsv format of both files.
train_dataset = pd.read_table(train_file_path, names=col_names)
test_dataset = pd.read_table(test_file_path, names=col_names)

## Preprocessing Dataset

Inspecting the data with the `head()` method shows two columns: "label" and "messages". The label indicates whether a message is spam or ham.

We will encode ham as '0' and spam as '1'.

In [4]:
# Integer encoding of the two classes: ham -> 0 (negative), spam -> 1 (positive).
label_dict = {
    "ham": 0,
    "spam": 1,
}

# Encode each split from its OWN 'label' column. Assigning a Series built from
# a different frame would be aligned by row index and silently attach the
# other split's labels, which invalidates every validation/evaluation metric.
train_dataset['msg_type'] = train_dataset['label'].map(label_dict)
test_dataset['msg_type'] = test_dataset['label'].map(label_dict)

In [6]:
# Inputs are the raw message strings; targets are the 0/1 encoded labels.
train_X, train_y = train_dataset['messages'], train_dataset['msg_type']
test_X, test_y = test_dataset['messages'], test_dataset['msg_type']

## Tokenization


In [22]:
# Arbitrary cap on the number of token ids. The same value is used as the
# Embedding layer's input dimension, so the two must stay in sync.
vocab_size = 700

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,   # only the most frequent words get their own id
    char_level=False,       # tokenize on words, not characters
    # Explicit out-of-vocabulary sentinel. An empty string also works as an
    # OOV token but leaves an invisible '' key in word_index; a named marker
    # keeps the vocabulary readable without changing the produced sequences.
    oov_token="<OOV>",
)

# Build the vocabulary from the training messages only, so nothing from the
# held-out split leaks into the token ids.
tokenizer.fit_on_texts(train_X)

In [None]:
# Mapping from token string to integer id learned by fit_on_texts.
# NOTE(review): per the Keras docs word_index is not truncated to num_words,
# so the count printed here can be larger than vocab_size — confirm.
word_index = tokenizer.word_index
print("Unique Words= ",len(word_index))

## Sequencing and Padding

In [24]:
# Shared padding settings: every message becomes exactly 50 token ids, with
# zeros appended to short messages and long messages cut off at the end.
_pad_options = {"maxlen": 50, "padding": "post", "truncating": "post"}

# Step 1: text -> lists of integer token ids (variable length).
train_X_seq = tokenizer.texts_to_sequences(train_X)
test_X_seq = tokenizer.texts_to_sequences(test_X)

# Step 2: variable-length lists -> fixed-width integer arrays.
train_padding = keras.preprocessing.sequence.pad_sequences(train_X_seq, **_pad_options)
test_padding = keras.preprocessing.sequence.pad_sequences(test_X_seq, **_pad_options)

In [None]:
# Sanity check: expect (number of training messages, 50), since every
# sequence was padded/truncated to maxlen=50.
train_padding.shape

## Creating a model

In [41]:

# Small bag-of-embeddings classifier, assembled layer by layer.
model = tf.keras.Sequential()
# Look up a 50-dimensional vector for each of the 50 token ids in a message.
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=50))
# Average the token vectors into a single fixed-size message vector.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(units=24, activation='relu'))
# Single sigmoid unit: output is the predicted probability of class 1 (spam).
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


In [None]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

In [43]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as a human-readable metric alongside the loss.
model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs, reporting loss/accuracy on the held-out split after
# each epoch. The same split is passed to model.evaluate afterwards, so that
# final score is not an independent test result.
model.fit(
    train_padding,
    train_y,
    validation_data=(test_padding, test_y),
    epochs=20,
)

In [None]:
# Final score on the held-out split; with the compile settings used in this
# notebook the result is the loss followed by the accuracy.
model.evaluate(test_padding, test_y)

## Predict Function

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    """Classify a single SMS message as spam or ham.

    Relies on the notebook-level ``tokenizer`` and trained ``model``.

    Args:
        pred_text: The raw message text as one string.

    Returns:
        A two-element list ``[probability, label]``. ``probability`` is the
        model's sigmoid output as a built-in ``float``; ``label`` is ``'spam'``
        when the probability is strictly greater than 0.5, otherwise ``'ham'``.
    """
    # Encode the message exactly like the training data: same tokenizer and
    # the same fixed length of 50 with post-padding/post-truncation.
    encoded = tokenizer.texts_to_sequences([pred_text])
    padded = keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=50, padding="post", truncating="post"
    )

    # One message in, one sigmoid unit out: take the single scalar and convert
    # the NumPy float32 to a plain Python float so the returned list matches
    # the documented example and prints/serializes as an ordinary number.
    probability = float(model.predict(padded)[0][0])

    label = 'spam' if probability > 0.5 else 'ham'
    return [probability, label]

# Quick smoke test on an everyday message; prints the [probability, label]
# list returned by predict_message.
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

## Test from freeCodeCamp

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven hand-labelled messages.

  Only the label (element 1 of each prediction) is compared with the
  expected answer. Prints a success message when every label matches,
  otherwise a "keep trying" message. Returns nothing.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch fails the whole challenge; the loop still runs to the end.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
