<a href="https://colab.research.google.com/github/RaoSharjeelKhan/Machine-Learning/blob/main/LSTM_SMS_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# header=0 treated the first line of each file as column labels, which drops
# one message per split when the file has no header row. Read every line as
# data and name the two columns explicitly instead.
# NOTE(review): confirm against the raw TSV files that line 1 is a message,
# not a header.
column_names = ['target', 'text']
df_train = pd.read_csv(train_file_path, sep='\t', header=None, names=column_names)
df_test = pd.read_csv(test_file_path, sep='\t', header=None, names=column_names)
df_train.head()

In [None]:
# Per-column count of missing values for the training and validation frames.
df_train.isna().sum(), df_test.isna().sum()

In [None]:
# Give both splits the same two column labels: class label and message text.
for frame in (df_train, df_test):
    frame.columns = ['target', 'text']
df_train.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample

In [None]:
# Bar chart of the number of rows per `target` value in the training frame.
sns.countplot(data=df_train,x='target')

In [None]:
# The dataset is imbalanced. There is not enough data to undersample the
# majority class, so the minority class is upsampled instead.

# Split the training frame by label.
df_ham = df_train.loc[df_train['target'] == 'ham']
df_spam = df_train.loc[df_train['target'] == 'spam']

# Draw spam rows with replacement until there are as many as ham rows;
# the fixed random_state keeps the draw repeatable.
df_upsampled = resample(
    df_spam,
    replace=True,
    n_samples=df_ham.shape[0],
    random_state=27,
)

# Stack the untouched ham rows and the resampled spam rows into one frame.
df_new = pd.concat([df_ham, df_upsampled])

In [None]:
# Bar chart of the number of rows per `target` value after upsampling.
sns.countplot(data=df_new,x='target')

In [None]:
# Remove every run of digits from the message text.
# regex=True is required: from pandas 2.0 the default is a literal replace,
# which would look for the characters "\d+" and leave the text unchanged.
# The raw string avoids the invalid "\d" escape warning.
df_new['text_1'] = df_new['text'].str.replace(r'\d+', '', regex=True)
df_test['text_1'] = df_test['text'].str.replace(r'\d+', '', regex=True)
df_new.head(14)

In [None]:
# Shared NLTK helpers: split on whitespace, then lemmatize with WordNet.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Return the whitespace-separated tokens of `text`, each lemmatized as a verb.

    Args:
        text: the string to split and lemmatize.

    Returns:
        A list with one lemmatized token per whitespace-delimited word.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        # 'v' asks WordNet for the verb lemma.
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return lemmas
# Store the lemmatized token list of every digit-stripped message in `text_2`.
df_new['text_2'] = df_new['text_1'].apply(lemmatize_text)
df_test['text_2'] = df_test['text_1'].apply(lemmatize_text)
df_new.head(10)

In [None]:
# Encode the class labels as integers: ham -> 0, spam -> 1.
# One mapping replaces both labels in a single pass, and the explicit cast
# guarantees an integer column instead of relying on pandas silently
# downcasting an object column after replace (deprecated behaviour).
# Re-running the cell is harmless: no string labels are left to replace.
label_to_id = {'ham': 0, 'spam': 1}
df_new['target'] = df_new['target'].replace(label_to_id).astype('int64')
df_test['target'] = df_test['target'].replace(label_to_id).astype('int64')

In [None]:
# Model inputs are the lemmatized token lists; labels are the encoded targets.
X_train, y_train = df_new['text_2'], df_new['target']
X_test, y_test = df_test['text_2'], df_test['target']

Great! Now we can proceed to the next step.

In [None]:
# Word-level tokenizer (char_level=False) with num_words=1000, lower-casing
# and an explicit out-of-vocabulary token.
# NOTE(review): X_train holds token lists, and Keras' Tokenizer appears to
# bypass `filters`/`split` for list inputs - confirm this is intended.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=1000,
    oov_token='',
    lower=True,
    char_level=False,
    split=' ',
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)

# Build the vocabulary from the training messages only.
tokenizer.fit_on_texts(X_train)

# Display the learned token -> integer id mapping.
word_index = tokenizer.word_index
word_index

In [None]:
# Sanity check: encode one sample sentence with the fitted tokenizer and show the ids.
texts="how are you doing today"
tokenizer.texts_to_sequences([texts])

I think taking 40 as max length will be okay

In [None]:
max_len = 40
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Shared padding settings: truncate and pad at the end, to a length of max_len.
pad_options = dict(truncating='post', padding='post', maxlen=max_len)

# Convert each message to integer ids, then pad/truncate to a fixed length.
train_sequences = tokenizer.texts_to_sequences(X_train.values)
test_sequences = tokenizer.texts_to_sequences(X_test.values)
train_sequence_padded = pad_sequences(train_sequences, **pad_options)
test_sequence_padded = pad_sequences(test_sequences, **pad_options)

# Show one padded validation row as a spot check.
test_sequence_padded[5]

In [None]:
# Seed TensorFlow's global RNG so weight initialisation is repeatable across runs.
tf.random.set_seed(27)

# Embedding -> two stacked bidirectional LSTMs -> dense head with one sigmoid unit.
# The embedding's vocabulary size and input length are taken from the tokenizer
# and max_len rather than repeated as literals, so they cannot drift apart from
# the preprocessing settings.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=tokenizer.num_words,
        output_dim=64,
        input_length=max_len,
    ),
    # return_sequences=True so the second LSTM receives the full sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=tf.losses.BinaryCrossentropy(),
    metrics=[tf.metrics.BinaryAccuracy(name='accuracy')],
)
model.summary()



In [None]:
# Train for 10 epochs in batches of 32, evaluating on the validation split
# after every epoch.
history = model.fit(
    train_sequence_padded,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(test_sequence_padded, y_test),
)

In [None]:
# function to predict messages based on model
# (returns a list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify a single SMS message as ham or spam.

  Args:
    pred_text: the raw message text (str).

  Returns:
    A two-element list ``[probability, label]``: ``probability`` is the model's
    sigmoid output as a Python float, and ``label`` is ``'spam'`` when that
    value is greater than 0.5, otherwise ``'ham'``.
  """
  import re  # local import: only this function needs it

  # Mirror the training-time preprocessing (digits stripped, then whitespace
  # tokenised and lemmatized) so the tokenizer sees the same kind of tokens
  # it was fitted on. A token list is passed, exactly as during training.
  cleaned = re.sub(r'\d+', '', pred_text)
  tokens = lemmatize_text(cleaned)
  sequences = tokenizer.texts_to_sequences([tokens])
  sequence_padded = pad_sequences(sequences, truncating='post', padding='post', maxlen=max_len)

  # One message in, one sigmoid unit out: take the single scalar from the result.
  probability = float(model.predict(sequence_padded, verbose=0)[0][0])
  label = "spam" if probability > 0.5 else "ham"

  # Return both values; returning only the label made `prediction[1]` index
  # into the string (a single character) instead of selecting the label.
  return [probability, label]

# Try the classifier on a short everyday message and print what it returns.
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Second spot check, using one of the messages from the challenge's test list.
pred_text = "wow, is your arm alright. that happened to me one time too"

prediction = predict_message(pred_text)
print(prediction)

I am getting all the predictions right, but I think there is something wrong with the testing function. You can check for yourself by predicting the strings given in the function using my model. Thank you.


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Run predict_message on seven fixed messages and print a pass/fail line.

  Each prediction is indexed with [1] and compared with the expected label,
  so predict_message must return a sequence whose second element is the
  label string ('ham' or 'spam').
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Element 1 of the prediction is treated as the label.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
