In [42]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
data_path = '/Users/niloufar/Desktop/DeepLearning/tf_specialization/comment/'
data1 = 'spam_or_not1.xlsx'
data2 = 'spam_or_not2.xlsx'
data3 = 'spam_or_not3.xlsx'

df1 = pd.read_excel(data_path + data1)
df2 = pd.read_excel(data_path + data2)
df3 = pd.read_excel(data_path + data3)

df = pd.concat([df1, df2, df3], ignore_index=True)
df = df.drop(['ID', df.columns[3]], axis=1)
df.head(18)

Unnamed: 0,Text (comment),Spam or ham
0,یه مشت لات و لوت جمع کردید تو این اتاق فرار و ...,Spam
1,سناریو اصلا خوب نبود و برای ما نصفه تموم شد - ...,Spam
2,رفتار پرسنل مناسب نبود\n عدم اگاهی رسانی دقیق...,Spam
3,😡😡😡هشدار این یک کلاه برداری علنی است😡😡😡\nخونه ...,Spam
4,اتاق فرار خوبی بود اما نه به اندازه کامنت ها ق...,Spam
5,سلام\nمجوز برای اتاق فرار نداشتند\nبرخورد خوب ...,Spam
6,داستان بازی سر و ته مشخصی نداشت،بیشتر فضا سازی...,Spam
7,در خصوص بازی های اتاق فرار با احترام به کسانی ...,Spam
8,نه مجوز\nنه کوچه مطمئن \nنه محله درست\nنه جای ...,Spam
9,افتضاح خواهشن به كامنت ها توجه نكنيد\nنميدونم ...,Spam


In [9]:
def remove_stopwords(sentence):
    stopwords = ['که', 'در', 'از', 'به', 'و', 'را', 'این', 'آن', 'بعد', 'همه', 'دوباره', 'یک', 'یه', 'من', 'تو', 'او', 'ما', 'شما', 'قبل', 'آنها', 'زیرا', 'زیر', 'اما', 'بین', 'دو', 'با', 'اونجا',
                'برای', 'حتما','حالی', 'چرا', 'چی', 'ازطریق', 'رو', ',', 'واقعا','ها', 'تو', 'اون', 'ترین', 'توی', 'چه', 'مارو', 'سر', 'اونجا', 'خود', 'هارو', 'آقا', 'همتون', 'هام', 'دوتا', 'دوباره',
                'اگه', 'ولی', 'روش', 'اینو', 'هنوز', 'ده', 'سه', 'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده', 'نا','ایی']
    words = sentence.split()
    results_words = [word for word in words if word not in stopwords]
    sentence = ' '.join(results_words)
    return sentence

In [5]:
remove_stopwords("پدر سگ را به درخت بست.")

'پدر سگ درخت بست.'

### Reading the raw data

In [17]:
def parse_data_from_file(file):
    sentences = []
    labels = []
    for i,j in zip(df[df.columns[0]], df[df.columns[1]]):
        sentences.append(remove_stopwords(i))
        labels.append(j)
    return sentences, labels
        

In [18]:
sentences, labels = parse_data_from_file(df)

In [19]:
print(f"There are {len(sentences)} sentences in the dataset.\n")
print(f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 5 labels are {labels[:5]}")

There are 6235 sentences in the dataset.

First sentence has 83 words (after removing stopwords).

There are 6235 labels in the dataset.

The first 5 labels are ['Spam', 'Spam', 'Spam', 'Spam', 'Spam']


### Using the Tokenizer

In [55]:
NUM_WORDS = 13861
EMBEDDING_DIM = 16
MAXLEN = 608
PADDING = 'post'
OOV_TOKEN = '<OOV>'
TRAINING_SPLIT = .8

In [21]:
labels.count('ham')

5519

In [22]:
def fit_tokenizer(sentences):
    tokenizer = Tokenizer(oov_token='<OOV>')
    tokenizer.fit_on_texts(sentences)
    return tokenizer

In [28]:
tokenizer = fit_tokenizer(sentences)
print(f"Vocabulary contains {len(tokenizer.word_index)} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in tokenizer.word_index else "<OOV> token NOT included in vocabulary")

Vocabulary contains 13861 words

<OOV> token included in vocabulary


In [29]:
def get_padded_sequences(tokenizer, sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, padding='post')
    return padded_sequences

In [30]:
padded_sequences = get_padded_sequences(tokenizer, sentences)
print(f"First padded sequence looks like this: \n\n{padded_sequences[0]}\n")
print(f"Numpy array of all sequences has shape: {padded_sequences.shape}\n")
print(f"This means there are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}")

First padded sequence looks like this: 

[3148 5654 5655  554  794    7   12   91 1463  161  236    3  693  324
    3  108 1292 3954 5656  738   45  657 3955 1690 1464 5657  132 1111
 3956 3957  202 5658  458  402 5659 5660  764   21 5661 5662    6 1377
   63 1690 1464  554  238  390 5663 1293  980   96 3149 2039 1842  307
 5664   10   77   28  295 5665  284 3958 5666  158    7   12 5667 1378
  493 1112  981 5668  473  412 5669   25 5670 2301 3959    7   12 2040
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0 

In [33]:
def tokenize_labels(labels):
    label_tokenizer = Tokenizer()
    label_tokenizer.fit_on_texts(labels)
    label_word_index = label_tokenizer.word_index
    label_sequences = label_tokenizer.texts_to_sequences(labels)
    return label_sequences, label_word_index

In [34]:
label_sequences, label_word_index = tokenize_labels(labels)
print(f"Vocabulary of labels looks like this {label_word_index}\n")
print(f"First ten sequences {label_sequences[:10]}\n")

Vocabulary of labels looks like this {'ham': 1, 'spam': 2}

First ten sequences [[2], [2], [2], [2], [2], [2], [2], [2], [2], [2]]



In [43]:
label_seq_np = np.array(label_sequences) - 1
label_seq_np

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

### Training - Validation Split

In [47]:
def train_val_split(sentences, label_seq_np, training_split):
    train_size = int(len(sentences)*training_split)
    train_sentences = sentences[:train_size]
    train_labels = labels[:train_size]
    
    validation_sentences = sentences[train_size:]
    validation_labels = labels[train_size:]
    return train_sentences, validation_sentences, train_labels, validation_labels

In [48]:
train_sentences, val_sentences, train_labels, val_labels = train_val_split(sentences, labels, TRAINING_SPLIT)

print(f"There are {len(train_sentences)} sentences for training.\n")
print(f"There are {len(train_labels)} labels for training.\n")
print(f"There are {len(val_sentences)} sentences for validation.\n")
print(f"There are {len(val_labels)} labels for validation.")


There are 4988 sentences for training.

There are 4988 labels for training.

There are 1247 sentences for validation.

There are 1247 labels for validation.


In [58]:
def create_model(num_words, embedding_dim, maxlen):
    
    tf.random.set_seed(123)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(num_words, embedding_dim, input_length=maxlen),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [59]:
model = create_model(NUM_WORDS, EMBEDDING_DIM, MAXLEN)

Metal device set to: Apple M1


2023-05-03 11:36:28.917687: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-05-03 11:36:28.918288: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 608, 16)           221776    
                                                                 
 flatten (Flatten)           (None, 9728)              0         
                                                                 
 dense (Dense)               (None, 6)                 58374     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 280,157
Trainable params: 280,157
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(train_padded_seq, train_label_seq, epochs=30, validation_data=(val_padded_seq, val_label_seq))
