In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Folder and file names of the three Excel workbooks with the labelled comments.
data_path = '/Users/niloufar/Desktop/DeepLearning/tf_specialization/comment/'
data1, data2, data3 = 'spam_or_not1.xlsx', 'spam_or_not2.xlsx', 'spam_or_not3.xlsx'

# Read every workbook into its own frame (df1, df2, df3).
df1, df2, df3 = (pd.read_excel(data_path + workbook) for workbook in (data1, data2, data3))

# Stack the three frames under a fresh 0..n-1 index, then discard the 'ID'
# column and whatever column sits at position 3 of the combined frame.
df = pd.concat([df1, df2, df3], ignore_index=True)
df = df.drop(columns=['ID', df.columns[3]])
df.head()

Unnamed: 0,Text (comment),Spam or ham
0,یه مشت لات و لوت جمع کردید تو این اتاق فرار و ...,Spam
1,سناریو اصلا خوب نبود و برای ما نصفه تموم شد - ...,Spam
2,رفتار پرسنل مناسب نبود\n عدم اگاهی رسانی دقیق...,Spam
3,😡😡😡هشدار این یک کلاه برداری علنی است😡😡😡\nخونه ...,Spam
4,اتاق فرار خوبی بود اما نه به اندازه کامنت ها ق...,Spam


In [2]:
def remove_stopwords(sentence, stopwords=None):
    """Return `sentence` with every stopword token removed.

    The sentence is split on whitespace, tokens that exactly match a stopword
    are dropped, and the remaining tokens are re-joined with single spaces.
    Matching is on whole tokens only, so a stopword with punctuation attached
    (e.g. 'و.') is kept.

    Args:
        sentence: The text to clean.
        stopwords: Optional iterable of tokens to remove. When omitted, the
            default Persian list ('را', 'به', 'و', 'از', 'که') is used.

    Returns:
        The cleaned sentence as a single string ('' for empty input).
    """
    if stopwords is None:
        stopwords = ('را', 'به', 'و', 'از', 'که')
    # A set gives constant-time membership checks for every token.
    stopword_set = frozenset(stopwords)
    return ' '.join(word for word in sentence.split() if word not in stopword_set)

In [3]:
# Sanity check: the stopwords 'را' and 'به' should be stripped from this sentence.
remove_stopwords("پدر سگ را به درخت بست.")

'پدر سگ درخت بست.'

### Reading the raw data

In [4]:
def parse_data_from_file(file):
    """Extract cleaned sentences and their labels from a DataFrame.

    Args:
        file: A pandas DataFrame whose first column holds the comment text and
            whose second column holds the label for that comment.

    Returns:
        A `(sentences, labels)` tuple of two equally long lists: each sentence
        is the text after `remove_stopwords`, each label is the raw value from
        the second column.
    """
    # Read from the frame that was passed in rather than the notebook-global
    # `df`, so the function works for any frame and does not depend on hidden
    # kernel state.
    texts = file.iloc[:, 0]
    raw_labels = file.iloc[:, 1]
    sentences = [remove_stopwords(text) for text in texts]
    labels = list(raw_labels)
    return sentences, labels

In [5]:
# Split the combined frame into stopword-free sentences and their labels.
sentences, labels = parse_data_from_file(df)

In [6]:
# Report corpus size, length of the first cleaned sentence, and a label preview.
# One print with a newline separator emits the same text as four separate prints.
print(
    f"There are {len(sentences)} sentences in the dataset.\n",
    f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n",
    f"There are {len(labels)} labels in the dataset.\n",
    f"The first 5 labels are {labels[:5]}",
    sep="\n",
)

There are 6305 sentences in the dataset.

First sentence has 104 words (after removing stopwords).

There are 6305 labels in the dataset.

The first 5 labels are ['Spam', 'Spam', 'Spam', 'Spam', 'Spam']


### Using the Tokenizer

In [7]:
def fit_tokenizer(sentences):
    """Create a Tokenizer and fit it on the given sentences.

    The tokenizer is configured with '<OOV>' as its out-of-vocabulary token.

    Args:
        sentences: Texts to fit the tokenizer on.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer(oov_token='<OOV>')
    fitted.fit_on_texts(sentences)
    return fitted

In [8]:
# Fit the tokenizer on the cleaned sentences and keep its word -> index mapping.
tokenizer = fit_tokenizer(sentences)
word_index = tokenizer.word_index

print(f"Vocabulary contains {len(word_index)} words\n")
# Confirm that the out-of-vocabulary token made it into the vocabulary.
if "<OOV>" in word_index:
    print("<OOV> token included in vocabulary")
else:
    print("<OOV> token NOT included in vocabulary")

Vocabulary contains 15232 words

<OOV> token included in vocabulary


In [9]:
def get_padded_sequences(tokenizer, sentences):
    """Turn sentences into integer sequences padded at the end.

    Args:
        tokenizer: A fitted tokenizer exposing `texts_to_sequences`.
        sentences: Texts to convert.

    Returns:
        The result of `pad_sequences` with `padding='post'` applied to the
        tokenized sentences.
    """
    return pad_sequences(tokenizer.texts_to_sequences(sentences), padding='post')

In [10]:
# Convert the whole corpus to padded sequences, then show one example and the shape.
padded_sequences = get_padded_sequences(tokenizer, sentences)
print(
    f"First padded sequence looks like this: \n\n{padded_sequences[0]}\n",
    f"Numpy array of all sequences has shape: {padded_sequences.shape}\n",
    f"This means there are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}",
    sep="\n",
)

First padded sequence looks like this: 

[  28 3383 6060 6061  573  848   36    9    8   20  110 1568  189  272
    3  688   10  344    3  865  131   16 1274 4251 6062  776   56  689
   16 4252 1802 1569 6063  147 1211 4253 4254  221 6064  508  122  443
 6065  339 6066  795   32 6067 6068    7 1484   78 1802 1569  573  273
   38  429 6069    9 1275 1007  113 3384  122 1803 1804  307 6070   28
   18   92   41   38  308 6071  322 4255 6072  182    9    8   20   12
  204 6073  204 1396  522 1043 1070 6074  509  452 6075   39   10 6076
 2451 4256    9    8   20   12 2182    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0 

In [11]:
def tokenize_labels(labels):
    """Encode the labels as integer sequences with a dedicated Tokenizer.

    Args:
        labels: The label strings to encode.

    Returns:
        A `(label_sequences, label_word_index)` tuple: the output of
        `texts_to_sequences` for the labels, and the tokenizer's `word_index`
        mapping.
    """
    encoder = Tokenizer()
    encoder.fit_on_texts(labels)
    encoded = encoder.texts_to_sequences(labels)
    return encoded, encoder.word_index

In [12]:
# Encode the labels and show the resulting vocabulary and the first few sequences.
label_sequences, label_word_index = tokenize_labels(labels)
print(
    f"Vocabulary of labels looks like this {label_word_index}\n",
    f"First ten sequences {label_sequences[:10]}\n",
    sep="\n",
)

Vocabulary of labels looks like this {'ham': 1, 'spam': 2}

First ten sequences [[2], [2], [2], [2], [2], [2], [2], [2], [2], [2]]



In [None]:
# pip install --upgrade pip

In [None]:
# pip install pandas

In [None]:
# openpyxl is the engine pandas uses to read .xlsx workbooks; run this once,
# before the data-loading cell. The explicit %pip magic installs into the
# environment of the running kernel instead of relying on IPython automagic.
%pip install openpyxl