In [1]:
import pathlib
import random
import pandas as pd

In [38]:
# Project layout: every path is derived from the parent of the current
# working directory, so the notebook must be started from its own folder.
BASE_DIR = pathlib.Path('.').resolve().parent
DATASET_DIR = BASE_DIR.joinpath('dataset')
EXPORT_DIR = DATASET_DIR.joinpath('exports')

# Input: the combined spam/ham dataset.
SPAM_DATASET_PATH = EXPORT_DIR.joinpath('spam-dataset.csv')
# NOTE: this is not the last expression of the cell, so the boolean result
# is neither displayed nor checked.
SPAM_DATASET_PATH.exists()

# Outputs written by the export step at the end of the notebook.
METADATA_EXPORT_PATH = EXPORT_DIR.joinpath('spam-metadata.pkl')
TOKENIZER_EXPORT_PATH = EXPORT_DIR.joinpath('spam-tokenizer.json')


In [14]:
# Load the exported spam dataset and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably an index
# that was saved with the CSV -- confirm whether it should be dropped on load.
df = pd.read_csv(filepath_or_buffer=SPAM_DATASET_PATH)
df.head(5)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms_spam
1,ham,Ok lar... Joking wif u oni...,sms_spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms_spam
3,ham,U dun say so early hor... U c already then say...,sms_spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms_spam


In [17]:
# Pull the two columns used downstream out of the frame as plain Python lists;
# both lists keep the row order of `df`.
labels = df['label'].tolist()
texts = df['text'].tolist()

In [20]:
# Spot-check one record: display the label and message text at position 120.
(labels[120], texts[120])

('spam',
 'PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires')

In [21]:
# Class name -> integer id used for the model targets.
label_legend = dict(spam=1, ham=0)
# Reverse lookup keyed by the *string* form of the id (e.g. '1' -> 'spam').
label_legend_inverted = {str(class_id): name for name, class_id in label_legend.items()}
label_legend_inverted

{'1': 'spam', '0': 'ham'}

In [22]:
# Encode every textual label as its integer id; an unknown label raises KeyError.
labels_as_int = [label_legend[label_name] for label_name in labels]
# Preview the first ten encoded labels.
labels_as_int[:10]

[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]

In [29]:
# Sanity check: pick one random row and confirm that the extracted lists and
# the integer encoding still line up with the source DataFrame.
#
# random.randint(a, b) includes b, so randint(0, len(labels)) could return
# len(labels) and raise IndexError below. randrange(n) draws from 0..n-1.
random_idx = random.randrange(len(labels))
# The integer label must decode back to the label stored in the frame.
assert label_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label
# The list copies must match the frame row at the same position.
assert texts[random_idx] == df.iloc[random_idx].text
assert labels[random_idx] == df.iloc[random_idx].label

In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import pickle

In [37]:
# Tokenizer vocabulary limit and the fixed length every sequence is padded to.
MAX_NUM_WORDS = 280
MAX_SEQ_LENGTH = 300

# Fit the tokenizer on the whole corpus, then encode each message as a list
# of integer word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Word -> id mapping learned during fitting.
word_index = tokenizer.word_index

labels_as_int_array = np.asarray(labels_as_int)

# Model inputs: sequences padded to MAX_SEQ_LENGTH.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
# Model targets: one-hot encoding of the integer labels.
Y = to_categorical(labels_as_int_array)


In [42]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=42,
)

# Everything needed to rebuild the training setup: the split arrays plus the
# preprocessing parameters and label mappings.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq_length=MAX_SEQ_LENGTH,
    label_legend=label_legend,
    label_legend_inverted=label_legend_inverted,
)

# Persist the fitted tokenizer as JSON so the same vocabulary can be reused.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

# Persist the arrays and metadata as a single pickle file.
with METADATA_EXPORT_PATH.open('wb') as f:
    pickle.dump(training_data, f)

