## Load dataset

In [1]:
import pandas as pd
import os
import pathlib

In [2]:
# When True, paths are anchored one level above the current working directory
# instead of the working directory itself.
USE_PROJECT_ROOT = True
BASE_DIR = pathlib.Path().resolve()
BASE_DIR = BASE_DIR.parent if USE_PROJECT_ROOT else BASE_DIR

# Directory layout: <BASE_DIR>/datasets/exports/<files>
DATASET_DIR = BASE_DIR.joinpath("datasets")
EXPORT_DIR = DATASET_DIR.joinpath("exports")
DATASET_CSV_PATH = EXPORT_DIR.joinpath('spam-dataset.csv')
TRAINING_DATA_PATH = EXPORT_DIR.joinpath('spam-training-data.pkl')
print("BASE_DIR is", BASE_DIR)

BASE_DIR is /home/cyrilng/ai-api


In [4]:
# Display the resolved CSV path so the expected dataset location can be checked by eye.
DATASET_CSV_PATH

PosixPath('/home/cyrilng/ai-api/datasets/exports/spam-dataset.csv')

In [6]:
RUN_DATASET_PREPARE = False
if RUN_DATASET_PREPARE:
    # if active, this will download and prepare the dataset.
    SOURCE_NB = pathlib.Path('1 - Prepare the AI Spam Classifier Dataset.ipynb')
    if SOURCE_NB.exists():
        %run './{SOURCE_NB}'
    else:
        print("Prepare the AI Spam Classifier Dataset.ipynb does not exist.")

In [7]:
# Fail fast when the exported CSV is missing, before any loading is attempted.
# FileNotFoundError is the specific built-in for a missing file and is still an
# Exception subclass, so any broad `except Exception` handling keeps working.
if not DATASET_CSV_PATH.exists():
    raise FileNotFoundError(f"You must download or create the spam-dataset.csv \n{DATASET_CSV_PATH} not found.")

In [8]:
# pandas accepts path-like objects, so the Path is passed without converting to str.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV was
# exported together with its index; consider index_col=0 -- confirm with the export step.
df = pd.read_csv(DATASET_CSV_PATH)
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",uci-spam-sms
1,ham,Ok lar... Joking wif u oni...,uci-spam-sms
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,uci-spam-sms
3,ham,U dun say so early hor... U c already then say...,uci-spam-sms
4,ham,"Nah I don't think he goes to usf, he lives aro...",uci-spam-sms


## Convert dataset to lists

In [9]:
# Pull the message bodies and their string labels out of the frame as plain lists,
# keeping row order so the two lists stay aligned.
texts, labels = (df[column].tolist() for column in ("text", "label"))

In [10]:
# Maps each label name to its integer class id.
labels_legend = {'ham': 0, 'spam': 1}

# Reverse lookup: class id (as a string key) back to the label name.
labels_legend_inverted = {
    str(class_id): label_name
    for label_name, class_id in labels_legend.items()
}

In [11]:
# Translate each string label into its integer class id via labels_legend.
labels_as_int = [labels_legend[str(label)] for label in labels]

In [12]:
import random

# Spot-check one randomly chosen row: the extracted lists and the integer
# encoding must agree with the original DataFrame at the same position.
# randrange excludes its upper bound, so the index is always valid;
# randint(0, len(texts)) could return len(texts) and raise IndexError.
random_idx = random.randrange(len(texts))
print('Random Index', random_idx)

assert texts[random_idx] == df.iloc[random_idx].text
assert labels[random_idx] == df.iloc[random_idx].label
assert labels_legend_inverted[str(labels_as_int[random_idx])] == labels[random_idx]

Random Index 1154


## Tokenize texts

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

2023-02-21 11:26:26.955173: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-21 11:26:27.255412: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-21 11:26:27.255441: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-21 11:26:27.819848: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [14]:
# Value handed to the Keras Tokenizer as `num_words` when it is constructed.
MAX_NUM_WORDS = 280

In [15]:
# Build the tokenizer and fit its vocabulary on every message.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

# word_index is the tokenizer's word -> integer id mapping learned during fitting.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each message as a list of integer token ids.
sequences = tokenizer.texts_to_sequences(texts)

Found 9538 unique tokens.


In [16]:
# Every message must have exactly one token sequence and one integer label.
assert len(sequences) == len(texts)
assert len(texts) == len(labels_as_int)

## Create X, y training sets

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length that every token sequence is padded or truncated to.
MAX_SEQUENCE_LENGTH = 280

# 'pre' padding and truncation are the Keras defaults, spelled out here for clarity.
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)

In [18]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Convert the integer class ids into the categorical matrix produced by Keras' to_categorical.
y = to_categorical(np.array(labels_as_int))

## Split our training data

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Export our training data

In [20]:
import pickle


In [21]:
# Bundle everything needed downstream: the split arrays, the preprocessing
# limits, both label lookups, and the fitted tokenizer.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_sequence=MAX_SEQUENCE_LENGTH,
    legend=labels_legend,
    labels_legend_inverted=labels_legend_inverted,
    tokenizer=tokenizer,
)

In [22]:
# Serialize the bundle to the export path; the context manager closes the file.
with TRAINING_DATA_PATH.open('wb') as export_file:
    pickle.dump(training_data, export_file)

In [23]:
# Read the export back in to confirm the pickle round-trips.
# The file was written by this notebook; do not point this at untrusted pickles.
data = {}

with TRAINING_DATA_PATH.open('rb') as import_file:
    data = pickle.load(import_file)