# RNNs: How To Implement A Basic RNN

### Read In, Clean, And Split The Data

In [1]:
# Read in data and split into training and test set
# NOTE: we are NOT cleaning the data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 1000)

messages = pd.read_csv('C:\\Users\\Yauheni_Leaniuk\\Documents\\Python\\Data_Engineer\\Advanced NLP Python for ML\\data\\spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
labels = np.where(messages['label'] == 'spam', 1, 0)

X_train, X_test, y_train, y_test = train_test_split(messages['text'],
                                                    labels, test_size=0.2)

### Prep Data For Modeling

In [2]:
# Install keras
!pip install -U keras

Collecting keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Collecting h5py
  Downloading h5py-3.2.1-cp39-cp39-win_amd64.whl (2.8 MB)
Collecting pyyaml
  Downloading PyYAML-5.4.1-cp39-cp39-win_amd64.whl (213 kB)
Installing collected packages: pyyaml, h5py, keras
Successfully installed h5py-3.2.1 keras-2.4.3 pyyaml-5.4.1
You should consider upgrading via the 'c:\users\yauheni_leaniuk\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [5]:
!pip install -U tensorflow

Collecting tensorflow
  Using cached tensorflow-2.5.0-cp39-cp39-win_amd64.whl (422.6 MB)
Collecting astunparse~=1.6.3
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting keras-nightly~=2.5.0.dev
  Using cached keras_nightly-2.5.0.dev2021032900-py2.py3-none-any.whl (1.2 MB)
Collecting termcolor~=1.1.0
  Using cached termcolor-1.1.0.tar.gz (3.9 kB)
Collecting opt-einsum~=3.3.0
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting tensorflow-estimator<2.6.0,>=2.5.0rc0
  Using cached tensorflow_estimator-2.5.0-py2.py3-none-any.whl (462 kB)
Collecting absl-py~=0.10
  Using cached absl_py-0.12.0-py3-none-any.whl (129 kB)
Collecting gast==0.4.0
  Using cached gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting grpcio~=1.34.0
  Using cached grpcio-1.34.1-cp39-cp39-win_amd64.whl (2.9 MB)
Collecting google-pasta~=0.2
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting keras-preprocessing~=1.1.2
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-

In [6]:
# Import the tools we will need from keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [7]:
# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [8]:
# Use that tokenizer to transform the text messages in the training and test sets
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [9]:
# What do these sequences look like?
X_train_seq[0]

[115,
 17,
 96,
 266,
 243,
 1659,
 18,
 1435,
 2561,
 2562,
 2563,
 512,
 1977,
 2564,
 72,
 3,
 16,
 202,
 4,
 292,
 412,
 175,
 27,
 663,
 164]

In [10]:
# Pad the sequences so each sequence is the same length
X_train_seq_padded = pad_sequences(X_train_seq, 50)
X_train_seq_padded = pad_sequences(X_test_seq, 50)

In [11]:
# What do these padded sequences look like?
X_train_seq_padded

array([[   0,    0,    0, ...,   20,    9,  276],
       [ 325,    8, 1926, ..., 1926,  744,  503],
       [   0,    0,    0, ...,   10,    9,  985],
       ...,
       [   0,    0,    0, ...,    0,  289,  546],
       [   0,    0,    0, ...,   15,   11,  446],
       [   0,    0,    0, ...,   50,  255,  225]])

### Build Model

In [12]:
# Import the tools needed from keras and define functions to calculate recall and precision
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

In [23]:
# Construct a simple RNN model
model = Sequential()

model.add(Embedding(len(tokenizer.index_word) + 1, 32))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 32)          253440    
_________________________________________________________________
dense_2 (Dense)              (None, None, 32)          1056      
_________________________________________________________________
dense_3 (Dense)              (None, None, 1)           33        
Total params: 254,529
Trainable params: 254,529
Non-trainable params: 0
_________________________________________________________________


In [None]:
!pip install nump

In [None]:
# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', precision_m, recall_m])

In [None]:
# Fit the RNN model
history = model.fit(X_train_seq_padded, y_train, 
                    batch_size=32, epochs=10,
                    validation_data=(X_test_seq_padded, y_test))