## Recurrent Neural Networks

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### N gram vectors of text sequence

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# bag of words model

# text preprocessing: 
# 1. making sentence lowercase, removing all punctuation
# 2. removing stopwords like [the, is, and, in ...]
# 3. reducing vocabulary size helps reduce complexity

vectorizer = CountVectorizer(lowercase=True, stop_words='english')
texts = ['cat sat on the mat', 'boy ran on the ramp', 'appple orange orange', 'car turned around the corner']
vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(texts)
print(vector.toarray())

[[0 0 0 0 1 0 1 1 0 0 0 1 1 0]
 [0 0 1 0 0 0 0 1 0 1 1 0 1 0]
 [1 0 0 0 0 0 0 0 2 0 0 0 0 0]
 [0 1 0 1 0 1 0 0 0 0 0 0 1 1]]


In [16]:
# n-gram model 

vectorizer = CountVectorizer(ngram_range=(3, 3))
text = ["The cat chased the dog over the fence"]
bigrams = vectorizer.fit_transform(text)
print(bigrams.toarray())

[[1 1 1 1 1 1]]


### One Hot Encoder

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [40]:
# example of using OneHotEncoder for categorical encoding

sentences = [
    'The cat sat on the mat.',
    'The dog chased the cat.',
    'The mat was soft and fluffy.'
]

unique_words = set()
for sentence in sentences:
    for word in sentence.split():
        unique_words.add(word.lower().replace('.', ''))

unique_words = np.array(np.array(list(unique_words)).reshape(-1, 1))

print(unique_words)


encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(unique_words)
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(), index=unique_words.flatten())
one_hot_df

[['sat']
 ['the']
 ['cat']
 ['soft']
 ['fluffy']
 ['and']
 ['was']
 ['dog']
 ['mat']
 ['chased']
 ['on']]


Unnamed: 0,x0_and,x0_cat,x0_chased,x0_dog,x0_fluffy,x0_mat,x0_on,x0_sat,x0_soft,x0_the,x0_was
sat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
the,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
cat,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
soft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
fluffy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
and,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
was,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
dog,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mat,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
chased,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer 

In [44]:
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
one_hot_results

Found 9 unique tokens.


array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], shape=(2, 1000))

### Word Embeddings

In [2]:
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
import os
imdb_dir = './data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

                

In [None]:
texts[29]

NameError: name 'texts' is not defined

In [None]:
# imdb movie review sentiment prediction


maxlen = 100
training_samples = 200
validation_samples = 10000
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [70]:
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens')

Found 88582 unique tokens
