# Setup

In [2]:
import os
# Quieten TensorFlow's native (C++) logging. This is set here, ahead of the
# `import tensorflow` in the next cell, so it is already in the environment
# when TensorFlow loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
# Record the TensorFlow version this notebook was run against.
tf.__version__

'2.2.0'

In [5]:
# Show which devices (CPU/GPU) TensorFlow can see in this environment.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]

In [6]:
# Load the item table; its 'Summary' column is the text corpus used below.
# NOTE(review): the path is relative to the notebook's working directory.
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")

In [7]:
# Series of free-text summaries, one per row of `items`.
data_text = items['Summary']

In [8]:
# Peek at the summary stored under index label 0 to see what the raw text looks like.
data_text[0]

'A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do the toys do when Andy is not with them, they come to life. Woody believes that his life (as a toy) is good. However, he must worry about Andy\'s family moving, and what Woody does not know is about Andy\'s birthday party. Woody does not realize that Andy\'s mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy\'s new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips. —John Wiggins'

# AutoEncoder
## Text Data

In [9]:
import re

from keras.optimizers import Adam
from keras.models import Model, Sequential

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import wordnet
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize as wt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import sparse_categorical_crossentropy
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, Bidirectional

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /home/sriram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sriram/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sriram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Module-level helpers read by Autoencoder_Text.clean_text:
#   lem - WordNet lemmatizer used to normalise tokens
#   sw  - English stop words, held in a set for fast membership tests
lem = wordnet.WordNetLemmatizer()
sw = set(stopwords.words("english")) 

In [11]:
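# Earlier functional-API draft of the encoder-decoder, kept commented out for
# reference only; nothing in this cell is executed.
# self.input_sequence = Input(shape = (self.max_len,))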
# self.embedding = Embedding(input_dim = self.vocab, output_dim=128,)(self.input_sequence)
# self.encoder = Bidirectional(LSTM(self.hidden, return_sequences=False))(self.embedding)
# self.r_vec = RepeatVector(self.max_len)(self.encoder)
# self.decoder = Bidirectional(LSTM(self.hidden, return_sequences=True, dropout=0.2))(self.r_vec)
# self.logits = TimeDistributed(Dense(self.vocab))(self.decoder)
# self.enc_dec_model = Model(inputs = self.input_sequence, outputs = Activation('softmax')(self.logits))
# enc_inp = Input(shape = (self.max_len,))

In [12]:
class Autoencoder_Text(Model):
    """LSTM encoder-decoder that learns to reconstruct tokenized text.

    The encoder (Embedding -> bidirectional LSTM) compresses a padded sequence
    of word ids into a single vector of size ``2 * hidden``; the decoder
    repeats that vector ``max_len`` times and predicts a distribution over the
    vocabulary at every time step.

    Relies on notebook-level names: ``lem`` (lemmatizer), ``sw`` (stop-word
    set) and ``wt`` (NLTK ``word_tokenize``).

    Parameters
    ----------
    input_ : int
        Initial padded sequence length; overwritten by the longest training
        sequence when ``pre_process(..., train=True)`` runs.
    hidden : int
        Number of units in each direction of the LSTMs.
    """

    def __init__(self, input_=300, hidden=200):
        super(Autoencoder_Text, self).__init__()
        self.max_len = input_
        self.hidden = hidden

    def clean_text(self, text):
        """Normalise one document and return it as a space-joined string.

        Lower-cases, strips @mentions and URLs, replaces every character that
        is not a letter, digit or one of ``. ! ? '`` with a space, tokenizes,
        drops stop words and lemmatizes the remaining tokens as verbs.
        """
        text = str(text).lower()
        text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
        text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
        # The range must be A-Z: "A-z" also spans [ \ ] ^ _ ` and would let
        # those characters through. Tabs are covered by this substitution too.
        text = re.sub(r"[^a-zA-Z.!?'0-9]", ' ', text)
        text = re.sub(r" +", ' ', text)
        tokens = wt(text)
        return " ".join(lem.lemmatize(tok, pos='v')
                        for tok in tokens if tok not in sw)

    def tokenize(self, sentences):
        """Fit a fresh Tokenizer on ``sentences``.

        Returns
        -------
        tuple
            ``(sequences, tokenizer)`` where ``sequences`` are the word-id
            lists for ``sentences`` and ``tokenizer`` is the fitted Tokenizer.
        """
        text_tokenizer = Tokenizer()
        text_tokenizer.fit_on_texts(sentences)
        return text_tokenizer.texts_to_sequences(sentences), text_tokenizer

    def pre_process(self, data_text, train=False):
        """Clean, tokenize and pad a pandas Series of documents.

        Parameters
        ----------
        data_text : pandas.Series
            Raw documents (anything ``str()`` accepts per element).
        train : bool
            When True, (re)fit the tokenizer on ``data_text`` and update
            ``self.vocab`` and ``self.max_len`` from it. When False, reuse the
            tokenizer fitted during training so word ids stay aligned with the
            trained embedding.

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(len(data_text), self.max_len, 1)``,
            post-padded with zeros.
        """
        cleaned = data_text.apply(self.clean_text)

        if train or getattr(self, 'text_tokenizer', None) is None:
            # No fitted vocabulary yet (or an explicit training pass): fit one.
            text_tokenized, self.text_tokenizer = self.tokenize(cleaned)
        else:
            # Inference: map words with the training vocabulary. Re-fitting
            # here would assign different ids to the same words.
            text_tokenized = self.text_tokenizer.texts_to_sequences(cleaned)

        if train:
            # +1 because Tokenizer ids start at 1; id 0 is the padding value.
            self.vocab = len(self.text_tokenizer.word_index) + 1
            self.max_len = max(len(seq) for seq in text_tokenized)

        pad_sentence = pad_sequences(text_tokenized, self.max_len, padding="post")
        # Trailing axis of size 1 gives the per-step target shape expected by
        # the sparse categorical loss.
        return pad_sentence.reshape(*pad_sentence.shape, 1)

    def model(self):
        """Build ``self.encoder``, ``self.decoder`` and ``self.enc_dec_model``.

        Must be called after ``pre_process(..., train=True)`` so that
        ``self.vocab`` and ``self.max_len`` are set.
        """
        self.encoder = Sequential([
            # NOTE(review): inputs are post-padded with 0 and no mask is used,
            # so the forward LSTM also consumes the padding steps - consider
            # mask_zero=True.
            Embedding(input_dim=self.vocab, output_dim=128),
            Bidirectional(LSTM(self.hidden, return_sequences=False))
        ])

        self.r_vec = RepeatVector(self.max_len)
        self.decoder = Sequential([
            Bidirectional(LSTM(self.hidden, return_sequences=True, dropout=0.2)),
            # Softmax is required: the model is compiled with
            # sparse_categorical_crossentropy using its default
            # from_logits=False, which expects per-step probabilities.
            TimeDistributed(Dense(self.vocab, activation='softmax'))
        ])
        self.enc_dec_model = Sequential([self.encoder, self.r_vec, self.decoder])

    def train(self, data_text, epochs=10, batch_size=20):
        """Fit the autoencoder on ``data_text`` and save it to disk.

        Parameters
        ----------
        data_text : pandas.Series
            Training documents.
        epochs : int
            Number of training epochs.
        batch_size : int
            Mini-batch size.

        Side effects: sets the tokenizer/vocab/max_len, builds the model,
        prints its summary and writes it to ``./pretrained/text_model``.
        """
        pad_sentence = self.pre_process(data_text, True)
        self.model()
        self.enc_dec_model.compile(loss=sparse_categorical_crossentropy,
                                   optimizer=Adam(1e-3),
                                   metrics=['accuracy'])

        self.enc_dec_model.summary()
        # Inputs are (n, max_len) word ids; targets are the same ids with a
        # trailing singleton axis.
        self.enc_dec_model.fit(np.squeeze(pad_sentence, axis=2),
                               pad_sentence, batch_size=batch_size, epochs=epochs)

        save_path = './pretrained/text_model'
        # Make sure the target directory exists before saving.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        self.enc_dec_model.save(save_path)

    def call(self, inputs):
        """Run raw documents through the trained encoder-decoder.

        ``inputs`` is a pandas Series of documents; returns the model output
        for the pre-processed batch, one vocabulary-sized vector per time
        step.
        """
        pad_sentences = self.pre_process(inputs, False)
        return self.enc_dec_model(tf.squeeze(pad_sentences, axis=2))

In [13]:
# Build the text autoencoder with its default sizes and train it for a single
# epoch; train() also prints the model summary and saves the fitted model
# under ./pretrained/.
AE = Autoencoder_Text()
AE.train(data_text, epochs = 1, batch_size = 30)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_1 (Sequential)    (None, 400)               2276416   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 240, 400)          0         
_________________________________________________________________
sequential_2 (Sequential)    (None, 240, 13672)        6444072   
Total params: 8,720,488
Trainable params: 8,720,488
Non-trainable params: 0
_________________________________________________________________




Epoch 1/1


In [14]:
# Turn the summaries back into padded word-id sequences (train defaults to
# False, so the vocab size and max_len learned during training are kept) and
# drop the trailing singleton axis to get a 2-D (n_items, max_len) input.
inputs = np.squeeze(AE.pre_process(data_text), axis = 2)
# Encoder only: one fixed-size vector per summary (2 * hidden = 400 values,
# per the model summary above).
output = AE.encoder.predict(inputs)
# NOTE(review): the leading columns are identical across rows in the recorded
# output - presumably an effect of zero post-padding without masking; verify.
print(output)

[[ 0.01623436  0.01716649 -0.00986839 ...  0.00332202 -0.00862998
  -0.00149595]
 [ 0.01623436  0.01716649 -0.00986839 ... -0.00404957 -0.00246434
   0.00259131]
 [ 0.01623436  0.01716649 -0.00986839 ... -0.00086781 -0.00186458
  -0.00246912]
 ...
 [ 0.01623436  0.01716649 -0.00986839 ... -0.00932547 -0.00054978
   0.01936425]
 [ 0.01623436  0.01716649 -0.00986839 ... -0.02240402 -0.015483
   0.01854579]
 [ 0.01623436  0.01716649 -0.00986839 ... -0.00850242  0.00094222
   0.00521836]]


## Meta-Data