## Imports:

In [13]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tensorflow_hub as hub
from tensorflow.data import Dataset as tfd
import tensorflow as tf

In [14]:
# Both files share the same header-less, " ::: "-delimited layout, so the
# parsing options are declared once and reused for each split.
COLUMNS = ["id", "movie", "genre", "summary"]
read_opts = dict(sep=" ::: ", engine="python", names=COLUMNS)

train_df = pd.read_csv("train_data.txt", **read_opts)
test_df = pd.read_csv("test_data_solution.txt", **read_opts)

### Viewing a small portion of the dataset

In [15]:
# Peek at the first five rows to check that the columns were parsed as expected.
train_df.head(n=5)

Unnamed: 0,id,movie,genre,summary
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


## Cleaning Function

In [16]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

def clean_review(review):
    '''
    Normalise a piece of free text: lower-case it, drop URLs, replace every
    non-letter character with a space, remove English stop words and
    Porter-stem the remaining tokens.

    Input:
        review: a string containing a review, or an indexable container
                (e.g. a list or NumPy array) whose first element is the review.
    Output:
        review_cleaned: a processed review (stemmed tokens joined by single spaces).
    '''
    # isinstance (not an exact type comparison) so that str subclasses such as
    # np.str_ are treated as text instead of being indexed down to their
    # first character.
    if not isinstance(review, str):
        review = review[0]
    # A nested array (e.g. one row of a 2-D array) needs a second unwrap.
    if isinstance(review, np.ndarray):
        review = review[0]

    lower_string = review.lower()
    # Drop whole URLs: the scheme plus everything up to the next whitespace.
    no_url_str = re.sub(r'https?://\S+', ' ', lower_string)
    # Replace special characters, digits and punctuation with spaces.
    clean_str = re.sub(r'[^a-zA-Z]', ' ', no_url_str)

    stop_words_set = set(stopwords.words('english'))
    stemmer_object = PorterStemmer()

    review_str_tokens = word_tokenize(clean_str)
    # Tokens are already lower-case, so they can be compared to the stop-word
    # set directly.
    clean_word_list = [stemmer_object.stem(a_token)
                       for a_token in review_str_tokens
                       if a_token not in stop_words_set]

    review_cleaned = ' '.join(clean_word_list)
    return review_cleaned

### Shuffling and Splitting the data

In [17]:
# Fixed seed so that the shuffle and the train/validation split (and hence
# every downstream metric) are identical on each "Restart & Run All".
SEED = 42

train_shuffled = train_df.sample(frac=1, random_state=SEED)
# Hold out 10% of the shuffled training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(train_shuffled["summary"],
                                                  train_shuffled["genre"],
                                                  test_size=0.1,
                                                  random_state=SEED)
# The test file is loaded with its own "genre" column, so it is used as-is.
X_test, y_test = test_df["summary"], test_df["genre"]

### One-Hot Encoding

In [18]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)


def _as_column(labels):
    """Return a pandas Series of labels as an (n_samples, 1) NumPy array."""
    return labels.to_numpy().reshape(-1, 1)


ohe.fit(_as_column(train_shuffled["genre"]))  # Fit the encoder to genre of training data

train_ohe = ohe.transform(_as_column(y_train))
val_ohe = ohe.transform(_as_column(y_val))
test_ohe = ohe.transform(_as_column(y_test))

In [19]:
# Plain Python lists of the summaries for each split.
train_sentences, val_sentences, test_sentences = (
    split.tolist() for split in (X_train, X_val, X_test)
)

### Universal Sentence Encoder

In [20]:
# Universal Sentence Encoder from TF Hub, loaded with frozen weights.
embedding_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    name="universal_sentence_encoder",
    trainable=False,
)
AUTOTUNE = tf.data.AUTOTUNE


def _batched(texts, labels):
    """Pair texts with their one-hot labels, batch by 32 and prefetch."""
    return tfd.from_tensor_slices((texts, labels)).batch(32).prefetch(AUTOTUNE)


train_dataset = _batched(X_train, train_ohe)
val_dataset = _batched(X_val, val_ohe)
test_dataset = _batched(X_test, test_ohe)

train_dataset, val_dataset, test_dataset

(<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 27), dtype=tf.float64, name=None))>,
 <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 27), dtype=tf.float64, name=None))>,
 <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 27), dtype=tf.float64, name=None))>)

### Model

In [21]:
from tensorflow.keras import layers

# One output unit per distinct genre seen in the training data.
classes = train_shuffled["genre"].nunique()

# Functional model: raw string -> sentence embedding -> dense classifier head.
inputs = layers.Input(shape=[], dtype="string")
sentence_embedding = embedding_layer(inputs)
x = layers.Dense(512, activation="relu")(sentence_embedding)
outputs = layers.Dense(classes, activation="softmax")(x)
model = tf.keras.Model(inputs, outputs)

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss="categorical_crossentropy",
              metrics=["accuracy"])

model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 universal_sentence_encoder   (None, 512)              256797824 
 (KerasLayer)                                                    
                                                                 
 dense (Dense)               (None, 512)               262656    
                                                                 
 dense_1 (Dense)             (None, 27)                13851     
                                                                 
Total params: 257,074,331
Trainable params: 276,507
Non-trainable params: 256,797,824
_________________________________________________________________


In [22]:
# Each epoch runs on only a tenth of the available training and validation
# batches.
train_steps = int(0.1 * len(train_dataset))
val_steps = int(0.1 * len(val_dataset))

history = model.fit(train_dataset,
                    epochs=5,
                    steps_per_epoch=train_steps,
                    validation_data=val_dataset,
                    validation_steps=val_steps)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
# Loss and accuracy over every batch of the test set.
model.evaluate(x=test_dataset)



[1.3715871572494507, 0.5772877931594849]

Conclusion: