#### Import the libraries

In [1]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scrapping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re

import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

import string
import itertools

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:70% !important; }</style>"))

#### Keras Libraries

In [2]:
%%time

from nltk.stem import WordNetLemmatizer

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from time import time

#--------------------------------------------------------------

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

import pydot
import pydotplus
import graphviz

from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot

from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json

# Import ML FLow
import mlflow.tensorflow
import mlflow.pyfunc
from tensorflow.keras import regularizers
import datetime

# Import TensorBoard
import tensorflow_docs as tfdocs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import regularizers
# from tensorboard import default
# from tensorboard import program

import tensorflow_hub as hub
import bert
from bert import tokenization
from bert.tokenization import FullTokenizer

#Visualize Model

def visualize_model(model):
    return SVG(model_to_dot(model, show_shapes= True, show_layer_names=True, dpi=65).create(prog='dot', format='svg'))

from tensorflow.keras.utils import plot_model

from packaging import version

print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

TensorFlow version:  2.1.0
Version:  2.1.0
Eager mode:  True
Hub version:  0.7.0
GPU is NOT AVAILABLE
Wall time: 3.45 s


#### Import the dataset (this demonstrates how the genres have been cleaned)

In [None]:
dataset = pd.read_pickle('dataset_part_2_25012020.pkl')

print("\nThe shape of the dataset that will be used in Keras classifier is: {}".format(dataset.shape))

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### Check the correlation between user ratings and IMDB ratings

In [None]:
k2, p = stats.normaltest(dataset.rating)
print("p = {:g}".format(p))

alpha = 0.05
if p < alpha:  # null hypothesis: x comes from a normal distribution
    print("The null hypothesis can be rejected")
else:
    print("The null hypothesis cannot be rejected")

In [None]:
from scipy import stats
dataset['imdb_rating'] = dataset['imdb_rating'].astype(float)

k2, p = stats.normaltest(dataset.imdb_rating)
print("p = {:g}".format(p))

alpha = 0.05
if p < alpha:  # null hypothesis: x comes from a normal distribution
    print("The null hypothesis can be rejected")
else:
    print("The null hypothesis cannot be rejected")

In [None]:
from scipy.stats import spearmanr
rho, pval = spearmanr(dataset.rating,dataset.imdb_rating)

In [None]:
rho

In [None]:
import seaborn as sns
sns.scatterplot('rating','imdb_rating',data=dataset)
plt.title('user rating vs IMDB rating', fontsize=18)
plt.ylabel('IMDB rating', fontsize=16)
plt.xlabel('user rating', fontsize=16)

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### Understand the dependent variable: Genres of each movie

Check their frequency distribution

In [None]:
dataset['genres'].explode().value_counts()

In [None]:
round(dataset['genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
# STEP 1: Remove genres less than 1% frequency

dataset['reduced_genres'] = dataset['genres'].apply(
    lambda row: [val for val in row if val not in ['IMAX', 'Sport', 'Adult', 'News', 'Reality-TV',
                                                   'Film-Noir', 'Short', 'Family', 'Biography', 'Music', 'History']])

In [None]:
dataset['reduced_genres'].shape

In [None]:
# STEP 2: Find indexes with EMPTY LISTST

dataset_empty_lists = dataset[dataset.reduced_genres.apply(lambda c: c==[])]

remove_indices = dataset_empty_lists.index.to_list()

dataset_empty_lists

In [None]:
# STEP 3: Remove the indexes with EMPTY LISTS

dataset_frequent_genres =  dataset[~dataset.index.isin(remove_indices)]

In [None]:
dataset_frequent_genres.shape

In [None]:
dataset_frequent_genres = dataset_frequent_genres.reset_index(drop=True)

dataset_frequent_genres.shape

In [None]:
round(dataset_frequent_genres['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

# remember to cut the percentages of drama and comedy!

In [None]:
dataset_frequent_genres.to_pickle("dataset_part_2_05022020.pkl")

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### #1 Import cleaned of redundant genres dataset and genres_list

In [3]:
# import dataset
dataset_frequent_genres = pd.read_pickle('dataset_part_2_05022020.pkl')

print("\nThe shape of the dataset that will be used in Keras classifier is: {}".format(dataset_frequent_genres.shape))


The shape of the dataset that will be used in Keras classifier is: (49123, 13)


In [4]:
# Multy hot encoding since a Movie can have more than 1 genres assigned!

mlb = MultiLabelBinarizer()
dataset_frequent_genres = dataset_frequent_genres.join(pd.DataFrame(mlb.fit_transform(dataset_frequent_genres['reduced_genres']),
                                                                    columns=mlb.classes_,
                                                                    index=dataset_frequent_genres.index))

In [5]:
dataset_frequent_genres.columns

Index(['title', 'genres', 'rating', 'imdb_url', 'reviews_url', 'actors',
       'plot', 'imdb_rating', 'director', 'reviews', 'sentiment_value',
       'movie_features', 'reduced_genres', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [None]:
# import genres
genres_list = dataset_frequent_genres.columns[13:30].to_list() #13:30

with open('genres_list_08022020.pkl', 'wb') as handle:
    pickle.dump(genres_list, handle)

#### #2 Prune the movie reviews (keep only the first review for each movie)

In [6]:
dataset_frequent_genres['reviews_length'] = dataset_frequent_genres.reviews.apply(lambda x: len(x))

In [7]:
len(dataset_frequent_genres['reviews_length'][dataset_frequent_genres['reviews_length']==1])

# Since I don't want to loose 3326 movies, I will keep only the first review for each movie.

3326

In [8]:
dataset_frequent_genres.loc[:, 'reviews_pruned'] = dataset_frequent_genres.reviews.apply(lambda x: x[0])

In [9]:
dataset_frequent_genres.reviews_pruned.iloc[7]

'This version of the classic novel by Mark Twain makes a nice little movie, but fails to capture the humorous spirit of the book. Many hilarious scenes are missing and the writers have taken too many liberties with the plot.Jonathan Taylor Thomas is wonderful as Tom Sawyer. Unfortunately his charm does not last until the very end of the movie, but seems to fade after a while. It might possibly have something to do with the fact, that, even though this movie is based on "The Adventures Of Tom Sawyer", it\'s actually Huck\'s character that steals the whole show.I had a bit of a problem with the way Huck Finn was portrayed in this movie. Of course Brad Renfro is a fantastic actor (his performance is probably the best thing about this film) and he manages to give Huck Finn many tragic and deep aspects. Unfortunately Mark Twain\'s Huck Finn is neither tragic or very deep. One can only wonder why Huck has been made the brooding hero, when it\'s actually Tom Sawyer, who is supposed to be the 

In [10]:
table = str.maketrans(dict.fromkeys(string.punctuation))
dataset_frequent_genres.loc[:, 'reviews_pruned'] = dataset_frequent_genres.loc[:, 'reviews_pruned'].apply(lambda x: x.translate(table))

In [11]:
dataset_frequent_genres.reviews_pruned.iloc[7]

'This version of the classic novel by Mark Twain makes a nice little movie but fails to capture the humorous spirit of the book Many hilarious scenes are missing and the writers have taken too many liberties with the plotJonathan Taylor Thomas is wonderful as Tom Sawyer Unfortunately his charm does not last until the very end of the movie but seems to fade after a while It might possibly have something to do with the fact that even though this movie is based on The Adventures Of Tom Sawyer its actually Hucks character that steals the whole showI had a bit of a problem with the way Huck Finn was portrayed in this movie Of course Brad Renfro is a fantastic actor his performance is probably the best thing about this film and he manages to give Huck Finn many tragic and deep aspects Unfortunately Mark Twains Huck Finn is neither tragic or very deep One can only wonder why Huck has been made the brooding hero when its actually Tom Sawyer who is supposed to be the brightest star in this stor

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### #3 Unify (join) the columns of Actors and Reviews in order to achive a dataframe cell with a unique TEXT (corpus) and not a LIST of texts

In [12]:
# Function 1: Actors
def unify_actors(row):
    return ', '.join(row['actors'])

# Function 2: Reviews
def unify_reviews(row):
    return ', '.join(row['reviews'])

In [13]:
dataset_frequent_genres['actors_unified'] = dataset_frequent_genres.apply(unify_actors, axis=1)
dataset_frequent_genres['reviews_unified'] = dataset_frequent_genres.apply(unify_reviews, axis=1)

print("Actors before: {}".format(dataset_frequent_genres.actors.iloc[0]))
print("Actors after: {}\n".format(dataset_frequent_genres.actors_unified.iloc[0]))

print("Reviews before: {}".format(dataset_frequent_genres.reviews.iloc[0]))
print("Reviews after: {}".format(dataset_frequent_genres.reviews_unified.iloc[0]))

Actors before: ['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim Varney', 'Wallace Shawn', 'John Ratzenberger', 'Annie Potts', 'John Morris', 'Erik von Detten', 'Laurie Metcalf', 'R. Lee Ermey', 'Sarah Freeman', 'Penn Jillette', 'Jack Angel', 'Spencer Aste']
Actors after: Tom Hanks, Tim Allen, Don Rickles, Jim Varney, Wallace Shawn, John Ratzenberger, Annie Potts, John Morris, Erik von Detten, Laurie Metcalf, R. Lee Ermey, Sarah Freeman, Penn Jillette, Jack Angel, Spencer Aste

Reviews before: ["Andy's toys live a reasonable life of fun and peace, their only worries are birthdays and Christmases, when new toys could easily replace those already there.  One such birthday Andy's top toy, Woody the cowboy, finds himself in direct competition with Andy's new Buzz Lightyear doll.  When rivalries boil over Woody tries to hide Buzz down the side of the bed but accidentally pushes him out the window, the other tops expel Woody, and he leaves with no choice but to find Buzz and return him to the h

Reviews after: Andy's toys live a reasonable life of fun and peace, their only worries are birthdays and Christmases, when new toys could easily replace those already there.  One such birthday Andy's top toy, Woody the cowboy, finds himself in direct competition with Andy's new Buzz Lightyear doll.  When rivalries boil over Woody tries to hide Buzz down the side of the bed but accidentally pushes him out the window, the other tops expel Woody, and he leaves with no choice but to find Buzz and return him to the house.  But with only two days before Andy moves house, time is of the essence.Given how often the same mix of animation, wit, jokes and kids humour has been used since Toy Story (Ice Age, Monsters Inc, Bugs Life) it is easy to forget how refreshing it was when it first came out.  I have just watched it again and it is dating a little in comparison to more recent twists on the formula.  It seems each one has to be sharper and have more references etc in the background.  However i

7 out of 10 (an average rating for a Disney animated feature), Toy Story is a good movie. It has good characters and a decent story. But is it really worth the hype? Yes, it's the first fully computer animated movie ever made and that's very impressive. For that reason alone, it will stand out in history. But I certainly never really thought it was the brilliant classic everyone else thought it was. Its animation is great but the story suffers from major pacing issues. There are entire moments of the movie right in the middle that bore me every time and don't stop boring me until I get to the film's climax, when it picks up again. A lot of trimming should have been done in the middle. Toy Story is good but its sequels are a lot better. So is it bad? No. But it's not brilliant. If this were 2-D, I doubt it would receive this high praise., "Toy Story" is a real sight to behold because it is the first feature-length, computer-animated film.  Once you get passed the amazing visual effects,

#### #4 Functions

In [14]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

class Keras_Configurations_model1():
    
        MAX_FEATURES = 20000

class Keras_Configurations_model2():

    MAX_FEATURES = 17500
    
class Keras_Configurations_model3():
        
        MAX_FEATURES = 20000

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 1

def inference_function(indx, model, x_test_seq, x_test, genres_list):
    
    test_sequence = x_test_seq[indx:indx+1]
    
    text_prediction = model.predict(test_sequence)
    
    [float(i) for i in text_prediction[0]]
    
    tag_probabilities = text_prediction[0][np.argsort(text_prediction[0])[-3:]]
    
    indexes = np.argsort(text_prediction[0])[::-1][:3]

    predicted_tags = []
    
    for i, tag in enumerate(genres_list):
        if i in indexes:
            predicted_tags.append(genres_list[i])
    
    return print('\n\nMovie Title: {}'.format(x_test['Movie Title'].iloc[indx]), '\n\nPredicted Genre labels: {}'.format(predicted_tags), '\n\nWith predicted probabilities: {}'.format(tag_probabilities), '\n\nThe actual Genre labels: {}'.format(x_test['Genres'].iloc[indx]), "\n\n", "---------------------------------------------------------------------------------------------------------")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 2

def preprocess_text(text):
    
    stop_words = set(stopwords.words('english'))
    
    lemmatizer = WordNetLemmatizer()
    
    no_stopword_text = [word for word in text.split(' ') if not word in stop_words]
    
    lemmatized_text = [lemmatizer.lemmatize(word, pos='v') for word in no_stopword_text]
    
    lowercase_text = [word.lower() for word in lemmatized_text]
    
    return ' '.join(lowercase_text)

def transform_actors(column_name, dataset):
    
#     actors_list = []

#     for i in range(len(actors_column)):
#         actors_list.append([element.lower() for element in actors_column.iloc[i]])

#     dataset.loc[:, 'clean_actors'] = actors_list

    dataset.loc[:, 'clean_actors'] = dataset.loc[:, column_name].apply(lambda x: x.lower())

def transform_plot(column_name, dataset):
    
    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    
    dataset.loc[:, 'clean_plot_summary'] = dataset.loc[:, column_name].apply(lambda x: preprocess_text(x))

def transform_features(column_name, dataset):
    
    dataset.loc[:, 'clean_combined_features'] = dataset.loc[:, column_name].apply(lambda x: preprocess_text(x))
    
def transform_reviews(column_name, dataset):
    
#     reviews_list = []
    
#     for i in range(len(reviews_column)):
#         reviews_list.append([preprocess_text(element) for element in reviews_column.iloc[i]])

#     dataset.loc[:, 'clean_reviews'] = reviews_list
    
    dataset.loc[:, 'clean_reviews'] = dataset.loc[:, column_name].apply(lambda x: preprocess_text(x))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 3.1 (version 1 - create different X trains)

# def split_dataset(column_name, labels, dataset):
    
#     X = dataset[[column_name,'title', 'reduced_genres']]
    
#     y = labels #dataset['reduced_genres']

#     #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle = True, stratify = y)
#     #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    
#     return X_train, X_test, y_train, y_test

# version 3.2

def split_dataset(labels, dataset):
    
    X = dataset[['title', 'clean_actors', 'clean_plot_summary', 'clean_combined_features', 'clean_reviews', 'reduced_genres']]
    
    y = labels #dataset['reduced_genres']

    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle = True, stratify = y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle= True)
    
    # change test size to 50-50 percent not 80-20!
    return X_train, X_test, y_train, y_test

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Function 4

def keras_tokenization(variable, x_train, x_test):
    
    if variable == "actors":

        actors_tokenizer = Tokenizer(num_words=Keras_Configurations_model1.MAX_FEATURES, lower=True, split=',', oov_token = '<OOV>')
    
        actors_tokenizer.fit_on_texts(list(x_train.loc[:, 'clean_actors']))

        actors_tokenizer.word_index = {e:i for e,i in actors_tokenizer.word_index.items() if i <= Keras_Configurations_model1.MAX_FEATURES}
    
        actors_tokenizer.word_index[actors_tokenizer.oov_token] = Keras_Configurations_model1.MAX_FEATURES + 1

        x_train.loc[:, 'actors_seqs'] = actors_tokenizer.texts_to_sequences(x_train.loc[:, 'clean_actors'])
    
        x_test.loc[:, 'actors_seqs'] = actors_tokenizer.texts_to_sequences(x_test.loc[:, 'clean_actors'])

        vocabulary_size_frequent_words = len(actors_tokenizer.word_index) + 1
        
        tokenizer = actors_tokenizer
            
    elif variable == "plot":
        
        plot_tokenizer = Tokenizer(num_words=Keras_Configurations_model2.MAX_FEATURES, lower=True, split=' ', oov_token = '<OOV>')
        
        plot_tokenizer.fit_on_texts(list(x_train.loc[:, 'clean_plot_summary']))

        plot_tokenizer.word_index = {e:i for e,i in plot_tokenizer.word_index.items() if i <= Keras_Configurations_model2.MAX_FEATURES}
        
        plot_tokenizer.word_index[plot_tokenizer.oov_token] = Keras_Configurations_model2.MAX_FEATURES + 1

        x_train.loc[:, 'plot_summary_seqs'] = plot_tokenizer.texts_to_sequences(x_train.loc[:, 'clean_plot_summary'])
        
        x_test.loc[:, 'plot_summary_seqs'] = plot_tokenizer.texts_to_sequences(x_test.loc[:, 'clean_plot_summary'])

        vocabulary_size_frequent_words = len(plot_tokenizer.word_index) + 1
        
        tokenizer = plot_tokenizer
        
    elif variable == "features":
        
        combined_features_tokenizer = Tokenizer(num_words=Keras_Configurations_model3.MAX_FEATURES, lower=True, split=' ', oov_token = '<OOV>')
        
        combined_features_tokenizer.fit_on_texts(list(x_train.loc[:, 'clean_combined_features']))

        combined_features_tokenizer.word_index = {e:i for e,i in combined_features_tokenizer.word_index.items() if i <= Keras_Configurations_model3.MAX_FEATURES}
        
        combined_features_tokenizer.word_index[combined_features_tokenizer.oov_token] = Keras_Configurations_model3.MAX_FEATURES + 1

        x_train.loc[:, 'combined_features_seqs'] = combined_features_tokenizer.texts_to_sequences(x_train.loc[:, 'clean_combined_features'])
        
        x_test.loc[:, 'combined_features_seqs'] = combined_features_tokenizer.texts_to_sequences(x_test.loc[:, 'clean_combined_features'])

        vocabulary_size_frequent_words = len(combined_features_tokenizer.word_index) + 1
        
        tokenizer = combined_features_tokenizer
        
    elif variable == "reviews":
        
        reviews_tokenizer = Tokenizer(num_words=40000, lower=True, split=' ', oov_token = '<OOV>')
        
        reviews_tokenizer.fit_on_texts(x_train.loc[:, 'clean_reviews'])

        reviews_tokenizer.word_index = {e:i for e,i in reviews_tokenizer.word_index.items() if i <= 40000}
        
        reviews_tokenizer.word_index[reviews_tokenizer.oov_token] = 40000 + 1

        x_train.loc[:, 'reviews_seqs'] = x_train.loc[:, 'clean_reviews'].apply(lambda x: reviews_tokenizer.texts_to_sequences(x))
        
        x_test.loc[:, 'reviews_seqs'] = x_test.loc[:, 'clean_reviews'].apply(lambda x: reviews_tokenizer.texts_to_sequences(x))

        vocabulary_size_frequent_words = len(reviews_tokenizer.word_index) + 1
        
        tokenizer = reviews_tokenizer
        
    return vocabulary_size_frequent_words, tokenizer

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 5

def padding_sequnce_length(variable, x_train):
    
    if variable == "actors":
    
        all_train_lengths =  list(x_train.actors_seqs.apply(len))

        maxlen = int(np.percentile(all_train_lengths, q=90))

        print('Max Length of the pad sequence for Actors: {}\n'.format(maxlen))
        
    elif variable == "plot":
        
        all_train_lengths = list(x_train.plot_summary_seqs.apply(len))

        maxlen = int(np.percentile(all_train_lengths, q=90))

        print('Max Length of each padding sequence for Plot summary text: {}\n'.format(maxlen))
        
    elif variable == "features":
        
        all_train_lengths =  list(x_train.combined_features_seqs.apply(len))

        maxlen = int(np.percentile(all_train_lengths, q=90))

        print('Max Length of each padding sequence for Movie features text: {}\n'.format(maxlen))
        
    elif variable == "reviews":
        
        all_train_lengths =  list(x_train.reviews_seqs.apply(len))

        maxlen = int(np.percentile(all_train_lengths, q=90))

        print('Max Length of each padding sequence for Reviews text: {}\n'.format(maxlen))
        
    return maxlen

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 6

# the input data for a deep learning model must be a single tensor (of shape e.g. (batch_size, 6, vocab_size) in this case), 
# samples that are shorter than the longest item need to be padded with some placeholder value.

#url https://www.tensorflow.org/guide/keras/masking_and_padding
def padding_sequence(variable, x_train, x_test, y_train, y_test, maxlen):
    
    if variable == "actors":
        
        x_train_seq = pad_sequences(x_train.loc[:, 'actors_seqs'], padding='post', maxlen=maxlen)
    
        x_test_seq = pad_sequences(x_test.loc[:, 'actors_seqs'], padding='post', maxlen=maxlen)
        
        #--------------------------------------------------------------------
        
        # Task 1: Discussed with Mr. Louridas
        
        #padded_shapes = ([100], [None]) # ([None],())
        
        #x_train_seq = x_train.values.shuffle(1000).padded_batch(32, padded_shapes = padded_shapes) # shuffle is the length of the longest string
        #x_test_seq = x_test.values.shuffle(1000).padded_batch(32, padded_shapes = padded_shapes)

        #--------------------------------------------------------------------
        
        assert len(x_train_seq) == len(y_train) # x_train_seq

        assert len(x_test_seq) == len(y_test) # x_test_seq
        
    elif variable == "plot":
        
        x_train_seq = pad_sequences(x_train.loc[:, 'plot_summary_seqs'], padding='post', maxlen=maxlen)
    
        x_test_seq = pad_sequences(x_test.loc[:, 'plot_summary_seqs'], padding='post', maxlen=maxlen)

        assert len(x_train_seq) == len(y_train)

        assert len(x_test_seq) == len(y_test)
        
    elif variable == "features":
        
        x_train_seq = pad_sequences(x_train.loc[:, 'combined_features_seqs'], padding='post', maxlen=maxlen)
    
        x_test_seq = pad_sequences(x_test.loc[:, 'combined_features_seqs'], padding='post', maxlen=maxlen)

        assert len(x_train_seq) == len(y_train)

        assert len(x_test_seq) == len(y_test)
        
    elif variable == "reviews":
        
        x_train_seq = pad_sequences(x_train.loc[:, 'reviews_seqs'], padding='post', maxlen=maxlen)
    
        x_test_seq = pad_sequences(x_test.loc[:, 'reviews_seqs'], padding='post', maxlen=maxlen)

        assert len(x_train_seq) == len(y_train)

        assert len(x_test_seq) == len(y_test)
        
    return x_train_seq, x_test_seq

In [15]:
%%time
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Transfrom the columns
print("---------------------------------------------------------------------------------\n")
print("Transfrom the column of the actors\n")
transform_actors("actors_unified", dataset_frequent_genres) # function 3: transform_actors

print("Transfrom the column of the plot summary\n")
transform_plot("plot", dataset_frequent_genres) # function 3: transform_plot

print("Transfrom the column of the movie features\n")
transform_features("movie_features", dataset_frequent_genres) # function 3: transform_features

print("Transfrom the column of the movie reviews\n")
transform_reviews("reviews_pruned", dataset_frequent_genres) # function 3: transform_reviews

---------------------------------------------------------------------------------

Transfrom the column of the actors

Transfrom the column of the plot summary

Transfrom the column of the movie features

Transfrom the column of the movie reviews

Wall time: 1min 17s


In [16]:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Split the dataset into train & validation set
print("\n---------------------------------------------------------------------------------")
print("\nSplit the dataset into train & validation set\n")

X_train, X_test, y_train, y_test = split_dataset(dataset_frequent_genres.iloc[:, 13:30], dataset_frequent_genres) #13:30


---------------------------------------------------------------------------------

Split the dataset into train & validation set



In [17]:
print("X_train shape:{}".format(X_train.shape))
print("X_test shape:{}".format(X_test.shape))
print("y_train shape:{}".format(y_train.shape))
print("y_test shape:{}".format(y_test.shape))

X_train shape:(39298, 6)
X_test shape:(9825, 6)
y_train shape:(39298, 17)
y_test shape:(9825, 17)


#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Prune the most frequent genres (drama, comedy) - this is for the balancing the data

#### Check the frequency of each genre tag in TRAIN, TEST datasets and prune the high frequent genres to re-balance both train and test datasets

In [18]:
round(X_train['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

Drama          25.280
Comedy         15.903
Action          7.566
Romance         7.319
Thriller        6.996
Horror          5.749
Crime           5.404
Documentary     4.353
Adventure       4.267
Sci-Fi          3.001
Mystery         2.722
Children        2.542
Animation       2.401
Fantasy         2.314
War             1.664
Western         1.346
Musical         1.172
Name: reduced_genres, dtype: float64

In [None]:
X_train.iloc[0:10]

In [None]:
# Re-balance genre Drama

X_train_drama = X_train[X_train["reduced_genres"].apply(lambda x: "Drama" in x)]
X_train_drama_out = X_train_drama.sample(frac=.85)
remove_indexes = X_train_drama_out.index
X_train_updated_version1 = X_train[~X_train.index.isin(remove_indexes)]
print("X_train updated shape: {}".format(X_train_updated_version1.shape))

y_train_updated_version1 = y_train[~y_train.index.isin(remove_indexes)]
print("y_train updated shape: {}".format(y_train_updated_version1.shape))

In [None]:
round(X_train_updated_version1['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
# Re-balance genre Comedy

X_train_comedy = X_train_updated_version1[X_train_updated_version1["reduced_genres"].apply(lambda x: "Comedy" in x)]
X_train_comedy_out = X_train_comedy.sample(frac=.75)
remove_indexes = X_train_comedy_out.index
X_train_updated_version2 = X_train_updated_version1[~X_train_updated_version1.index.isin(remove_indexes)]
print("X_train updated shape: {}".format(X_train_updated_version2.shape))

y_train_updated_version2 = y_train_updated_version1[~y_train_updated_version1.index.isin(remove_indexes)]
print("y_train updated shape: {}".format(y_train_updated_version2.shape))

In [None]:
round(X_train_updated_version2['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [None]:
round(X_test['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
# Re-balance genre Drama

X_test_drama = X_test[X_test["reduced_genres"].apply(lambda x: "Drama" in x)]
X_test_drama_out = X_test_drama.sample(frac=.85)
remove_indexes = X_test_drama_out.index
X_test_updated_version1 = X_test[~X_test.index.isin(remove_indexes)]
X_test_updated_version1.shape
print("X_test updated shape: {}".format(X_test_updated_version1.shape))

y_test_updated_version1 = y_test[~y_test.index.isin(remove_indexes)]
print("y_test updated shape: {}".format(y_test_updated_version1.shape))

In [None]:
# Re-balance genre Comedy

X_test_comedy = X_test_updated_version1[X_test_updated_version1["reduced_genres"].apply(lambda x: "Comedy" in x)]
X_test_comedy_out = X_test_comedy.sample(frac=.75)
remove_indexes = X_test_comedy_out.index
X_test_updated_version2 = X_test_updated_version1[~X_test_updated_version1.index.isin(remove_indexes)]
print("X_test updated shape: {}".format(X_test_updated_version2.shape))

y_test_updated_version2 = y_test_updated_version1[~y_test_updated_version1.index.isin(remove_indexes)]
print("y_test updated shape: {}".format(y_test_updated_version2.shape))

In [None]:
round(X_test_updated_version2['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
# After dropping the frequent genre tags (Drama & Comedy)

print("X_train shape:{}".format(X_train_updated_version2.shape))
print("X_test shape:{}".format(X_test_updated_version2.shape))
print("y_train shape:{}".format(y_train_updated_version2.shape))
print("y_test shape:{}".format(y_test_updated_version2.shape))

# These are the X and y datasets with 50-50 split and closely balanced genre tags.
# Even though balanced, the total number of data points reduced by 14,000 movies.

#--------------------------------

# These are the X and y datasets with 80-20 split and closely balanced genre tags.
# Even though balanced, the total number of data points reduced by 12,000 movies.

In [None]:
# Before dropping the frequent genre tags (Drama & Comedy)

print("X_train shape:{}".format(X_train.shape))
print("X_test shape:{}".format(X_test.shape))
print("y_train shape:{}".format(y_train.shape))
print("y_test shape:{}".format(y_test.shape))

#### End of data re-balancing
#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [19]:
%%time
# Separate each different input column (actors, plot, features)

# BECAREFUL: X_train = before balancing the data, X_train_updated_version2 is for the balanced data

X_train_actors = X_train[["title", "clean_actors", "reduced_genres"]]
X_train_plot = X_train[["title", "clean_plot_summary", "reduced_genres"]]
X_train_features = X_train[["title", "clean_combined_features", "reduced_genres"]]
X_train_reviews = X_train[["title", "clean_reviews", "reduced_genres"]]
# In X_train and X_test I also use columns "title" and "genres" since they will be both used later for making inference with predictions

assert X_train_actors.shape==X_train_plot.shape==X_train_features.shape==X_train_reviews.shape

X_test_actors = X_test[["title", "clean_actors", "reduced_genres"]]
X_test_plot = X_test[["title", "clean_plot_summary", "reduced_genres"]]
X_test_features = X_test[["title", "clean_combined_features", "reduced_genres"]]
X_test_reviews = X_test[["title", "clean_reviews", "reduced_genres"]]

assert X_test_actors.shape==X_test_plot.shape==X_test_features.shape==X_test_reviews.shape

Wall time: 15 ms


In [20]:
%time #(This block of code should be executed each time the split range changes (80-20-> 50-50))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Tokenize the dataset (using the keras tokenizer)
print("\n---------------------------------------------------------------------------------")
print("\nTokenize the dataset (using the keras tokenizer)\n")

vocabulary_size_frequent_words_actors, tokenizer_actors = keras_tokenization("actors", X_train_actors, X_test_actors) # function 5: keras_tokenization
print("\nActors tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_actors))

vocabulary_size_frequent_words_plot, tokenizer_plot = keras_tokenization("plot", X_train_plot, X_test_plot) # function 5: keras_tokenization
print("\nPlot Summary tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_plot))

vocabulary_size_frequent_words_features, tokenizer_features = keras_tokenization("features", X_train_features, X_test_features) # function 5: keras_tokenization
print("\nMovie Features tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_features))

Wall time: 0 ns

---------------------------------------------------------------------------------

Tokenize the dataset (using the keras tokenizer)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s



Actors tokenized with maximum number of words: 20001


Plot Summary tokenized with maximum number of words: 17501


Movie Features tokenized with maximum number of words: 20001



In [21]:
%%time
vocabulary_size_frequent_words_reviews, tokenizer_reviews = keras_tokenization("reviews", X_train_reviews, X_test_reviews) # function 5: keras_tokenization
print("\nMovie Reviews tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_reviews))

# Done once and then was pickled...due to long time of completion!


Movie Reviews tokenized with maximum number of words: 40001

Wall time: 5min 55s


#### Comment: The three below blocks of code where executed once and then were pickled!

In [24]:
# X_train_reviews.loc[:, 'reviews_seqs'] = X_train_reviews.loc[:, 'reviews_seqs'].apply(lambda x: [[40001] if len(sublist)==0 else sublist for sublist in x])
# X_test_reviews.loc[:, 'reviews_seqs'] = X_test_reviews.loc[:, 'reviews_seqs'].apply(lambda x: [[40001] if len(sublist)==0 else sublist for sublist in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [25]:
# X_train_reviews.loc[:, 'reviews_seqs'] = X_train_reviews.loc[:, 'reviews_seqs'].apply(lambda x: list(itertools.chain.from_iterable(x)))
# X_test_reviews.loc[:, 'reviews_seqs'] = X_test_reviews.loc[:, 'reviews_seqs'].apply(lambda x: list(itertools.chain.from_iterable(x)))

In [None]:
#### PICKLE THE Reviews Tokenizer

# saving
# with open('reviews_tokenizer_06022020.pkl', 'wb') as handle:
#     pickle.dump(tokenizer_reviews, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# X_train_reviews.to_pickle("x_train_reviews_06022020.pkl")
# X_test_reviews.to_pickle("x_test_reviews_06022020.pkl")

In [None]:
#### Load THE Reviews Tokenizer and X_train_reviews, X_test_reviews

# loading
# with open('reviews_tokenizer_06022020.pkl', 'rb') as handle:
#     tokenizer = pickle.load(handle)

# X_train_reviews = pd.read_pickle("x_train_reviews_06022020.pkl")
# X_test_reviews = pd.read_pickle("x_test_reviews_06022020.pkl")

In [26]:
%%time

# BECAREFUL: y_train/y_test: BEFORE re-balancing the dataset, y_train_updated_version2/y_test_updated_version2: AFTER re-balancing the dataset

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Specify the length of the maxlen variable
print("\n---------------------------------------------------------------------------------")
print("\nSpecify the length of the maxlen variable (length is a parameter for the optimal padding execution)\n")

maxlen_actors = padding_sequnce_length("actors", X_train_actors) # function 6: padding_sequnce_length
maxlen_plot = padding_sequnce_length("plot", X_train_plot) # function 6: padding_sequnce_length
maxlen_features = padding_sequnce_length("features", X_train_features) # function 6: padding_sequnce_length
maxlen_reviews = padding_sequnce_length("reviews", X_train_reviews) # function 6: padding_sequnce_length

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create the padding sequence of texts
print("\n---------------------------------------------------------------------------------")
print("\nCreate the padding sequence of texts\n")

X_train_seq_actors, X_test_seq_actors = padding_sequence("actors", X_train_actors, X_test_actors, y_train, y_test, maxlen_actors) # function 7: padding_sequence
print("\nActors padded sequences created\n")

X_train_seq_plot, X_test_seq_plot = padding_sequence("plot", X_train_plot, X_test_plot, y_train, y_test, maxlen_plot) # function 7: padding_sequence
print("Plot padded sequences created\n")

X_train_seq_features, X_test_seq_features = padding_sequence("features", X_train_features, X_test_features, y_train, y_test, maxlen_features) # function 7: padding_sequence
print("Movie Features padded sequences created\n")

X_train_seq_reviews, X_test_seq_reviews = padding_sequence("reviews", X_train_reviews, X_test_reviews, y_train, y_test, maxlen_reviews) # function 7: padding_sequence
print("Movie Reviews padded sequences created")


---------------------------------------------------------------------------------

Specify the length of the maxlen variable (length is a parameter for the optimal padding execution)

Max Length of the pad sequence for Actors: 17

Max Length of each padding sequence for Plot summary text: 23

Max Length of each padding sequence for Movie features text: 60

Max Length of each padding sequence for Reviews text: 2067


---------------------------------------------------------------------------------

Create the padding sequence of texts


Actors padded sequences created

Plot padded sequences created

Movie Features padded sequences created

Movie Reviews padded sequences created
Wall time: 3.5 s


#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #1st case of data: 80-20 split and non-balanced dataset!

* X_train & X_test with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [27]:
print("X_train_seq_actors shape:{}".format(X_train_seq_actors.shape)) #80-20 split
print("X_train_seq_plot shape:{}".format(X_train_seq_plot.shape)) #80-20 split
print("X_train_seq_features shape:{}".format(X_train_seq_features.shape)) #80-20 split
print("X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape)) #80-20 split

print("X_test_seq_actors shape:{}".format(X_test_seq_actors.shape)) #80-20 split
print("X_test_seq_plot shape:{}".format(X_test_seq_plot.shape)) #80-20 split
print("X_test_seq_features shape:{}".format(X_test_seq_features.shape)) #80-20 split
print("X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape)) #80-20 split

X_train_seq_actors shape:(39298, 17)
X_train_seq_plot shape:(39298, 23)
X_train_seq_features shape:(39298, 60)
X_train_seq_reviews shape:(39298, 2067)

X_test_seq_actors shape:(9825, 17)
X_test_seq_plot shape:(9825, 23)
X_test_seq_features shape:(9825, 60)
X_test_seq_reviews shape:(9825, 2067)


* y_train & y_test with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [28]:
print("y_train shape:{}".format(y_train.shape)) #80-20 split
print("y_test shape:{}".format(y_test.shape)) #80-20 split

y_train shape:(39298, 17)
y_test shape:(9825, 17)


In [29]:
np.save("x_train_seq_actors_80-20_non-balanced_07022020", X_train_seq_actors)
np.save("x_train_seq_plot_80-20_non-balanced_07022020", X_train_seq_plot)
np.save("x_train_seq_features_80-20_non-balanced_07022020", X_train_seq_features)
np.save("x_train_seq_reviews_80-20_non-balanced_07022020", X_train_seq_reviews)

np.save("x_test_seq_actors_80-20_non-balanced_07022020", X_test_seq_actors)
np.save("x_test_seq_plot_80-20_non-balanced_07022020", X_test_seq_plot)
np.save("x_test_seq_features_80-20_non-balanced_07022020", X_test_seq_features)
np.save("x_test_seq_reviews_80-20_non-balanced_07022020", X_test_seq_reviews)

np.save("y_train_80-20_non-balanced_07022020", y_train) #np.save: saves a multi-hot encoded dataframe as array!
np.save("y_test_80-20_non-balanced_07022020", y_test)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #2nd case of data: 50-50 split and non-balanced dataset!

* X_train & X_test with <b>50-50</b> split and <b>non-balanced genre</b> tags

In [None]:
print("X_train_seq_actors shape:{}".format(X_train_seq_actors.shape)) #50-50 split
print("X_train_seq_plot shape:{}".format(X_train_seq_plot.shape)) #50-50 split
print("X_train_seq_features shape:{}".format(X_train_seq_features.shape)) #50-50 split
print("X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape)) #50-50 split

print("X_test_seq_actors shape:{}".format(X_test_seq_actors.shape)) #50-50 split
print("X_test_seq_plot shape:{}".format(X_test_seq_plot.shape)) #50-50 split
print("X_test_seq_features shape:{}".format(X_test_seq_features.shape)) #50-50 split
print("X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape)) #50-50 split

* y_train & y_test with <b>50-50</b> split and <b>non-balanced genre</b> tags

In [None]:
print("y_train shape:{}".format(y_train.shape)) #50-50 split
print("y_test shape:{}".format(y_test.shape)) #50-50 split

In [None]:
np.save("x_train_seq_actors_50-50_non-balanced_07022020.pkl", X_train_seq_actors)
np.save("x_train_seq_plot_50-50_non-balanced_07022020.pkl", X_train_seq_plot)
np.save("x_train_seq_features_50-50_non-balanced_07022020.pkl", X_train_seq_features)
np.save("x_train_seq_reviews_50-50_non-balanced_07022020.pkl", X_train_seq_reviews)

np.save("x_test_seq_actors_50-50_non-balanced_07022020.pkl", X_test_seq_actors)
np.save("x_test_seq_plot_50-50_non-balanced_07022020.pkl", X_test_seq_plot)
np.save("x_test_seq_features_50-50_non-balanced_07022020.pkl", X_test_seq_features)
np.save("x_test_seq_reviews_50-50_non-balanced_07022020.pkl", X_test_seq_reviews)

np.save("y_train_50-50_non-balanced_07022020.pkl", y_train)
np.save("y_test_50-50_non-balanced_07022020.pkl", y_test)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #3rd case of data: 50-50 split and balanced dataset!

* X_train & X_test with <b>50-50</b> split and <b>balanced genre</b> tags

In [None]:
print("X_train_seq_actors shape:{}".format(X_train_seq_actors.shape)) #50-50 split, balanced genres
print("X_train_seq_plot shape:{}".format(X_train_seq_plot.shape)) #50-50 split, balanced genres
print("X_train_seq_features shape:{}".format(X_train_seq_features.shape)) #50-50 split, balanced genres
print("X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape)) #50-50 split, balanced genres

print("X_test_seq_actors shape:{}".format(X_test_seq_actors.shape)) #50-50 split, balanced genres
print("X_test_seq_plot shape:{}".format(X_test_seq_plot.shape)) #50-50 split, balanced genres
print("X_test_seq_features shape:{}".format(X_test_seq_features.shape)) #50-50 split, balanced genres
print("X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape)) #50-50 split, balanced genres

* y_train & y_test with <b>50-50</b> split and <b>balanced genre</b> tags

In [None]:
print("y_train shape:{}".format(y_train_updated_version2.shape)) #50-50 split, balanced genres
print("y_test shape:{}".format(y_test_updated_version2.shape)) #50-50 split, balanced genres

In [None]:
np.save("x_train_seq_actors_50-50_balanced_07022020.pkl", X_train_seq_actors)
np.save("x_train_seq_plot_50-50_balanced_07022020.pkl", X_train_seq_plot)
np.save("x_train_seq_features_50-50_balanced_07022020.pkl", X_train_seq_features)
np.save("x_train_seq_reviews_50-50_balanced_07022020.pkl", X_train_seq_reviews)

np.save("x_test_seq_actors_50-50_balanced_07022020.pkl", X_test_seq_actors)
np.save("x_test_seq_plot_50-50_balanced_07022020.pkl", X_test_seq_plot)
np.save("x_test_seq_features_50-50_balanced_07022020.pkl", X_test_seq_features)
np.save("x_test_seq_reviews_50-50_balanced_07022020.pkl", X_test_seq_reviews)

np.save("y_train_50-50_balanced_07022020.pkl", y_train_updated_version2)
np.save("y_test_50-50_balanced_07022020.pkl", y_test_updated_version2)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #4th case of data: 80-20 split and balanced dataset!

* X_train & X_test with <b>80-20</b> split and <b>balanced genre</b> tags

In [None]:
print("X_train_seq_actors shape:{}".format(X_train_seq_actors.shape)) #80-20 split, balanced genres
print("X_train_seq_plot shape:{}".format(X_train_seq_plot.shape)) #80-20 split, balanced genres
print("X_train_seq_features shape:{}".format(X_train_seq_features.shape)) #80-20 split, balanced genres
print("X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape)) #80-20 split, balanced genres

print("X_test_seq_actors shape:{}".format(X_test_seq_actors.shape)) #80-20 split, balanced genres
print("X_test_seq_plot shape:{}".format(X_test_seq_plot.shape)) #80-20 split, balanced genres
print("X_test_seq_features shape:{}".format(X_test_seq_features.shape)) #80-20 split, balanced genres
print("X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape)) #80-20 split, balanced genres

* y_train & y_test with <b>80-0</b> split and <b>balanced genre</b> tags

In [None]:
print("y_train shape:{}".format(y_train_updated_version2.shape)) #80-20 split, balanced genres
print("y_test shape:{}".format(y_test_updated_version2.shape)) #80-20 split, balanced genres

In [None]:
np.save("x_train_seq_actors_80-20_balanced_07022020.pkl", X_train_seq_actors)
np.save("x_train_seq_plot_80-20_balanced_07022020.pkl", X_train_seq_plot)
np.save("x_train_seq_features_80-20_balanced_07022020.pkl", X_train_seq_features)
np.save("x_train_seq_reviews_80-20_balanced_07022020.pkl", X_train_seq_reviews)

np.save("x_test_seq_actors_80-20_balanced_07022020.pkl", X_test_seq_actors)
np.save("x_test_seq_plot_80-20_balanced_07022020.pkl", X_test_seq_plot)
np.save("x_test_seq_features_80-20_balanced_07022020.pkl", X_test_seq_features)
np.save("x_test_seq_reviews_80-20_balanced_07022020.pkl", X_test_seq_reviews)

np.save("y_train_80-20_balanced_07022020.pkl", y_train_updated_version2)
np.save("y_test_80-20_balanced_07022020.pkl", y_test_updated_version2)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Comment: the below two pieces of code would not be executed (deleted in newer version)

In [None]:
import json

label_result = !curl -s -d "text='Mel Brooks's scattergun approach to comedy has a number of misses. Spaceballs was OK at parodying its genre. This film is far more sophisticated and well played.The successful jokes are on the culture of Victorain times with references to an engaged couple who after 10 years have suddenly held hands being condemned as immoral, prostitutes, lechers and the like. Into these cultural and successful observations Brook's introduces Leslie Nielson doing a great impression of Bela Lugosi's Dracula with the difference that his powers are incompetent.Seeing the Lugosi movie will give you the basis to appreciate the sophistication of this film'" http://text-processing.com/api/sentiment/
print(label_result)

y = json.loads(label_result[0])
y['label']

In [None]:
from tqdm import tqdm_notebook
sentiment_list=[]
for i in tqdm_notebook(range(len(dataset_frequent_genres.reviews_pruned))):
    label_result = !curl -s -d "text={}".format(dataset_frequent_genres.reviews_pruned.iloc[i]) http://text-processing.com/api/sentiment/
    y = json.loads(label_result[0])
    sentiment_list.append(y['label'])

#### THIS IS THE END OF PART 3.1 - Where tokenization, cleaning and balancing of the data took place.
#### The next PART 3.2, focuses on training & validating different models neural models based on the data prepaired on PART 3.1