### Part 3.1 - Data Tokenization-Transformation (latest changes on 09.03.2020)

#### Import the libraries

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re
import os

import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction import text

import string
import itertools

from scipy import stats

# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:70% !important; }</style>"))

#### Keras Libraries

In [None]:
from nltk.stem import WordNetLemmatizer

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from time import time

#--------------------------------------------------------------

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from tensorflow.keras import models

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

import pydot
import pydotplus
import graphviz

from IPython.display import SVG
from tensorflow.keras.utils import model_to_dot

from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
import json

# Import ML FLow
import mlflow.tensorflow
import mlflow.pyfunc
from tensorflow.keras import regularizers
import datetime

# Import TensorBoard
import tensorflow_docs as tfdocs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import regularizers
# from tensorboard import default
# from tensorboard import program

import tensorflow_hub as hub
import bert
from bert import tokenization
from bert.tokenization import FullTokenizer

#Visualize Model

def visualize_model(model):
    """Render a Keras model's layer graph as an SVG object for inline display.

    The model is converted to a dot graph (tensor shapes and layer names
    shown, dpi=65), rendered to SVG with the ``dot`` program, and wrapped in
    ``IPython.display.SVG``.
    """
    dot_graph = model_to_dot(model, show_shapes=True, show_layer_names=True, dpi=65)
    svg_markup = dot_graph.create(prog='dot', format='svg')
    return SVG(svg_markup)

from tensorflow.keras.utils import plot_model

from packaging import version

# Environment report: print the library versions in use and stop early when
# the installed TensorFlow is older than 2.0.
print("TensorFlow version: ", tf.__version__)
tf_release = version.parse(tf.__version__).release
assert tf_release[0] >= 2, "This notebook requires TensorFlow 2.0 or above."

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)

gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

#### Import the dataset (this demonstrates how the genres have been cleaned)

In [None]:
# Load the dataset pickled by the previous part of the project.
# The path components are passed to os.path.join one by one so that the
# separator is chosen by the OS (the previous hard-coded '\\' only worked on
# Windows).
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only load files
# that this project produced itself.
dataset = pd.read_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part', 'dataset_part_2_16022020.pkl'))

print("\nThe shape of the dataset that will be used in Keras classifier is: {}".format(dataset.shape))

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### Check the correlation between user ratings and IMDB ratings

In [None]:
# Normality check of the user ratings with scipy.stats.normaltest.
# Null hypothesis: the ratings come from a normal distribution.
alpha = 0.05
k2, p = stats.normaltest(dataset.rating)
print("p = {:g}".format(p))

verdict = "The null hypothesis can be rejected" if p < alpha else "The null hypothesis cannot be rejected"
print(verdict)

In [None]:
from scipy import stats

# The IMDB ratings are cast to float before testing them.
dataset['imdb_rating'] = dataset['imdb_rating'].astype(float)

# Normality check of the IMDB ratings with scipy.stats.normaltest.
# Null hypothesis: the IMDB ratings come from a normal distribution.
alpha = 0.05
k2, p = stats.normaltest(dataset.imdb_rating)
print("p = {:g}".format(p))

verdict = "The null hypothesis can be rejected" if p < alpha else "The null hypothesis cannot be rejected"
print(verdict)

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between the user ratings and the IMDB ratings.
# Only the coefficient is displayed; the p-value is kept in `pval`.
rho, pval = spearmanr(dataset['rating'], dataset['imdb_rating'])
rho

In [None]:
import seaborn as sns

# Scatter plot of user rating (x axis) against IMDB rating (y axis).
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, while the keyword form works on old and new versions.
sns.scatterplot(x='rating', y='imdb_rating', data=dataset)
plt.title('user rating vs IMDB rating', fontsize=18)
plt.ylabel('IMDB rating', fontsize=16)
plt.xlabel('user rating', fontsize=16)

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### Understand the dependent variable: Genres of each movie

Check their frequency distribution

In [None]:
dataset['genres'].explode().value_counts()

In [None]:
round(dataset['genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
# STEP 1: Remove genres less than 1% frequency
# The genres listed below are dropped from every movie's genre list; the
# remaining genres are stored in the new column "reduced_genres".
rare_genres = ['IMAX', 'Sport', 'Adult', 'News', 'Reality-TV',
               'Film-Noir', 'Short', 'Family', 'Biography', 'Music', 'History']


def _keep_frequent_genres(genre_list):
    """Return the genres of one movie with the rare genres filtered out."""
    return [genre for genre in genre_list if genre not in rare_genres]


dataset['reduced_genres'] = dataset['genres'].apply(_keep_frequent_genres)

In [None]:
dataset['reduced_genres'].shape

In [None]:
# STEP 2: Find indexes with EMPTY LISTS
# A movie whose genres were all removed in STEP 1 is left with an empty
# "reduced_genres" list; collect those rows and their index labels.
has_no_genres = dataset['reduced_genres'].apply(lambda genre_list: genre_list == [])

dataset_empty_lists = dataset.loc[has_no_genres]

remove_indices = dataset_empty_lists.index.to_list()

dataset_empty_lists

In [None]:
# STEP 3: Remove the indexes with EMPTY LISTS
# Keep every row whose index label is not among the labels collected in STEP 2.
rows_to_keep = ~dataset.index.isin(remove_indices)

dataset_frequent_genres = dataset.loc[rows_to_keep]

dataset_frequent_genres.shape

In [None]:
dataset_frequent_genres = dataset_frequent_genres.reset_index(drop=True)

In [None]:
"""
Having cut the most scarse occurences of genres it is still obvious that genres "Drama" & "Comedy" belong to 40% of the movies.
A good approach is either to up-sample the dataset or down-sample it.
What we chose was to down-sample the two dominant genres "Drama" & "Comedy". However, in the sub-part 3.2 
we use the imbalanced dataset to train and test the keras text classification models.
"""
round(dataset_frequent_genres['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
"""
The dataset below contains 17 out of 27 genres. The 11 genres cut were not frequent enough compared to the rest of the genres.
"""
# NOTE(review): 27 - 11 = 16, which does not match the "17" quoted above - confirm the counts.
# The pickle is written into the "pickled_data_per_part" folder because the
# next section reads it back from there; writing it to the working directory
# (as before) made a fresh Restart & Run All load a stale or missing file.
dataset_frequent_genres.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part', 'dataset_part_2_cleaned_of_redundant_genres_16022020.pkl'))

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### #1 Import cleaned of redundant genres dataset and genres_list

In [None]:
# import dataset
# Path components are passed separately so os.path.join picks the separator
# of the current OS (the previous hard-coded '\\' only worked on Windows).
dataset_frequent_genres = pd.read_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part', 'dataset_part_2_cleaned_of_redundant_genres_16022020.pkl'))

print("\nThe shape of the dataset that will be used in Keras classifier is: {}".format(dataset_frequent_genres.shape))
# Comment: From now on, "reduced_genres" column will be used for model classification and predictions.

In [None]:
"""
Multi-hot encoding is a good practice to transform the value y into a data structure appropriate for multi-label text calssification.
"""
# Multi-hot encoding, since a movie can have more than one genre assigned.
# One indicator column per genre class is appended to the dataset; the new
# columns are named after mlb.classes_ and share the dataset's index.
mlb = MultiLabelBinarizer()

genre_indicator_matrix = mlb.fit_transform(dataset_frequent_genres['reduced_genres'])

genre_indicator_frame = pd.DataFrame(genre_indicator_matrix,
                                     columns=mlb.classes_,
                                     index=dataset_frequent_genres.index)

dataset_frequent_genres = dataset_frequent_genres.join(genre_indicator_frame)

In [None]:
# import genres
# Path components are passed separately so os.path.join picks the separator
# of the current OS (the previous hard-coded '\\' only worked on Windows).
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this project produced itself.
with open(os.path.join(os.getcwd(), "pickled_data_per_part", "genres_list_06032020.pkl"), 'rb') as handle:
    genres_list = pickle.load(handle)
genres_list

#### #2 Prune the movie reviews (keep only the first review for each movie)

In [None]:
dataset_frequent_genres['reviews_length'] = dataset_frequent_genres.reviews.apply(lambda x: len(x))

In [None]:
len(dataset_frequent_genres['reviews_length'][dataset_frequent_genres['reviews_length']==1])

# Since I don't want to lose 3326 movies, I will keep only the first review for each movie.

In [None]:
dataset_frequent_genres.loc[:, 'reviews_pruned'] = dataset_frequent_genres.reviews.apply(lambda x: x[0])

In [None]:
"""
We observed that a plain text of a reviews as such of a plot summary, contain a lot of stop-words, punctuations and "noisy" words
that could spoil the results of a text classification model.
"""
print("Raw text of a movie review:", dataset_frequent_genres.reviews_pruned.iloc[7])
print('\n')
print("Raw text of a plot summary: ", dataset_frequent_genres['plot'].iloc[7])

In [None]:
# table = str.maketrans(dict.fromkeys(string.punctuation))
# dataset_frequent_genres.loc[:, 'reviews_pruned'] = dataset_frequent_genres.loc[:, 'reviews_pruned'].apply(lambda x: x.translate(table))

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### #3 Unify (join) the columns of Actors and Reviews in order to achieve a dataframe cell with a unique TEXT (corpus) and not a LIST of texts

In [None]:
# Function 1: Actors
def unify_actors(row):
    """Join the entries of ``row['actors']`` into one comma-separated string.

    No space is added after the commas; leading and trailing whitespace of
    the joined string is stripped.
    """
    actor_names = row['actors']
    joined_names = ','.join(actor_names)
    return joined_names.strip()

# Function 2: Reviews
def unify_reviews(row):
    """Join the entries of ``row['reviews']`` into one string, separated by ', '."""
    review_texts = row['reviews']
    separator = ', '
    return separator.join(review_texts)

In [None]:
dataset_frequent_genres['actors_unified'] = dataset_frequent_genres.apply(unify_actors, axis=1)
dataset_frequent_genres['reviews_unified'] = dataset_frequent_genres.apply(unify_reviews, axis=1)

print("Actors before: {}".format(dataset_frequent_genres.actors.iloc[0]))
print("Actors after: {}\n".format(dataset_frequent_genres.actors_unified.iloc[0]))

print("Reviews before: {}".format(dataset_frequent_genres.reviews.iloc[0]))
print("Reviews after: {}".format(dataset_frequent_genres.reviews_unified.iloc[0]))

#### #4 Functions

In [None]:
"""
Functions used across the whole notebook.
Those functions are explisetely used to pre-process the raw data input of texts
"""

# Function 1

def inference_function(indx, model, x_test_seq, x_test, genres_list):
    """Print the three highest-scoring genres predicted for one test movie.

    Parameters
    ----------
    indx : int
        Position of the movie; used to slice ``x_test_seq`` and to read
        ``x_test`` with ``.iloc``.
    model : object
        Anything exposing ``predict``; it is called with the one-row slice
        ``x_test_seq[indx:indx+1]`` and the first row of its result is used.
    x_test_seq : sliceable
        Model inputs, one row per movie.
    x_test : pandas.DataFrame
        Must contain the columns 'Movie Title' and 'Genres'.
    genres_list : sequence
        Genre names; position ``i`` is taken as the label of model output
        ``i`` (assumed to match the training label order - TODO confirm).

    Returns
    -------
    None
        The report is printed, not returned.

    The predicted labels and their probabilities are both listed from the
    most to the least probable, so the k-th label belongs to the k-th
    probability. (Previously the labels followed the order of ``genres_list``
    while the probabilities were ascending, so the two did not line up.)
    """
    test_sequence = x_test_seq[indx:indx+1]

    # Scores of the single movie in the batch, one per genre.
    scores = np.asarray(model.predict(test_sequence)[0])

    # Positions of the three highest scores, best first.
    top_indexes = np.argsort(scores)[::-1][:3]

    predicted_tags = [genres_list[i] for i in top_indexes]

    tag_probabilities = scores[top_indexes]

    print('\n\nMovie Title: {}'.format(x_test['Movie Title'].iloc[indx]), '\n\nPredicted Genre labels: {}'.format(predicted_tags), '\n\nWith predicted probabilities: {}'.format(tag_probabilities), '\n\nThe actual Genre labels: {}'.format(x_test['Genres'].iloc[indx]), "\n\n", "---------------------------------------------------------------------------------------------------------")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 2

# version 2.1 (this version was used until 21.02.2020)
# def preprocess_text(text):
    
#     stop_words = set(stopwords.words('english'))
    
#     lemmatizer = WordNetLemmatizer()
    
#     no_stopword_text = [word for word in text.split(' ') if not word in stop_words]
    
#     lemmatized_text = [lemmatizer.lemmatize(word, pos='v') for word in no_stopword_text]
    
#     lowercase_text = [word.lower() for word in lemmatized_text]
    
#     return ' '.join(lowercase_text)

# version 2.2 (this version is an alternative approach of version 2.1, created on 22.02.2020)

def preprocess_text(raw_text):
    """Clean one raw text and return it as a single lowercase string.

    Steps, in order:
      1. split the text on single spaces, remove punctuation characters from
         every piece and keep only the pieces that are purely alphabetic;
      2. drop stop words (sklearn's ENGLISH_STOP_WORDS plus "book"), compared
         case-insensitively;
      3. POS-tag the remaining words and lemmatize each one with WordNet,
         using the word's own part of speech where WordNet has one;
      4. lowercase every lemma and join them with single spaces.

    Parameters
    ----------
    raw_text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))

    stripped = [re_punc.sub('', w) for w in raw_text.split(' ')]

    stripped = [token for token in stripped if token.isalpha()]

    #------------------------------------------------

    stop_words = text.ENGLISH_STOP_WORDS.union(["book"])

    # The words are joined again because pos_tag/word_tokenize below work on a text.
    no_stopword_text = ' '.join(word for word in stripped if word.lower() not in stop_words)

    #------------------------------------------------

    lemmatizer = WordNetLemmatizer()

    # pos_tag returns Penn Treebank tags, whereas the lemmatizer expects a
    # WordNet POS code. The first letter of the tag is enough to map them:
    # J* (adjectives) -> 'a', N* (nouns) -> 'n', V* (verbs) -> 'v',
    # R* (adverbs) -> 'r'.
    # The previous check looked for tags starting with 'a', which no Penn
    # Treebank tag does, so adjectives were always lemmatized as nouns.
    penn_to_wordnet = {'J': 'a', 'N': 'n', 'V': 'v', 'R': 'r'}

    lemmatized_text = []
    for token, tag in pos_tag(word_tokenize(no_stopword_text)):
        wordnet_pos = penn_to_wordnet.get(tag[:1])
        if wordnet_pos is None:
            # Any other tag: fall back to the lemmatizer's default POS.
            lemmatized_text.append(lemmatizer.lemmatize(token))
        else:
            lemmatized_text.append(lemmatizer.lemmatize(token, wordnet_pos))

    #------------------------------------------------

    lowercase_text = [word.lower() for word in lemmatized_text]

    return ' '.join(lowercase_text)

def transform_actors(column_name, dataset):
    """Store a lowercased copy of ``dataset[column_name]`` in 'clean_actors'.

    The column is added to ``dataset`` in place; nothing is returned.
    Intended for the "actors_unified" column.
    """
    lowered_actors = dataset.loc[:, column_name].apply(lambda value: value.lower())
    dataset.loc[:, 'clean_actors'] = lowered_actors

def transform_plot(column_name, dataset):
    """Run ``preprocess_text`` on ``dataset[column_name]`` and store the result in 'clean_plot_summary'.

    The column is added to ``dataset`` in place; nothing is returned.
    """
    cleaned_plots = dataset.loc[:, column_name].apply(preprocess_text)
    dataset.loc[:, 'clean_plot_summary'] = cleaned_plots

def transform_features(column_name, dataset):
    """Run ``preprocess_text`` on ``dataset[column_name]`` and store the result in 'clean_combined_features'.

    The column is added to ``dataset`` in place; nothing is returned.
    """
    cleaned_features = dataset.loc[:, column_name].apply(preprocess_text)
    dataset.loc[:, 'clean_combined_features'] = cleaned_features
    
def transform_reviews(column_name, dataset):
    """Run ``preprocess_text`` on ``dataset[column_name]`` and store the result in 'clean_reviews'.

    The column is added to ``dataset`` in place; nothing is returned.
    """
    cleaned_reviews = dataset.loc[:, column_name].apply(preprocess_text)
    dataset.loc[:, 'clean_reviews'] = cleaned_reviews

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

#Function 3.1

def split_dataset(method, labels, dataset, split_ratio):
    """Shuffle and split the cleaned dataset and its labels into train and test parts.

    Parameters
    ----------
    method : str
        "stratified" passes the labels as the ``stratify`` argument of
        ``train_test_split``; any other value gives a plain random split.
        (The stratified option did not work when it was tested, which is why
        a separate function based on StratifiedShuffleSplit was written.)
    labels : array-like
        Targets, one row per row of ``dataset``.
    dataset : pandas.DataFrame
        Must contain the title, the four "clean_*" columns and "reduced_genres".
    split_ratio : float or int
        Forwarded as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the split uses a fixed
        ``random_state`` of 552.
    """
    # "reduced_genres" is used here, NOT the original "genres" column.
    feature_columns = ['title', 'clean_actors', 'clean_plot_summary',
                       'clean_combined_features', 'clean_reviews', 'reduced_genres']
    X = dataset[feature_columns]
    y = labels

    # stratify=None is train_test_split's default, i.e. no stratification.
    stratify_on = y if method == "stratified" else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=split_ratio,
        random_state=552,
        shuffle=True,
        stratify=stratify_on,
    )

    return X_train, X_test, y_train, y_test

#Function 3.2

def stratify_split_train_test(number_of_splits, split_ratio, dataset, labels):
    """Split the dataset into a train+validation part and a test part with StratifiedShuffleSplit.

    parameters: number_of_splits: Number of re-shuffling & splitting iterations. (based on module documentation)
                split_ratio: If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
                             If int, represents the absolute number of test samples.
                dataset: The dataset with the transformed ("cleaned") inputs, title and genres.
                labels: The genre tags for each movie.

    output: The X_train_val, X_test, y_train_val, y_test splitted in stratified manner.
            When number_of_splits is greater than 1 every split is printed, but
            only the last one is returned. All four values are None when the
            splitter yields no split.
    """

    def _take_rows(data, positions):
        # The splitter yields positional row numbers. pandas objects must be
        # read with .iloc (plain [] on a DataFrame selects columns by label);
        # anything else (e.g. a NumPy array) is indexed directly.
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return data[positions]

    test_sss = StratifiedShuffleSplit(n_splits=number_of_splits,
                                      test_size=split_ratio,
                                      random_state=123)

    X = dataset[['title', 'clean_actors', 'clean_plot_summary', 'clean_combined_features', 'clean_reviews', 'reduced_genres']]

    y = labels

    # splitting in train-val and test

    X_train_val, X_test, y_train_val, y_test = None, None, None, None

    # getting the indexes for each dataset
    for train_index, test_index in test_sss.split(X, y):

        print("TRAIN-VAL data (indexes selected):", train_index[:10], "TEST data (indexes selected):", test_index[:10])

        X_train_val, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train_val, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

    return X_train_val, X_test, y_train_val, y_test

#Function 3.3

def stratify_split_train_validation(number_of_splits, split_ratio, X_train_val, y_train_val):
    """Split the train+validation part into a train part and a validation part with StratifiedShuffleSplit.

    parameters: number_of_splits: Number of re-shuffling & splitting iterations. (based on module documentation)
                split_ratio: If float, should be between 0.0 and 1.0 and represent the proportion of the data to include in the validation split.
                             If int, represents the absolute number of validation samples.
                X_train_val: The inputs of the train+validation part (a pandas object; its index is reset here).
                y_train_val: The labels of the train+validation part (a pandas object; its index is reset here).

    output: The X_train, X_val, y_train, y_val splitted in stratified manner.
            When number_of_splits is greater than 1 every split is printed, but
            only the last one is returned. All four values are None when the
            splitter yields no split.
    """

    val_sss  = StratifiedShuffleSplit(n_splits=number_of_splits,
                                      test_size=split_ratio,
                                      random_state=123)

    # We reset the indexes for both the X-train-val and y-train-val in order to break them again into two subsets.
    X_train_val = X_train_val.reset_index(drop=True)
    y_train_val = y_train_val.reset_index(drop=True)

    # splitting in train and validation

    X_train, X_val, y_train, y_val = None, None, None, None

    # getting the indexes for each dataset
    for train_index, val_index in val_sss.split(X_train_val, y_train_val):

        print("TRAIN data (indexes selected):", train_index[:10], "VALIDATION data (indexes selected):", val_index[:10])

        # The splitter yields positional row numbers, so the rows are read
        # with .iloc; plain [] on a DataFrame would look the numbers up as
        # column labels and fail.
        X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
        y_train, y_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    return X_train, X_val, y_train, y_val

#Function 3.4

def iterative_split_dataset(dataset, labels, split_ratio):
    """Split the cleaned dataset and its labels with skmultilearn's iterative stratification.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the title, the four "clean_*" columns and "reduced_genres".
        These columns are handed to the splitter as a plain array (``.values``).
    labels : array-like
        Targets, one row per row of ``dataset``.
    split_ratio : float
        Forwarded as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """

    # "reduced_genres" is used here, NOT the original "genres" column.
    X = dataset.loc[:, ['title', 'clean_actors', 'clean_plot_summary', 'clean_combined_features', 'clean_reviews', 'reduced_genres']].values
    y = labels

    # iterative_train_test_split returns its results in the order
    # (X_train, y_train, X_test, y_test) - unlike sklearn's train_test_split.
    # Unpacking them in the sklearn order (as was done before) silently
    # swapped X_test and y_train.
    X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=split_ratio)

    return X_train, X_test, y_train, y_test

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Function 4.1

def keras_tokenization(variable, maximum_words, x_train, x_test):
    """Fit a Keras Tokenizer on one cleaned training text column and encode train and test with it.

    Parameters
    ----------
    variable : str
        Which input to tokenize: "actors", "plot", "features" or "reviews".
    maximum_words : int
        Passed to the Tokenizer as ``num_words``; it is also the highest word
        index kept in ``word_index``.
    x_train, x_test : pandas.DataFrame
        Must contain the cleaned text column of ``variable``. The integer
        sequences are written into a new "*_seqs" column of both frames
        (in place). The tokenizer is fitted on ``x_train`` only.

    Returns
    -------
    tuple
        ``(vocabulary_size_frequent_words, tokenizer)``, where the size is
        ``len(tokenizer.word_index) + 1``.

    Raises
    ------
    ValueError
        If ``variable`` is not one of the four supported names (previously
        this ended in an UnboundLocalError at the return statement).
    """
    # variable -> (cleaned text column, sequence column to create, Tokenizer split string)
    settings = {
        "actors": ('clean_actors', 'actors_seqs', ', '),
        "plot": ('clean_plot_summary', 'plot_summary_seqs', ' '),
        "features": ('clean_combined_features', 'combined_features_seqs', ' '),
        "reviews": ('clean_reviews', 'reviews_seqs', ' '),
    }

    if variable not in settings:
        raise ValueError("Unknown variable {!r}; expected one of {}".format(variable, sorted(settings)))

    text_column, seq_column, split_on = settings[variable]

    tokenizer = Tokenizer(num_words=maximum_words, lower=True, split=split_on, oov_token='<OOV>')

    tokenizer.fit_on_texts(list(x_train.loc[:, text_column]))

    # Keep only the entries whose index does not exceed maximum_words ...
    tokenizer.word_index = {e: i for e, i in tokenizer.word_index.items() if i <= maximum_words}

    # ... and give the out-of-vocabulary token the index right after them.
    # NOTE(review): the OOV index is maximum_words + 1 while the returned size
    # is len(word_index) + 1; if the size is used as an Embedding input_dim,
    # confirm that the OOV index is still in range.
    tokenizer.word_index[tokenizer.oov_token] = maximum_words + 1

    x_train.loc[:, seq_column] = tokenizer.texts_to_sequences(x_train.loc[:, text_column])

    x_test.loc[:, seq_column] = tokenizer.texts_to_sequences(x_test.loc[:, text_column])

    vocabulary_size_frequent_words = len(tokenizer.word_index) + 1

    return vocabulary_size_frequent_words, tokenizer

# Function 4.2: For Kfold cross validation
def keras_tokenization_cv(variable, maximum_words, x_data):
    """Fit a Keras Tokenizer on one cleaned text column of ``x_data`` and encode that same column.

    Variant for K-fold cross validation: there is a single frame instead of
    separate train and test frames.

    Parameters
    ----------
    variable : str
        Which input to tokenize: "actors", "plot", "features" or "reviews".
    maximum_words : int
        Passed to the Tokenizer as ``num_words``; it is also the highest word
        index kept in ``word_index``.
    x_data : pandas.DataFrame
        Must contain the cleaned text column of ``variable``. The integer
        sequences are written into a new "*_seqs" column (in place).

    Returns
    -------
    tuple
        ``(vocabulary_size_frequent_words, tokenizer)``, where the size is
        ``len(tokenizer.word_index) + 1``.

    Raises
    ------
    ValueError
        If ``variable`` is not one of the four supported names (previously
        this ended in an UnboundLocalError at the return statement).
    """
    # variable -> (cleaned text column, sequence column to create, Tokenizer split string)
    # NOTE(review): keras_tokenization splits the actors on ', ' whereas this
    # function splits them on ','; confirm which one is intended.
    settings = {
        "actors": ('clean_actors', 'actors_seqs', ','),
        "plot": ('clean_plot_summary', 'plot_summary_seqs', ' '),
        "features": ('clean_combined_features', 'combined_features_seqs', ' '),
        "reviews": ('clean_reviews', 'reviews_seqs', ' '),
    }

    if variable not in settings:
        raise ValueError("Unknown variable {!r}; expected one of {}".format(variable, sorted(settings)))

    text_column, seq_column, split_on = settings[variable]

    tokenizer = Tokenizer(num_words=maximum_words, lower=True, split=split_on, oov_token='<OOV>')

    tokenizer.fit_on_texts(list(x_data.loc[:, text_column]))

    # Keep only the entries whose index does not exceed maximum_words ...
    tokenizer.word_index = {e: i for e, i in tokenizer.word_index.items() if i <= maximum_words}

    # ... and give the out-of-vocabulary token the index right after them.
    # NOTE(review): the OOV index is maximum_words + 1 while the returned size
    # is len(word_index) + 1; if the size is used as an Embedding input_dim,
    # confirm that the OOV index is still in range.
    tokenizer.word_index[tokenizer.oov_token] = maximum_words + 1

    x_data.loc[:, seq_column] = tokenizer.texts_to_sequences(x_data.loc[:, text_column])

    vocabulary_size_frequent_words = len(tokenizer.word_index) + 1

    return vocabulary_size_frequent_words, tokenizer

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 5.1

def padding_sequnce_length(variable, x_train):
    """Return the padding length for one input: the 90th percentile of its training sequence lengths.

    Parameters
    ----------
    variable : str
        "actors", "plot", "features" or "reviews".
    x_train : pandas.DataFrame
        Must contain the "*_seqs" column of ``variable``; ``len`` is applied
        to every cell of that column.

    Returns
    -------
    int
        The 90th percentile of the lengths, truncated to an integer. The
        value is also printed.

    Raises
    ------
    ValueError
        If ``variable`` is not one of the four supported names (previously
        this ended in an UnboundLocalError at the return statement).
    """
    # variable -> (sequence column, message printed with the resulting length)
    settings = {
        "actors": ('actors_seqs', 'Max Length of the pad sequence for Actors: {}\n'),
        "plot": ('plot_summary_seqs', 'Max Length of each padding sequence for Plot summary text: {}\n'),
        "features": ('combined_features_seqs', 'Max Length of each padding sequence for Movie features text: {}\n'),
        "reviews": ('reviews_seqs', 'Max Length of each padding sequence for Reviews text: {}\n'),
    }

    if variable not in settings:
        raise ValueError("Unknown variable {!r}; expected one of {}".format(variable, sorted(settings)))

    seq_column, message = settings[variable]

    all_train_lengths = list(getattr(x_train, seq_column).apply(len))

    # The 90th percentile is used rather than the maximum length.
    maxlen = int(np.percentile(all_train_lengths, q=90))

    print(message.format(maxlen))

    return maxlen

# Function 5.2

def padding_sequnce_length_cv(variable, x_data):
    """Return the padding length for one input: the 90th percentile of its sequence lengths.

    Cross-validation variant: the lengths are measured on the single frame
    ``x_data``.

    Parameters
    ----------
    variable : str
        "actors", "plot", "features" or "reviews".
    x_data : pandas.DataFrame
        Must contain the "*_seqs" column of ``variable``; ``len`` is applied
        to every cell of that column.

    Returns
    -------
    int
        The 90th percentile of the lengths, truncated to an integer. The
        value is also printed.

    Raises
    ------
    ValueError
        If ``variable`` is not one of the four supported names (previously
        this ended in an UnboundLocalError at the return statement).
    """
    # variable -> (sequence column, message printed with the resulting length)
    settings = {
        "actors": ('actors_seqs', 'Max Length of the pad sequence for Actors: {}\n'),
        "plot": ('plot_summary_seqs', 'Max Length of each padding sequence for Plot summary text: {}\n'),
        "features": ('combined_features_seqs', 'Max Length of each padding sequence for Movie features text: {}\n'),
        "reviews": ('reviews_seqs', 'Max Length of each padding sequence for Reviews text: {}\n'),
    }

    if variable not in settings:
        raise ValueError("Unknown variable {!r}; expected one of {}".format(variable, sorted(settings)))

    seq_column, message = settings[variable]

    all_train_lengths = list(getattr(x_data, seq_column).apply(len))

    # The 90th percentile is used rather than the maximum length.
    maxlen = int(np.percentile(all_train_lengths, q=90))

    print(message.format(maxlen))

    return maxlen

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 6.1

# the input data for a deep learning model must be a single tensor (of shape e.g. (batch_size, 6, vocab_size) in this case), 
# samples that are shorter than the longest item need to be padded with some placeholder value.

#url https://www.tensorflow.org/guide/keras/masking_and_padding
def padding_sequence(variable, x_train, x_test, y_train, y_test, maxlen):
    """Pad the integer sequences of one input to a common length for train and test.

    Parameters
    ----------
    variable : str
        "actors", "plot", "features" or "reviews".
    x_train, x_test : pandas.DataFrame
        Must contain the "*_seqs" column of ``variable``.
    y_train, y_test : sized
        Labels; only their lengths are used, to check that every padded
        input row has a label.
    maxlen : int
        Target length of every sequence.

    Returns
    -------
    tuple
        ``(x_train_seq, x_test_seq)`` as returned by ``pad_sequences``.

    Raises
    ------
    ValueError
        If ``variable`` is not one of the four supported names (previously
        this ended in an UnboundLocalError at the return statement).
    AssertionError
        If the number of padded rows differs from the number of labels.
    """
    seq_columns = {
        "actors": 'actors_seqs',
        "plot": 'plot_summary_seqs',
        "features": 'combined_features_seqs',
        "reviews": 'reviews_seqs',
    }

    if variable not in seq_columns:
        raise ValueError("Unknown variable {!r}; expected one of {}".format(variable, sorted(seq_columns)))

    seq_column = seq_columns[variable]

    # Padding values are appended at the end of short sequences ('post').
    # NOTE(review): the truncation mode is not set, so the Keras default
    # applies (documented as 'pre', i.e. long sequences lose their start) -
    # confirm that this is intended.
    x_train_seq = pad_sequences(x_train.loc[:, seq_column], padding='post', maxlen=maxlen)

    x_test_seq = pad_sequences(x_test.loc[:, seq_column], padding='post', maxlen=maxlen)

    assert len(x_train_seq) == len(y_train)

    assert len(x_test_seq) == len(y_test)

    return x_train_seq, x_test_seq

# Function 6.2

def padding_sequence_cv(variable, x_data, y_data, maxlen):
    """Pad the tokenized sequences of one input column to a fixed length.

    Single-dataset variant of the padding helper (the ``_cv`` suffix suggests
    it is meant for cross validation, where there is no train/test pair).

    Parameters
    ----------
    variable : str
        Input to pad. "actors", "plot", "features" and "reviews" select the
        columns 'actors_seqs', 'plot_summary_seqs', 'combined_features_seqs'
        and 'reviews_seqs' of ``x_data`` respectively.
    x_data : pandas.DataFrame
        Frame that contains the selected sequence column.
    y_data : sized
        Targets that belong to ``x_data``; only ``len(y_data)`` is used, as a
        sanity check on the number of padded rows.
    maxlen : int
        Forwarded to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The value returned by ``pad_sequences(column, padding='post', maxlen=maxlen)``.

    Raises
    ------
    ValueError
        If ``variable`` is not one of the four supported names. (The original
        code fell through every branch and failed with an UnboundLocalError
        at the return statement.)
    AssertionError
        If the number of padded rows differs from ``len(y_data)``.
    """
    # One lookup table instead of four copies of the same three lines.
    sequence_columns = {
        "actors": "actors_seqs",
        "plot": "plot_summary_seqs",
        "features": "combined_features_seqs",
        "reviews": "reviews_seqs",
    }

    if variable not in sequence_columns:
        raise ValueError(
            "Unknown variable {!r}; expected one of {}".format(variable, sorted(sequence_columns))
        )

    # Note: a tf.data `shuffle(...).padded_batch(...)` alternative was once sketched here
    # ("Task 1: Discussed with Mr. Louridas") but never enabled; pad_sequences is what runs.
    x_data_seq = pad_sequences(x_data.loc[:, sequence_columns[variable]], padding='post', maxlen=maxlen)

    # Every input row must still have exactly one target row after padding.
    assert len(x_data_seq) == len(y_data)

    return x_data_seq

In [None]:
"""
Previously we experienced an error using the stratified sampling. Below we print the number of genre sequences that are assigned to only one movie.
For those 131 movies the stratified sampling is failing to complete.
Thus, we should find their indexes and remove them. The final dataset should contain 49123-131=48992
"""
# Count how many rows carry each genre sequence once, then keep the sequences seen on a single row.
genre_sequence_counts = dataset_frequent_genres.reduced_genres.value_counts()
single_movie_sequences = genre_sequence_counts[genre_sequence_counts == 1]

print("Number of movies that are assigned to only 1 sequence of genres: ", len(single_movie_sequences), '\n')
# Despite its name, this list holds genre sequences (the index of the counts), not movies.
list_of_movies_to_remove = single_movie_sequences.index.tolist()
print("The sequences of genres assigned to only 1 movie: ", list_of_movies_to_remove)

In [None]:
"""
Below are the indexes of rows that should be removed from the dataset. In total 131 indexes.
With those final 48992 rows of the dataset, the stratified sampling will be successfully completed.
"""
# Flag the rows whose genre sequence is one of the single-movie sequences.
# The flag is computed once (the original evaluated the same row-wise map twice) and uses a
# set, so each row costs one hash lookup instead of a scan over the whole list. The sequences
# are hashable because they come from the index of a value_counts() result.
sequences_to_remove = set(list_of_movies_to_remove)
is_single_movie_sequence = dataset_frequent_genres['reduced_genres'].map(lambda genres: genres in sequences_to_remove)
indexes_to_remove = is_single_movie_sequence[is_single_movie_sequence].index.tolist()

# Drop the flagged rows and renumber the remaining ones from 0.
dataset_frequent_genres = dataset_frequent_genres[~dataset_frequent_genres.index.isin(indexes_to_remove)]
dataset_frequent_genres = dataset_frequent_genres.reset_index(drop=True)

In [None]:
%%time
"""
This code cell does not need to be executed again: the dataset was already pickled after the transformation functions had been applied to each
column that will later be used as model input.
"""
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Transform the columns:
# -> Actors
# -> Plot summary
# -> Movie Features
# -> Reviews
#
# Each transform_* helper is called with a column name and the dataset, and its return value
# is discarded. Presumably the helpers add the cleaned columns to dataset_frequent_genres
# in place; confirm against their definitions.

print("---------------------------------------------------------------------------------\n")
print("Transfrom the column of the actors\n")
transform_actors("actors_unified", dataset_frequent_genres) # function 3: transform_actors

print("Transfrom the column of the plot summary\n")
transform_plot("plot", dataset_frequent_genres) # function 3: transform_plot

print("Transfrom the column of the movie features\n")
transform_features("movie_features", dataset_frequent_genres) # function 3: transform_features

print("Transfrom the column of the movie reviews\n")
transform_reviews("reviews_pruned", dataset_frequent_genres) # function 3: transform_reviews

# The dataset that contains the transformed columns of actors, plot, features and reviews can be pickled.
# Serializing it saves the time of transforming the data on every run.
# Total time to transform the columns: 10 minutes

In [None]:
"""
This code cell was executed once, to serialize the dataset with the transformed "cleaned" columns of Actors, Plot, Features, Reviews
The cell will be executed to import the latest version of the dataset.
"""
# Build the path from separate components: a literal "\\" inside one component is only a
# directory separator on Windows, where this form yields the same path as before.
transformed_dataset_path = os.path.join(os.getcwd(), "pickled_data_per_part", "dataset_frequent_genres_transformed_inputs_25032020.pkl")

# Executed once to serialize the dataset with the transformed columns:
#dataset_frequent_genres.to_pickle(transformed_dataset_path)

# NOTE: unpickling can execute arbitrary code; only load pickle files produced by this project.
dataset_frequent_genres = pd.read_pickle(transformed_dataset_path)
dataset_frequent_genres.shape

In [None]:
"""
Before pre-processing: the raw actors text of the first movie (Toy Story)
"""
dataset_frequent_genres.actors.iloc[0]

In [None]:
"""
After pre-processing: the cleaned actors text of the first movie (Toy Story)
"""
dataset_frequent_genres.clean_actors.iloc[0]

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Option 1: Random shuffle split (train, test)

In [None]:
%%time
"""
This is the first way to split the dataset: the random shuffle split of the train_test_split function offered by the sklearn module.
This version was the first to be developed and followed; however, we decided to try a second, more robust option.
The second option refers to the data separation into train, validation and test sets using the StratifiedShuffleSplit function developed and maintained by the sklearn module.

In cases of imbalanced datasets, and specifically for classification models, stratification comes in handy because it ensures that the data is split uniformly and that both the train and test sets include all the categorical variables.
"""
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Split the dataset into train & test sets
# NOTE(review): the section heading and the note above describe a random shuffle split, while
# the call below passes "stratified" and the message announces a stratified split. Confirm
# which one is intended.
print("\n---------------------------------------------------------------------------------")
print("\nSplit the dataset into train & test sets (stratified shuffle split)\n")

# Columns 13:30 are passed separately from the full frame, presumably as the target (genre
# indicator) columns, and 0.2 is presumably the test fraction; verify against split_dataset.
X_train, X_test, y_train, y_test = split_dataset("stratified", dataset_frequent_genres.iloc[:, 13:30], dataset_frequent_genres, 0.2)

In [None]:
dataset_frequent_genres.columns

In [None]:
"""
The shape of the X_train, X_test, y_train, y_test splitted and shuffled randomly
"""
# Report the shape of every split, one line per split.
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print("{} shape:{}".format(split_name, split_part.shape))

In [None]:
"""
The stratification worked!
"""
round(X_train.reduced_genres.explode().value_counts(normalize=True)*100,3)

In [None]:
"""
The stratification worked!
"""
round(X_test.reduced_genres.explode().value_counts(normalize=True)*100,3)

In [None]:
"""
The below cell serialises the X_train, X_test, y_train, y_test inputs created by the stratified split.
"""
# X_train.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\X_train_all_inputs_25032020.pkl'))
# X_test.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\X_test_all_inputs_25032020.pkl'))
# y_train.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\y_train_all_inputs_25032020.pkl'))
# y_test.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\y_test_all_inputs_25032020.pkl'))

Option 2: Stratified shuffle split (train, validation, test)

In [None]:
# %%time
# """
# Stratified split of the dataset, using functions 3.2 and 3.3 built at the beginning of the notebook.

# Although promising, we experience the same error:

# "ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2."
# """
# # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# # Split the dataset into train, validation & test sets
# print("\n---------------------------------------------------------------------------------")
# print("\nSplit the dataset into train, validation & test sets\n")

# X_train_val, X_test, y_train_val, y_test = stratify_split_train_test(5, 0.2, dataset_frequent_genres, dataset_frequent_genres.iloc[:, 13:30].values)

Option 3: Stratified shuffle split (train, test) using the "iterative_train_test_split" method of skmultilearn module

In [None]:
# X_train, y_train, X_test, y_test = iterative_split_dataset(dataset_frequent_genres,
#                                                            dataset_frequent_genres.iloc[:, 13:30].values, 
#                                                            0.2)

In [None]:
# """
# The data split does not seem to be stratified.
# Thus, we ended up choosing the train_test_split approach with a random shuffle split. Although the data is split only into train and test sets, we will later use the validation_split parameter of tensorflow to hold out a proportion of the train dataset for validation.
# """
# from collections import Counter
# from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
# pd.DataFrame({
#     'train': Counter(str(combination) for row in get_combination_wise_output_matrix(y_train, order=2) for combination in row),
#     'test' : Counter(str(combination) for row in get_combination_wise_output_matrix(y_test, order=2) for combination in row)
# }).T.fillna(0.0)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Prune the most frequent genres (drama, comedy) - this is for balancing the data

#### Check the frequency of each genre tag in the TRAIN and TEST datasets and prune the highly frequent genres to re-balance both the train and test datasets

In [None]:
round(dataset_frequent_genres['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
# Re-balance the "Drama" genre: sample 85% of the rows tagged with it (fixed seed) and drop them.

has_drama = dataset_frequent_genres["reduced_genres"].apply(lambda genres: "Drama" in genres)
dataset_frequent_genres_drama = dataset_frequent_genres.loc[has_drama]
dataset_frequent_genres_drama_out = dataset_frequent_genres_drama.sample(frac=0.85, random_state=1)

# Keep every row whose index label was not sampled for removal.
remove_indexes = dataset_frequent_genres_drama_out.index
keep_after_drama = ~dataset_frequent_genres.index.isin(remove_indexes)
dataset_frequent_genres_updated_version1 = dataset_frequent_genres.loc[keep_after_drama]
print("Dataset with drama pruned shape: {}".format(dataset_frequent_genres_updated_version1.shape))

In [None]:
# Genre distribution (in %) after pruning Drama. The frame produced by the Drama pruning cell
# is dataset_frequent_genres_updated_version1; X_train_updated_version1 is not defined anywhere
# in this flow and would raise a NameError on a fresh kernel.
round(dataset_frequent_genres_updated_version1['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

In [None]:
# Re-balance the "Comedy" genre: sample 75% of the rows tagged with it (fixed seed) and drop
# them from the Drama-pruned frame.
# The original referenced dataset_frequent_genres_drama_updated_version1, a name that is never
# assigned; the Drama-pruned frame is dataset_frequent_genres_updated_version1.

has_comedy = dataset_frequent_genres_updated_version1["reduced_genres"].apply(lambda x: "Comedy" in x)
dataset_frequent_genres_comedy = dataset_frequent_genres_updated_version1[has_comedy]
dataset_frequent_genres_comedy_out = dataset_frequent_genres_comedy.sample(frac=.75, random_state=1)
remove_indexes = dataset_frequent_genres_comedy_out.index
dataset_frequent_genres_updated_version2 = dataset_frequent_genres_updated_version1[~dataset_frequent_genres_updated_version1.index.isin(remove_indexes)]

# The pruned dataset gets a fresh 0..n-1 index.
dataset_frequent_genres_pruned = dataset_frequent_genres_updated_version2.reset_index(drop=True)
print("Dataset with comedy pruned shape: {}".format(dataset_frequent_genres_pruned.shape))

In [None]:
round(dataset_frequent_genres_pruned['reduced_genres'].explode().value_counts(normalize=True) * 100,3)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [None]:
# Split the re-balanced dataset with the same argument list as the split of the non-pruned
# dataset earlier in the notebook: split type, columns 13:30, full frame, 0.2.
# The original call passed only the two frames, so the target columns landed in the
# split-type position.
# NOTE(review): confirm the argument order against the definition of split_dataset.
# This cell overwrites the X_train/X_test/y_train/y_test of the non-balanced split.
X_train, X_test, y_train, y_test = split_dataset("stratified", dataset_frequent_genres_pruned.iloc[:, 13:30], dataset_frequent_genres_pruned, 0.2)

In [None]:
# Shapes of the current X_train / X_test / y_train / y_test.
# (The original comment read "Before dropping the frequent genre tags (Drama & Comedy)", yet
# this cell sits right after the split of the pruned dataset.)

for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print("{} shape:{}".format(split_name, split_part.shape))

#### End of data re-balancing (This is an approach that may be followed or may not!)
#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [None]:
%%time
"""
Now that the data is split, we separate each column of interest into its own X_train and X_test frame.
Those train and test X sets will later be used for tokenization and padding.
"""
# Separate each different input column (actors, plot, features, reviews).
# Every per-input frame keeps "title" and "reduced_genres" next to its text column, since both
# are used later for making inference with the predictions.
actors_columns = ["title", "clean_actors", "reduced_genres"]
plot_columns = ["title", "clean_plot_summary", "reduced_genres"]
features_columns = ["title", "clean_combined_features", "reduced_genres"]
reviews_columns = ["title", "clean_reviews", "reduced_genres"]

X_train_actors = X_train[actors_columns]
X_train_plot = X_train[plot_columns]
X_train_features = X_train[features_columns]
X_train_reviews = X_train[reviews_columns]

# All four train frames must have the same shape.
assert len({X_train_actors.shape, X_train_plot.shape, X_train_features.shape, X_train_reviews.shape}) == 1

X_test_actors = X_test[actors_columns]
X_test_plot = X_test[plot_columns]
X_test_features = X_test[features_columns]
X_test_reviews = X_test[reviews_columns]

# Same check for the four test frames.
assert len({X_test_actors.shape, X_test_plot.shape, X_test_features.shape, X_test_reviews.shape}) == 1

In [None]:
# %%time
# """
# Prepare X and y for KFold cross validation of phase 3.2
# """
# # Separate each different input column (actors, plot, features, reviews)
# X=dataset_frequent_genres[['title', 'clean_actors', 'clean_plot_summary', 'clean_combined_features', 'clean_reviews', 'reduced_genres']]
# y=dataset_frequent_genres.iloc[:, 13:30].values

# X_actors = X[["title", "clean_actors", "reduced_genres"]]
# X_plot = X[["title", "clean_plot_summary", "reduced_genres"]]
# X_features = X[["title", "clean_combined_features", "reduced_genres"]]
# X_reviews = X[["title", "clean_reviews", "reduced_genres"]]
# # In X_train and X_test I also use columns "title" and "genres" since they will be both used later for making inference with predictions

# assert X_actors.shape==X_plot.shape==X_features.shape==X_reviews.shape

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Token Frequency to determine best value for MAX_FREQUENCY_WORDS used later in word tokenization

In [None]:
%%time
"""
Find the most frequent words among the actor names
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
MAX_FEATURES=12526 (non-balanced) old
MAX_FEATURES=16333 (non-balanced) 06.03.2020 (old)
MAX_FEATURES=35961 09.03.2020

MAX_FEATURES=11065 (balanced)

dataset_frequent_genres if no re-balnaced is implemented!
dataset_frequent_genres_pruned if the dataset is re-balanced!

Question: Shall I choose the tokens that are more frequent or infrequent?
-> Probably the infrequent tokens will make a better classification
"""
def actors_split(s):
    """Tokenizer for CountVectorizer: split a comma-separated string of actor names."""
    return s.split(',')

# One string of cleaned actor names per row of the dataset.
corpus_actors = dataset_frequent_genres['clean_actors'].values.tolist() # alternatives: dataset_frequent_genres, dataset_frequent_genres_pruned
# min_df=2 ignores every token that appears in fewer than two documents.
c_vectorizer = CountVectorizer(tokenizer=actors_split, min_df=2)

X = c_vectorizer.fit_transform(corpus_actors)
X_words = c_vectorizer.inverse_transform(X)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(c_vectorizer, "get_feature_names_out"):
    tokens_list = c_vectorizer.get_feature_names_out().tolist()
else:
    tokens_list = c_vectorizer.get_feature_names()
# Total number of occurrences of each token over the whole corpus.
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

# tokens_list is reused here instead of fetching the feature names a second time.
token_frequency_df = pd.DataFrame({'term': tokens_list, 'token_frequency': count_list})
token_frequency_df = token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_actors=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_actors.shape)

# Dictionary actor -> frequency, then a second dictionary with only the actors that occur at least 6 times
actors_frequency_dictionary = dict(zip(tokens_list, count_list))
d = dict((k, v) for k, v in actors_frequency_dictionary.items() if v >= 6)

In [None]:
# Hard-coded counts for the actors input (not recomputed in this cell).
actors_total = 262794
actors_repeated = 84250

print("The total number of actors that exist in the dataset is: {}".format(actors_total))
print("The total number of actors that are present in more than 2 movies: {}".format(actors_repeated))
print("Thus, the number of actors that have starred in only 1 movie is: {}".format(actors_total - actors_repeated))

# Vocabulary size later passed to keras_tokenization for the actors input.
actors_tokenized = 20000 #178544

In [None]:
%%time
"""
Find the most frequent words among the movie plots
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
MAX_FEATURES=13294 (non-balanced) old
MAX_FEATURES=10083 (non-balanced) 06.03.2020 (old)
MAX_FEATURES=37553

MAX_FEATURES=10909 (balanced)
"""
def plot_split(s):
    """Tokenizer for CountVectorizer: split a plot summary on single spaces."""
    return s.split(' ')

# One cleaned plot summary per row of the dataset.
corpus_plot = dataset_frequent_genres['clean_plot_summary'].values.tolist() # alternatives: dataset_frequent_genres, dataset_frequent_genres_pruned
# min_df=2 ignores every token that appears in fewer than two documents.
c_vectorizer = CountVectorizer(tokenizer=plot_split, min_df=2)

X = c_vectorizer.fit_transform(corpus_plot)
X_words = c_vectorizer.inverse_transform(X)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(c_vectorizer, "get_feature_names_out"):
    tokens_list = c_vectorizer.get_feature_names_out().tolist()
else:
    tokens_list = c_vectorizer.get_feature_names()
# Total number of occurrences of each token over the whole corpus.
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

# tokens_list is reused here instead of fetching the feature names a second time.
token_frequency_df = pd.DataFrame({'term': tokens_list, 'token_frequency': count_list})
token_frequency_df = token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_plot=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_plot.shape)

# Dictionary plot token -> frequency, then a second dictionary with only the tokens that occur at least 6 times
plot_frequency_dictionary = dict(zip(tokens_list, count_list))
d = dict((k, v) for k, v in plot_frequency_dictionary.items() if v >= 6)

In [None]:
# Hard-coded counts for the plot input (not recomputed in this cell).
plot_tokens_total = 51568
plot_tokens_repeated = 23325

print("The total number of plot tokens that exist in the dataset is: {}".format(plot_tokens_total))
print("The total number of plot tokens that are present in more than 2 movies: {}".format(plot_tokens_repeated))
print("Thus, the number of plot tokens that are present in only 1 movie is: {}".format(plot_tokens_total - plot_tokens_repeated))

# Vocabulary size later passed to keras_tokenization for the plot input.
plot_words_tokenized = 20000 #28243

In [None]:
%%time
"""
Find the most frequent words among the movie features
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
MAX_FEATURES=17665 (non-balanced) old
MAX_FEATURES=14439 (non-balanced) 06.03.2020 (old)
MAX_FEATURES=122703

MAX_FEATURES=14964 (balanced)
"""
def movie_features_split(s):
    """Tokenizer for CountVectorizer: split a combined-features string on single spaces."""
    return s.split(' ')

# One cleaned combined-features string per row of the dataset.
corpus_features = dataset_frequent_genres['clean_combined_features'].values.tolist() # alternatives: dataset_frequent_genres, dataset_frequent_genres_pruned
# min_df=2 ignores every token that appears in fewer than two documents.
c_vectorizer = CountVectorizer(tokenizer=movie_features_split, min_df=2)

X = c_vectorizer.fit_transform(corpus_features)
X_words = c_vectorizer.inverse_transform(X)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(c_vectorizer, "get_feature_names_out"):
    tokens_list = c_vectorizer.get_feature_names_out().tolist()
else:
    tokens_list = c_vectorizer.get_feature_names()
# Total number of occurrences of each token over the whole corpus.
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

# tokens_list is reused here instead of fetching the feature names a second time.
token_frequency_df = pd.DataFrame({'term': tokens_list, 'token_frequency': count_list})
token_frequency_df = token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_features=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_features.shape)

# Dictionary feature token -> frequency, then a second dictionary with only the tokens that occur at least 6 times
features_frequency_dictionary = dict(zip(tokens_list, count_list))
d = dict((k, v) for k, v in features_frequency_dictionary.items() if v >= 6)

In [None]:
# Hard-coded counts for the movie-features input (not recomputed in this cell).
feature_tokens_total = 186231
feature_tokens_repeated = 88366

print("The total number of feature tokens that exist in the dataset is: {}".format(feature_tokens_total))
print("The total number of feature tokens that are present in more than 2 movies: {}".format(feature_tokens_repeated))
print("Thus, the number of feature tokens that are present in only 1 movie is: {}".format(feature_tokens_total - feature_tokens_repeated))

# Vocabulary size later passed to keras_tokenization for the movie-features input.
features_words_tokenized = 20000 #97865
# Possible alternative: 2/3 of those 122703 words.

In [None]:
%%time
"""
Find the most frequent words among the movie reviews
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
MAX_FEATURES=16099 (non-balanced) old
MAX_FEATURES=15250 (non-balanced) 06.03.2020 (old)
MAX_FEATURES=194144

MAX_FEATURES=15404 (balanced)

"""
def reviews_split(s):
    """Tokenizer for CountVectorizer: split a reviews string on single spaces."""
    return s.split(' ')

# One cleaned reviews string per row of the dataset.
corpus_reviews = dataset_frequent_genres['clean_reviews'].values.tolist() # alternatives: dataset_frequent_genres, dataset_frequent_genres_pruned
# min_df=2 ignores every token that appears in fewer than two documents.
c_vectorizer = CountVectorizer(tokenizer=reviews_split, min_df=2)

X = c_vectorizer.fit_transform(corpus_reviews)
X_words = c_vectorizer.inverse_transform(X)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(c_vectorizer, "get_feature_names_out"):
    tokens_list = c_vectorizer.get_feature_names_out().tolist()
else:
    tokens_list = c_vectorizer.get_feature_names()
# Total number of occurrences of each token over the whole corpus.
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

# tokens_list is reused here instead of fetching the feature names a second time.
token_frequency_df = pd.DataFrame({'term': tokens_list, 'token_frequency': count_list})
token_frequency_df = token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_reviews=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_reviews.shape)

# Dictionary review token -> frequency, then a second dictionary with only the tokens that occur at least 6 times
reviews_frequency_dictionary = dict(zip(tokens_list, count_list))
d = dict((k, v) for k, v in reviews_frequency_dictionary.items() if v >= 6)

In [None]:
# Hard-coded counts for the reviews input (not recomputed in this cell).
print("The total number of review tokens that exist in the dataset is: {}".format(str(252788)))
print("The total number of review tokens that are present in more than 2 movies: {}".format(str(83317)))
# Message fixed: the original read "review tokens that have starred that are present in only
# 1 movie", a leftover from the actors cell.
print("Thus, the number of review tokens that are present in only 1 movie is: {}".format(str(252788-83317)))

# Vocabulary size later passed to keras_tokenization for the reviews input.
reviews_words_tokenized=20000 #169471

# Possible alternative: 2/3 of those 194144 words.

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [None]:
%%time
"""
Data tokenization is one of the most important parts when dealing with text data.
Since I am going to deploy keras models, I use the python api of Keras Tokenizer,
more details about its use on: https://keras.io/preprocessing/text/
"""
# (This block of code should be executed each time the split range changes (80-20 -> 50-50))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Tokenize the dataset (using the keras tokenizer)
print("\n---------------------------------------------------------------------------------")
print("\nTokenize the dataset (using the keras tokenizer)\n")

# Directory that receives the pickled tokenizers. The file name is joined as a separate path
# component below: a literal "\\" inside one component is only a directory separator on
# Windows, where this form yields the same paths as before.
tokenizer_dir = os.path.join(os.getcwd(), '80-20 split_non-balanced')

vocabulary_size_frequent_words_actors, tokenizer_actors = keras_tokenization("actors", actors_tokenized, X_train_actors, X_test_actors) # function 5: keras_tokenization
print("\nActors tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_actors))

# Pickle the Actors Tokenizer
with open(os.path.join(tokenizer_dir, 'actors_tokenizer_20000_25032020.pkl'), 'wb') as handle:
    pickle.dump(tokenizer_actors, handle, protocol=pickle.HIGHEST_PROTOCOL)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

vocabulary_size_frequent_words_plot, tokenizer_plot = keras_tokenization("plot", plot_words_tokenized, X_train_plot, X_test_plot) # function 5: keras_tokenization
print("\nPlot Summary tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_plot))

# Pickle the Plot Tokenizer
with open(os.path.join(tokenizer_dir, 'plot_tokenizer_20000_25032020.pkl'), 'wb') as handle:
    pickle.dump(tokenizer_plot, handle, protocol=pickle.HIGHEST_PROTOCOL)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

vocabulary_size_frequent_words_features, tokenizer_features = keras_tokenization("features", features_words_tokenized, X_train_features, X_test_features) # function 5: keras_tokenization
print("\nMovie Features tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_features))

# Pickle the Movie Features Tokenizer
with open(os.path.join(tokenizer_dir, 'features_tokenizer_20000_25032020.pkl'), 'wb') as handle:
    pickle.dump(tokenizer_features, handle, protocol=pickle.HIGHEST_PROTOCOL)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

vocabulary_size_frequent_words_reviews, tokenizer_reviews = keras_tokenization("reviews", reviews_words_tokenized, X_train_reviews, X_test_reviews) # function 5: keras_tokenization
print("\nMovie Reviews tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_reviews))

# Pickle the Reviews Tokenizer
with open(os.path.join(tokenizer_dir, 'reviews_tokenizer_20000_25032020.pkl'), 'wb') as handle:
    pickle.dump(tokenizer_reviews, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# %%time
# """
# Data tokenization is one of the most important parts when dealing with text data.
# Since I am going to deploy keras models, I use the python api of Keras Tokenizer
# more details about its use on: https://keras.io/preprocessing/text/

# Prepared for Kfold cross validation
# """
# #(This block of code should be executed each time the split range changes (80-20-> 50-50))
# # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# # Tokenize the dataset (using the keras tokenizer)
# print("\n---------------------------------------------------------------------------------")
# print("\nTokenize the dataset (using the keras tokenizer)\n")

# vocabulary_size_frequent_words_actors, tokenizer_actors = keras_tokenization("actors", weights_df_pruned_actors.shape[0], X_actors) # function 5: keras_tokenization
# print("\nActors tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_actors))

# # Pickle the Actors Tokenizer
# with open('C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\actors_tokenizer_28022020.pkl', 'wb') as handle:
#     pickle.dump(tokenizer_actors, handle, protocol=pickle.HIGHEST_PROTOCOL)
# # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# vocabulary_size_frequent_words_plot, tokenizer_plot = keras_tokenization("plot", weights_df_pruned_plot.shape[0], X_plot) # function 5: keras_tokenization
# print("\nPlot Summary tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_plot))

# # Pickle the Plot Tokenizer
# with open('C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\plot_tokenizer_28022020.pkl', 'wb') as handle:
#     pickle.dump(tokenizer_plot, handle, protocol=pickle.HIGHEST_PROTOCOL)
# # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# vocabulary_size_frequent_words_features, tokenizer_features = keras_tokenization("features", weights_df_pruned_features.shape[0], X_features) # function 5: keras_tokenization
# print("\nMovie Features tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_features))

# # Pickle the Movie Features Tokenizer
# with open('C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\features_tokenizer_28022020.pkl', 'wb') as handle:
#     pickle.dump(tokenizer_features, handle, protocol=pickle.HIGHEST_PROTOCOL)
# # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# vocabulary_size_frequent_words_reviews, tokenizer_reviews = keras_tokenization("reviews", weights_df_pruned_reviews.shape[0], X_reviews) # function 5: keras_tokenization
# print("\nMovie Reviews tokenized with maximum number of words: {}\n".format(vocabulary_size_frequent_words_reviews))

# # Pickle the Reviews Tokenizer
# with open('C:\\Users\\spano\\Desktop\\GitHub-Thesis\\models_text_classification\\80-20 split_non-balanced\\reviews_tokenizer_28022020.pkl', 'wb') as handle:
#     pickle.dump(tokenizer_reviews, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Comment: The three code blocks below were executed once and their results were then pickled!

In [None]:
# X_train_reviews.loc[:, 'reviews_seqs'] = X_train_reviews.loc[:, 'reviews_seqs'].apply(lambda x: [[weights_df_pruned_reviews.shape[0]+1] if len(sublist)==0 else sublist for sublist in x])
# X_test_reviews.loc[:, 'reviews_seqs'] = X_test_reviews.loc[:, 'reviews_seqs'].apply(lambda x: [[weights_df_pruned_reviews.shape[0]+1] if len(sublist)==0 else sublist for sublist in x])

In [None]:
# X_train_reviews.loc[:, 'reviews_seqs'] = X_train_reviews.loc[:, 'reviews_seqs'].apply(lambda x: list(itertools.chain.from_iterable(x)))
# X_test_reviews.loc[:, 'reviews_seqs'] = X_test_reviews.loc[:, 'reviews_seqs'].apply(lambda x: list(itertools.chain.from_iterable(x)))

In [None]:
%%time
# BE CAREFUL: y_train/y_test are the targets BEFORE re-balancing the dataset; y_train_updated_version2/y_test_updated_version2 are the targets AFTER re-balancing the dataset.
# The calls below pass y_train/y_test.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Specify the length of the maxlen variable
print("\n---------------------------------------------------------------------------------")
print("\nSpecify the length of the maxlen variable (length is a parameter for the optimal padding execution)\n")

# One maxlen per input, derived from the train frame of that input only.
maxlen_actors = padding_sequnce_length("actors", X_train_actors) # function 6: padding_sequnce_length
maxlen_plot = padding_sequnce_length("plot", X_train_plot) # function 6: padding_sequnce_length
maxlen_features = padding_sequnce_length("features", X_train_features) # function 6: padding_sequnce_length
maxlen_reviews = padding_sequnce_length("reviews", X_train_reviews) # function 6: padding_sequnce_length

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Create the padding sequence of texts
print("\n---------------------------------------------------------------------------------")
print("\nCreate the padding sequence of texts\n")

# Train and test sequences of each input are padded with that input's maxlen.
X_train_seq_actors, X_test_seq_actors = padding_sequence("actors", X_train_actors, X_test_actors, y_train, y_test, maxlen_actors) # function 7: padding_sequence
print("\nActors padded sequences created\n")

X_train_seq_plot, X_test_seq_plot = padding_sequence("plot", X_train_plot, X_test_plot, y_train, y_test, maxlen_plot) # function 7: padding_sequence
print("Plot padded sequences created\n")

X_train_seq_features, X_test_seq_features = padding_sequence("features", X_train_features, X_test_features, y_train, y_test, maxlen_features) # function 7: padding_sequence
print("Movie Features padded sequences created\n")

X_train_seq_reviews, X_test_seq_reviews = padding_sequence("reviews", X_train_reviews, X_test_reviews, y_train, y_test, maxlen_reviews) # function 7: padding_sequence
print("Movie Reviews padded sequences created")

In [None]:
# %%time
# """
# For cross validation!
# """
# # BECAREFUL: y_train/y_test: BEFORE re-balancing the dataset, y_train_updated_version2/y_test_updated_version2: AFTER re-balancing the dataset
# # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# # Specify the length of the maxlen variable
# print("\n---------------------------------------------------------------------------------")
# print("\nSpecify the length of the maxlen variable (length is a parameter for the optimal padding execution)\n")

# maxlen_actors = padding_sequnce_length("actors", X_actors) # function 6: padding_sequnce_length
# maxlen_plot = padding_sequnce_length("plot", X_plot) # function 6: padding_sequnce_length
# maxlen_features = padding_sequnce_length("features", X_features) # function 6: padding_sequnce_length
# maxlen_reviews = padding_sequnce_length("reviews", X_reviews) # function 6: padding_sequnce_length

# # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# # Create the padding sequence of texts
# print("\n---------------------------------------------------------------------------------")
# print("\nCreate the padding sequence of texts\n")

# X_seq_actors = padding_sequence("actors", X_actors, y, maxlen_actors) # function 7: padding_sequence
# print("\nActors padded sequences created\n")

# X_seq_plot = padding_sequence("plot", X_plot, y, maxlen_plot) # function 7: padding_sequence
# print("Plot padded sequences created\n")

# X_seq_features = padding_sequence("features", X_features, y, maxlen_features) # function 7: padding_sequence
# print("Movie Features padded sequences created\n")

# X_seq_reviews = padding_sequence("reviews", X_reviews, y, maxlen_reviews) # function 7: padding_sequence
# print("Movie Reviews padded sequences created")

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #1st case of data: 80-20 split and non-balanced dataset!

* X_train & X_test with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [None]:
# Report the shape of every padded text input for the 80-20 split.
# One print call joins the lines with newlines, which yields the same output as
# printing them one by one; the "\n" at the end of the fourth template leaves a
# blank line between the train group and the test group.
print(
    "X_train_seq_actors shape:{}".format(X_train_seq_actors.shape),
    "X_train_seq_plot shape:{}".format(X_train_seq_plot.shape),
    "X_train_seq_features shape:{}".format(X_train_seq_features.shape),
    "X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape),
    "X_test_seq_actors shape:{}".format(X_test_seq_actors.shape),
    "X_test_seq_plot shape:{}".format(X_test_seq_plot.shape),
    "X_test_seq_features shape:{}".format(X_test_seq_features.shape),
    "X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape),
    sep="\n",
)

In [None]:
# """
# Data for Kfold cross validation
# """
# print("X_train_seq_actors shape:{}".format(X_seq_actors.shape)) #80-20 split non balanced
# print("X_train_seq_plot shape:{}".format(X_seq_plot.shape)) #80-20 split non balanced
# print("X_train_seq_features shape:{}".format(X_seq_features.shape)) #80-20 split non balanced
# print("X_train_seq_reviews shape:{}\n".format(X_seq_reviews.shape)) #80-20 split non balanced

# print("y_train shape:{}".format(y.shape)) #80-20 split non balanced
# print("y_test shape:{}".format(y.shape)) #80-20 split non balanced

* y_train & y_test with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [None]:
# Report the shape of the train and test label objects for the 80-20 split.
print(
    "y_train shape:{}".format(y_train.shape),
    "y_test shape:{}".format(y_test.shape),
    sep="\n",
)

In [None]:
# Persist the padded inputs and the labels of the 80-20 / non-balanced split.
#
# The destination folder is assembled from separate path components instead of
# embedding a literal "\\" separator, so the cell resolves to the same folder on
# Windows, Linux and macOS. The folder is created on first use so that np.save
# does not fail when it is missing. np.save appends ".npy" to every file name.
output_dir_80_20_non_balanced = os.path.join(os.getcwd(), "80-20 split_non-balanced")
os.makedirs(output_dir_80_20_non_balanced, exist_ok=True)

# (file name, object) pairs, written in this order.
# y_train / y_test: np.save saves a multi-hot encoded dataframe as array.
for _file_name, _array in (
    ("x_train_seq_actors_80-20_non-balanced_20000_25032020", X_train_seq_actors),
    ("x_train_seq_plot_80-20_non-balanced_20000_25032020", X_train_seq_plot),
    ("x_train_seq_features_80-20_non-balanced_20000_25032020", X_train_seq_features),
    ("x_train_seq_reviews_80-20_non-balanced_20000_25032020", X_train_seq_reviews),
    ("x_test_seq_actors_80-20_non-balanced_20000_25032020", X_test_seq_actors),
    ("x_test_seq_plot_80-20_non-balanced_20000_25032020", X_test_seq_plot),
    ("x_test_seq_features_80-20_non-balanced_20000_25032020", X_test_seq_features),
    ("x_test_seq_reviews_80-20_non-balanced_20000_25032020", X_test_seq_reviews),
    ("y_train_80-20_non-balanced_20000_25032020", y_train),
    ("y_test_80-20_non-balanced_20000_25032020", y_test),
):
    np.save(os.path.join(output_dir_80_20_non_balanced, _file_name), _array)

In [None]:
# """
# Data for Kfold cross validation
# """
# np.save("x_seq_actors_80-20_non-balanced_28022020", X_seq_actors)
# np.save("x_seq_plot_80-20_non-balanced_28022020", X_seq_plot)
# np.save("x_seq_features_80-20_non-balanced_28022020", X_seq_features)
# np.save("x_seq_reviews_80-20_non-balanced_28022020", X_seq_reviews)

# np.save("y_80-20_non-balanced_28022020", y) #np.save: saves a multi-hot encoded dataframe as array!

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #2nd case of data: 50-50 split and non-balanced dataset!

* X_train & X_test with <b>50-50</b> split and <b>non-balanced genre</b> tags

In [None]:
# Report the shape of every padded text input for the 50-50 split.
# NOTE(review): these are the same X_train_seq_* / X_test_seq_* names printed in
# the 80-20 section, so the numbers only describe a 50-50 split if the split and
# padding cells were re-run with that ratio beforehand — confirm before relying on them.
# The "\n" at the end of the fourth template leaves a blank line between the
# train group and the test group.
print(
    "X_train_seq_actors shape:{}".format(X_train_seq_actors.shape),
    "X_train_seq_plot shape:{}".format(X_train_seq_plot.shape),
    "X_train_seq_features shape:{}".format(X_train_seq_features.shape),
    "X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape),
    "X_test_seq_actors shape:{}".format(X_test_seq_actors.shape),
    "X_test_seq_plot shape:{}".format(X_test_seq_plot.shape),
    "X_test_seq_features shape:{}".format(X_test_seq_features.shape),
    "X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape),
    sep="\n",
)

* y_train & y_test with <b>50-50</b> split and <b>non-balanced genre</b> tags

In [None]:
# Report the shape of the train and test label objects for the 50-50 split.
print(
    "y_train shape:{}".format(y_train.shape),
    "y_test shape:{}".format(y_test.shape),
    sep="\n",
)

In [None]:
# Write the padded inputs and the labels of the 50-50 / non-balanced case into
# the current working directory (np.save appends ".npy" to each name).
# NOTE(review): the arrays are the same variables saved by the 80-20 cell, so
# the split and padding cells must have been re-run with a 50-50 ratio for the
# file names to match their content — confirm before running this cell.
for _file_name, _array in (
    ("x_train_seq_actors_50-50_non-balanced_07022020", X_train_seq_actors),
    ("x_train_seq_plot_50-50_non-balanced_07022020", X_train_seq_plot),
    ("x_train_seq_features_50-50_non-balanced_07022020", X_train_seq_features),
    ("x_train_seq_reviews_50-50_non-balanced_07022020", X_train_seq_reviews),
    ("x_test_seq_actors_50-50_non-balanced_07022020", X_test_seq_actors),
    ("x_test_seq_plot_50-50_non-balanced_07022020", X_test_seq_plot),
    ("x_test_seq_features_50-50_non-balanced_07022020", X_test_seq_features),
    ("x_test_seq_reviews_50-50_non-balanced_07022020", X_test_seq_reviews),
    ("y_train_50-50_non-balanced_07022020", y_train),
    ("y_test_50-50_non-balanced_07022020", y_test),
):
    np.save(_file_name, _array)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #3rd case of data: 50-50 split and balanced dataset!

* X_train & X_test with <b>50-50</b> split and <b>balanced genre</b> tags

In [None]:
# Report the shape of every padded text input for the 50-50 split with balanced genres.
# NOTE(review): the same X_train_seq_* / X_test_seq_* names are used for every
# split/balancing case in this notebook; the values reflect whichever split and
# padding run was executed last — confirm it was the 50-50 balanced one.
# The "\n" at the end of the fourth template leaves a blank line between the
# train group and the test group.
print(
    "X_train_seq_actors shape:{}".format(X_train_seq_actors.shape),
    "X_train_seq_plot shape:{}".format(X_train_seq_plot.shape),
    "X_train_seq_features shape:{}".format(X_train_seq_features.shape),
    "X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape),
    "X_test_seq_actors shape:{}".format(X_test_seq_actors.shape),
    "X_test_seq_plot shape:{}".format(X_test_seq_plot.shape),
    "X_test_seq_features shape:{}".format(X_test_seq_features.shape),
    "X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape),
    sep="\n",
)

* y_train & y_test with <b>50-50</b> split and <b>balanced genre</b> tags

In [None]:
# Report the shape of the re-balanced label objects (the *_updated_version2
# variables) for the 50-50 split; the printed captions still read y_train / y_test.
print(
    "y_train shape:{}".format(y_train_updated_version2.shape),
    "y_test shape:{}".format(y_test_updated_version2.shape),
    sep="\n",
)

In [None]:
# Write the padded inputs and the re-balanced labels of the 50-50 / balanced
# case into the current working directory (np.save appends ".npy" to each name).
# The label files are taken from y_train_updated_version2 / y_test_updated_version2.
# NOTE(review): the X_* arrays are the same variables used by the other cases,
# so they must come from a 50-50 balanced run for the file names to be accurate — confirm.
for _file_name, _array in (
    ("x_train_seq_actors_50-50_balanced_07022020", X_train_seq_actors),
    ("x_train_seq_plot_50-50_balanced_07022020", X_train_seq_plot),
    ("x_train_seq_features_50-50_balanced_07022020", X_train_seq_features),
    ("x_train_seq_reviews_50-50_balanced_07022020", X_train_seq_reviews),
    ("x_test_seq_actors_50-50_balanced_07022020", X_test_seq_actors),
    ("x_test_seq_plot_50-50_balanced_07022020", X_test_seq_plot),
    ("x_test_seq_features_50-50_balanced_07022020", X_test_seq_features),
    ("x_test_seq_reviews_50-50_balanced_07022020", X_test_seq_reviews),
    ("y_train_50-50_balanced_07022020", y_train_updated_version2),
    ("y_test_50-50_balanced_07022020", y_test_updated_version2),
):
    np.save(_file_name, _array)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #4th case of data: 80-20 split and balanced dataset!

* X_train & X_test with <b>80-20</b> split and <b>balanced genre</b> tags

In [None]:
# Report the shape of every padded text input for the 80-20 split with balanced genres.
# NOTE(review): the same X_train_seq_* / X_test_seq_* names are used for every
# split/balancing case in this notebook; the values reflect whichever split and
# padding run was executed last — confirm it was the 80-20 balanced one.
# The "\n" at the end of the fourth template leaves a blank line between the
# train group and the test group.
print(
    "X_train_seq_actors shape:{}".format(X_train_seq_actors.shape),
    "X_train_seq_plot shape:{}".format(X_train_seq_plot.shape),
    "X_train_seq_features shape:{}".format(X_train_seq_features.shape),
    "X_train_seq_reviews shape:{}\n".format(X_train_seq_reviews.shape),
    "X_test_seq_actors shape:{}".format(X_test_seq_actors.shape),
    "X_test_seq_plot shape:{}".format(X_test_seq_plot.shape),
    "X_test_seq_features shape:{}".format(X_test_seq_features.shape),
    "X_test_seq_reviews shape:{}".format(X_test_seq_reviews.shape),
    sep="\n",
)

* y_train & y_test with <b>80-20</b> split and <b>balanced genre</b> tags

In [None]:
# Report the shape of the train and test label objects for the 80-20 balanced case.
# NOTE(review): this cell reads y_train / y_test, whereas the 50-50 balanced
# case reads y_train_updated_version2 / y_test_updated_version2 — confirm that
# y_train / y_test hold the re-balanced labels when this cell is run.
print(
    "y_train shape:{}".format(y_train.shape),
    "y_test shape:{}".format(y_test.shape),
    sep="\n",
)

In [None]:
# Write the padded inputs and the labels of the 80-20 / balanced case into the
# current working directory (np.save appends ".npy" to each name).
# NOTE(review): the labels are taken from y_train / y_test, whereas the 50-50
# balanced case saves y_train_updated_version2 / y_test_updated_version2 —
# confirm that y_train / y_test hold the re-balanced labels when this cell is run.
for _file_name, _array in (
    ("x_train_seq_actors_80-20_balanced_23022020", X_train_seq_actors),
    ("x_train_seq_plot_80-20_balanced_23022020", X_train_seq_plot),
    ("x_train_seq_features_80-20_balanced_23022020", X_train_seq_features),
    ("x_train_seq_reviews_80-20_balanced_23022020", X_train_seq_reviews),
    ("x_test_seq_actors_80-20_balanced_23022020", X_test_seq_actors),
    ("x_test_seq_plot_80-20_balanced_23022020", X_test_seq_plot),
    ("x_test_seq_features_80-20_balanced_23022020", X_test_seq_features),
    ("x_test_seq_reviews_80-20_balanced_23022020", X_test_seq_reviews),
    ("y_train_80-20_balanced_23022020", y_train),
    ("y_test_80-20_balanced_23022020", y_test),
):
    np.save(_file_name, _array)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Pickle the "dataset_frequent_genres" with the added cleaned columns of actors, plot, features and reviews

#### Pickle the X_test dataset for use in part 3.2

In [None]:
# Pickle dataset_frequent_genres into the "pickled_data_per_part" folder under
# the current working directory. The path is assembled from separate components
# (no hard-coded "\\" separator) so it resolves to the same folder on every OS,
# and the folder is created on first use so to_pickle does not fail when it is missing.
pickled_data_dir = os.path.join(os.getcwd(), "pickled_data_per_part")
os.makedirs(pickled_data_dir, exist_ok=True)
dataset_frequent_genres.to_pickle(os.path.join(pickled_data_dir, "dataset_part_3.1_25032020.pkl"))

In [None]:
# Pickle X_test for reuse in part 3.2. The path is assembled from separate
# components (no hard-coded "\\" separator) so it resolves to the same folder on
# every OS, and the folder is created on first use so to_pickle does not fail
# when it is missing.
# NOTE(review): the .npy arrays of this split are saved directly under
# "80-20 split_non-balanced", while this pickle goes into its
# "20000_max_features" sub-folder — confirm that this is intended.
x_test_pickle_dir = os.path.join(os.getcwd(), "80-20 split_non-balanced", "20000_max_features")
os.makedirs(x_test_pickle_dir, exist_ok=True)
X_test.to_pickle(os.path.join(x_test_pickle_dir, "x_test_20000_25032020.pkl"))

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### THIS IS THE END OF PART 3.1 - Where tokenization, cleaning and balancing of the data took place.
#### The next PART 3.2 focuses on training & validating different neural models based on the data prepared in PART 3.1