### Part 3.1 - Data Tokenization-Transformation

Set the version data control parameter (to save the outputs of this notebook at their latest date)

In [None]:
# Date stamp used to version this notebook's saved outputs (see the heading above).
# NOTE(review): presumably DDMMYYYY, i.e. 13 July 2020 — confirm.
version_data_control="13072020"

#### Import the libraries

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tabulate import tabulate
import re
import os
import time
from humanfriendly import format_timespan
import random

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm

# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

import spacy
import unidecode
from word2number import w2n

# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction import text

import string
import itertools

from scipy import stats

# `display` lives in the public IPython.display module; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
def set_pandas_display_options() -> None:
    """Raise pandas' display limits so wide/long frames are rendered in full.

    Sets up to 1000 columns and 1000 rows, a column width of 199 characters,
    and ``display.width`` to ``None``.
    """
    wanted_options = {
        "display.max_columns": 1000,
        "display.max_rows": 1000,
        "display.max_colwidth": 199,
        "display.width": None,
    }
    for option_name, option_value in wanted_options.items():
        pd.set_option(option_name, option_value)
    # "display.precision" is intentionally left untouched; set it here if needed.


set_pandas_display_options()

#### Keras Tokenization and Plotting Libraries

In [None]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

#--------------------------------------------------------------

import tensorflow as tf
from tensorflow import keras

#---------------------------------------------------------------

%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams

from packaging import version

# Report the TensorFlow runtime and stop early when it is older than 2.0.
tf_version = tf.__version__
print("TensorFlow version: ", tf_version)
assert version.parse(tf_version).release[0] >= 2, "This notebook requires TensorFlow 2.0 or above."

print("Version: ", tf_version)
print("Eager mode: ", tf.executing_eagerly())
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

#### Import the dataset (this demonstrates how the genres have been cleaned)

In [None]:
# Load the dataset produced by part 2 (should be the latest version of that data).
# The path is built from separate components so it resolves on every OS, not only
# where "\\" is the path separator. The file name is fixed: the original
# `.format(version_data_control)` call had no placeholder to fill and was a no-op.
# NOTE: read_pickle executes code embedded in the file — only load trusted pickles.
dataset = pd.read_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part', 'dataset_part_2_20072020.pkl'))
print("\nThe shape of the dataset that will be used in Keras classifier is: {}".format(dataset.shape))

In [None]:
# Spot-check a single title to see how its genres were cleaned.
dataset.loc[dataset['title'] == "Les Misérables"]

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### Understand the dependent variable: Genres of each movie

Check their frequency distribution

In [None]:
# One row per (movie, genre) pair, then count how often each genre occurs.
genre_occurrences = dataset['genres'].explode()
genre_occurrences.value_counts()

In [None]:
# Share of each genre among all genre labels, as a percentage rounded to 3 decimals.
dataset['genres'].explode().value_counts(normalize=True).mul(100).round(3)

In [None]:
# STEP 1: Remove genres less than 1% frequency
RARE_GENRES = ['IMAX', 'Sport', 'Adult', 'News', 'Reality-TV', 'Film-Noir',
               'Short', 'Family', 'Biography', 'Music', 'History']


def keep_frequent_genres(genre_list):
    """Return the genres of one movie with the rare genres filtered out (order kept)."""
    kept = []
    for genre in genre_list:
        if genre not in RARE_GENRES:
            kept.append(genre)
    return kept


dataset['reduced_genres'] = dataset['genres'].apply(keep_frequent_genres)

In [None]:
# Sanity check: the new column has one entry per movie.
dataset['reduced_genres'].shape

In [None]:
# STEP 2: Find indexes with EMPTY LISTS
# (movies whose every genre was dropped in step 1)
has_no_genres_left = dataset['reduced_genres'].apply(lambda genre_list: genre_list == [])
dataset_empty_lists = dataset[has_no_genres_left]

remove_indices = dataset_empty_lists.index.to_list()

dataset_empty_lists

In [None]:
# STEP 3: Remove the indexes with EMPTY LISTS
rows_to_remove = dataset.index.isin(remove_indices)
dataset_frequent_genres = dataset[~rows_to_remove]

dataset_frequent_genres.shape

In [None]:
# Renumber the rows 0..n-1 after the removal above; the old index is discarded (drop=True).
dataset_frequent_genres = dataset_frequent_genres.reset_index(drop=True)

In [None]:
"""
Having cut the scarcest genres, it is still obvious that the genres "Drama" & "Comedy" belong to 40% of the movies.
A good approach is to either up-sample or down-sample the dataset.
We chose to down-sample the two dominant genres, "Drama" & "Comedy". However, in sub-part 3.2
we used the imbalanced dataset to train and test the Keras text classification models.
"""
# Percentage share of every remaining genre, rounded to 3 decimals.
dataset_frequent_genres['reduced_genres'].explode().value_counts(normalize=True).mul(100).round(3)

In [None]:
"""
The dataset below contains 17 out of 27 genres. The 11 genres cut were not frequent enough compared to the rest of the genres.
"""
# NOTE(review): this export is disabled, yet the next cell reads this exact file, so it
# must have been run at least once for that cell to work on a fresh checkout.
# dataset_frequent_genres.to_pickle("pickled_data_per_part\\dataset_part_2_cleaned_of_redundant_genres_20072020.pkl")

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### #1 Import cleaned of redundant genres dataset and genres_list

In [None]:
# Import the dataset that was pickled in the previous cell (above).
# The path is assembled from separate components so it resolves on every OS, not
# only where "\\" is the path separator.
# NOTE: read_pickle executes code embedded in the file — only load trusted pickles.
dataset_frequent_genres = pd.read_pickle(
    os.path.join(os.getcwd(), 'pickled_data_per_part', 'dataset_part_2_cleaned_of_redundant_genres_20072020.pkl'))

# REPLACE THE TEXT OF THOSE MOVIE PLOTS & FEATURES WITH THOSE BELOW, OTHERWISE THE PREPROCESSING WON'T COMPLETE CORRECTLY
# The values are written with a single positional setter, .iloc[row, column]. The previous
# chained form (df['plot'].iloc[i] = ...) assigns into an intermediate Series, which pandas
# does not guarantee to propagate back to the frame (it is a no-op under Copy-on-Write).
plot_col = dataset_frequent_genres.columns.get_loc('plot')
features_col = dataset_frequent_genres.columns.get_loc('movie_features')

dataset_frequent_genres.iloc[10802, plot_col] = 'A documentary looking at the Nigga word'
dataset_frequent_genres.iloc[15148, plot_col] = "A shy Arun works as an accountant. On daily basis he travels by a bus to office where he sees Prabha who works in the same premises. Arun starts to like Prabha and follows her frequently but hasn't got the confidence in him to approach her. When ever Arun gains confidence to share about his feelings to Prabha her friend Nagesh steps in spoiling Arun's plans. In order to win his love for Prabha, Arun travels all the way to Lonavala to take the help of expert Colonel Julius Naganderanath Wilfred Singh. During this phase Prabha to starts missing Arun."
dataset_frequent_genres.iloc[31993, plot_col] = "MARTIN ARMSTRONG, once a US based trillion dollar financial adviser, used the number pi to predict economic turning points with precision. When some big New York bankers asked him to join the club to help them to take over Russia, he refused to join the manipulation. A few days later the FBI stormed his offices accusing him of a 3 billion dollar Ponzi Scheme - an attempt to stop him talking about the real Ponzi Scheme of debts that the US has build up over the years and which he thinks starts to collapse after October 1, 2015, a major pi turning point he is predicting."
dataset_frequent_genres.iloc[32857, plot_col] = "Notorious gangster asks a career thief by the name of Nandu to steal a package worth over one crore rupees for a fee, which Nandu does so. Things do not go as planned, when Nandu finds out that the package is worth more than twice the amount, he refuses to hand over the package to Pinky, until he raises his fees. An enraged Pinky instructs his men to locate Nandu, find the package and bring it back, and thereafter kill Nandu. While on the run from Pinky's gangsters, the police find out that Nandu was involved in this theft, and they launch a man-hunt. What Nandu does not know that the package that he has on his person does not contain money or gold, but a nuclear bomb that is set to explode."
dataset_frequent_genres.iloc[33094, plot_col] = "Before New Year Eve the windows of vast city are glowing from within by colored Christmas lights. It seems that behind each of these windows carefully stored favorite holiday. It seems that each of them are waiting for the New Year's wish fulfillment. By the silhouette of an elderly woman at one of the windows of the old house in Moscow, neighbors in the yard probably accustomed as something unchangeable. Sophia Ivanovna ten years does not get up from her chair, all day looking out the window, glues paper figurines and listening to Dickens, who reads aloud to her, her only daughter Tanya. The only heiress, Tanya, whom Sophia Ivanovna must to pass all the family jewels, seems resigned to the position of an old maid, and her whole life consists only of the care of the sick mother."
dataset_frequent_genres.iloc[36100, plot_col] = "Mr. Giant has kidnapped the brilliant Dr. Van Kohler and is planning to use the Doctor's invention, the N-bomb, to hold the world hostage. The only one who can foil Mr. Giant's evil scheme is Agent 00, a 3-foot-tall filipino martial arts master, expert marksman, top-class romancer and all-around superspy. Can Agent 00 rescue Dr. Kohler before it's too late?"
dataset_frequent_genres.iloc[36669, plot_col] = "HEADER portrays the grueling psychological journey taken by ATF Agent Stewart Cummings. On the surface, Stewart struggles to solve a string of bizarre murders, but in secret, his life falls into a world of corruption that's impossible to escape. Deceit, rape, and murder spiral out of control triggering a hellish conclusion that defies description."
dataset_frequent_genres.iloc[47725, plot_col] = "Raj is one of city's top lawyer known never to loose a case. He his introduced to Sargam during the launch of her new album by Ashwin Mehta owner of a music company. Next day Ashwin is found murdered and Tarang is held for the murder, Sargam and Tarang are childhood friends and she approaches Raj to fight for his case. During this period Raj notices that Tarang suffers from mental illness of split personality and when he suffers the attack he is not Tarang but Ranjeet and doesn't remember things he does when he turns Ranjeet. Tarang had killed Ashwin as the previous night he saw him forcing on Sargam but doesn't remember killing him as he was Ranjeet that time. Raj wins the case in court by proving Tarang innocent as he is suffering from mental illness. Tarang is due to release and gives Raj a shock to that he had planned everything regarding split personality as he was caught red handed on the site of murder."

dataset_frequent_genres.iloc[8102, features_col] = "Spinning Boris Jeff Goldblum Anthony LaPaglia Liev Schreiber Boris Lee Krutonog Svetlana Efremova Shauna MacDonald Gregory Hlady Vladimir Radian Ilia Volok Konstantin Kazakov Judah Katz Maria Syrgiannis Ola Sturik Gillian Vanderburgh Serge Timokhin Roger Spottiswoode Early in 1996, three Republican campaign operatives take a job in secret assisting Boris Yeltsin's reelection. Once in Moscow, they find he's polling at 6 percent with the election a few months away. While Dick Dresner wants to go home, George Gorton and Joe Shumate vote to stay. First, they must get someone's attention; they succeed finally with Yeltsin's daughter. Then it's polling, focus groups, messages and spin. Even as Yeltsin's numbers go up, the trio are unsure who hired them and whether Yeltsin's allies have a different plan in mind than election victory. When the going gets toughest, it's Gorton who puts a spin on our stake: democracy and capitalism must win. Comedy Drama"
dataset_frequent_genres.iloc[10802, features_col] = "The N Word F. Lee Bailey Sandra Bernhard Donald Bogle Todd Boyd Elaine Brown LeVar Burton George Carlin Morris Chestnut Chuck D Johnnie L. Cochran Jr. Stanley Crouch Damon Dash Dr. Dre Eazy-E Laurence Fishburne Todd Williams A documentary looking at the Nigga word Documentary"
dataset_frequent_genres.iloc[15148, features_col] = "Chhoti Si Baat Ashok Kumar Vidya Sinha Amol Palekar Asrani Nandita Thakur Rajan Haksar Ashim Kumar Devendra Khandelwal Baba Majgoakar Noni Ganguly C.S. Dubey Amol Sen R.S. Chopra Sudarshan Sahni Milon Mukerji Basu Chatterjee A shy Arun works as an accountant. On daily basis he travels by a bus to office where he sees Prabha who works in the same premises. Arun starts to like Prabha and follows her frequently but hasn't got the confidence in him to approach her. When ever Arun gains confidence to share about his feelings to Prabha her friend Nagesh steps in spoiling Arun's plans. In order to win his love for Prabha, Arun travels all the way to Lonavala to take the help of expert Colonel Julius Naganderanath Wilfred Singh. During this phase Prabha to starts missing Arun. Comedy Romance"
dataset_frequent_genres.iloc[31993, features_col] = "The Forecaster Martin Armstrong Vicky Armstrong Oliver Brown Michael Campbell Larry Edelson Tony Godin Nigel Kirwan Barclay Leib Neill MacPherson Thomas Sjoblom Marcus Vetter MARTIN ARMSTRONG, once a US based trillion dollar financial adviser, used the number pi to predict economic turning points with precision. When some big New York bankers asked him to join the club to help them to take over Russia, he refused to join the manipulation. A few days later the FBI stormed his offices accusing him of a 3 billion dollar Ponzi Scheme - an attempt to stop him talking about the real Ponzi Scheme of debts that the US has build up over the years and which he thinks starts to collapse after October 1, 2015, a major pi turning point he is predicting. Documentary"
dataset_frequent_genres.iloc[32857, features_col] = "Daud Sanjay Dutt Urmila Matondkar Paresh Rawal Neeraj Vora Ram Mohan Ashish Vidyarthi Manoj Bajpayee Rajeev Mehta Jeetendra Shastri Sumukhi Tarzan Vineeth Narsing Yadav Rana Jung Bahadur Sunil Shende Ram Gopal Varma Notorious gangster asks a career thief by the name of Nandu to steal a package worth over one crore rupees for a fee, which Nandu does so. Things do not go as planned, when Nandu finds out that the package is worth more than twice the amount, he refuses to hand over the package to Pinky, until he raises his fees. An enraged Pinky instructs his men to locate Nandu, find the package and bring it back, and thereafter kill Nandu. While on the run from Pinky's gangsters, the police find out that Nandu was involved in this theft, and they launch a man-hunt. What Nandu does not know that the package that he has on his person does not contain money or gold, but a nuclear bomb that is set to explode. Action"
dataset_frequent_genres.iloc[33094, features_col] = "Come Look at Me Oleg Yankovskiy Irina Kupchenko Yekaterina Vasilyeva Natalya Shchukina Mark Rudinshtejn Ivan Yankovskiy Mikhail Agranovich Before New Year Eve the windows of vast city are glowing from within by colored Christmas lights. It seems that behind each of these windows carefully stored favorite holiday. It seems that each of them are waiting for the New Year's wish fulfillment. By the silhouette of an elderly woman at one of the windows of the old house in Moscow, neighbors in the yard probably accustomed as something unchangeable. Sophia Ivanovna ten years does not get up from her chair, all day looking out the window, glues paper figurines and listening to Dickens, who reads aloud to her, her only daughter Tanya. The only heiress, Tanya, whom Sophia Ivanovna must to pass all the family jewels, seems resigned to the position of an old maid, and her whole life consists only of the care of the sick mother Comedy Drama Romance"
dataset_frequent_genres.iloc[36100, features_col] = "For Y'ur Height Only Weng Weng Yehlen Catral Carmi Martin Anna Marie Gutierrez Beth Sandoval Eddie Nicart Mr. Giant has kidnapped the brilliant Dr. Van Kohler and is planning to use the Doctor's invention, the N-bomb, to hold the world hostage. The only one who can foil Mr. Giant's evil scheme is Agent 00, a 3-foot-tall filipino martial arts master, expert marksman, top-class romancer and all-around superspy. Can Agent 00 rescue Dr. Kohler before it's too late? Action Comedy"
dataset_frequent_genres.iloc[36669, features_col] = "Header Jake Suffian Elliot V. Kotek Dick Mullaney Michael Philip Anthony Stacey Brooks Tara Brooks Jim Coope Bill Corry Andrew Cowen Amanda Czelinski Stephen DeCaires Kevin Dedes Lauren Devlin Ruth Dimino Morris Fazzi Jr. Archibald Flancranstin HEADER portrays the grueling psychological journey taken by ATF Agent Stewart Cummings. On the surface, Stewart struggles to solve a string of bizarre murders, but in secret, his life falls into a world of corruption that's impossible to escape. Deceit, rape, and murder spiral out of control triggering a hellish conclusion that defies description. Horror"
dataset_frequent_genres.iloc[47725, features_col] = "Deewangee Ajay Devgn Akshaye Khanna Urmila Matondkar Farida Jalal Vijayendra Ghatge Seema Biswas Tiku Talsania Tanaaz Currim Irani Mohan Kapoor Nirmal Pandey Nishigandha Wad Rana Jung Bahadur Suresh Oberoi Sushovan Banerjee Suhasini Mulay Anees Bazmee Raj is one of city's top lawyer known never to loose a case. He his introduced to Sargam during the launch of her new album by Ashwin Mehta owner of a music company. Next day Ashwin is found murdered and Tarang is held for the murder, Sargam and Tarang are childhood friends and she approaches Raj to fight for his case. During this period Raj notices that Tarang suffers from mental illness of split personality and when he suffers the attack he is not Tarang but Ranjeet and doesn't remember things he does when he turns Ranjeet. Tarang had killed Ashwin as the previous night he saw him forcing on Sargam but doesn't remember killing him as he was Ranjeet that time. Raj wins the case in court by proving Tarang innocent as he is suffering from mental illness. Tarang is due to release and gives Raj a shock to that he had planned everything regarding split personality as he was caught red handed on the site of murder. Crime Drama Mystery"

# DELETE THAT MOVIE
# Keep every row whose title differs from the one below, then renumber the rows.
keep_mask = dataset_frequent_genres['title'].ne("6 Days to Air: The Making of South Park")
dataset_frequent_genres = dataset_frequent_genres[keep_mask]

dataset_frequent_genres = dataset_frequent_genres.reset_index(drop=True)
print(f"\nThe shape of the dataset that will be used in Keras classifier is: {dataset_frequent_genres.shape}")
# Comment: From now on, "reduced_genres" column will be used for model classification and predictions.

In [None]:
"""
Multi-hot encoding is a good practice to transform the value y into a data structure appropriate for multi-label text classification.
"""
# Multi-hot encoding, since a movie can have more than one genre assigned:
# one 0/1 indicator column per genre class is appended to the frame.

mlb = MultiLabelBinarizer()
genre_indicator_matrix = mlb.fit_transform(dataset_frequent_genres['reduced_genres'])
genre_indicator_frame = pd.DataFrame(genre_indicator_matrix,
                                     columns=mlb.classes_,
                                     index=dataset_frequent_genres.index)
dataset_frequent_genres = dataset_frequent_genres.join(genre_indicator_frame)

In [None]:
# Import the pickled list of genres.
# The path is assembled from separate components so it resolves on every OS, not
# only where "\\" is the path separator.
# NOTE: pickle.load executes code embedded in the file — only load trusted pickles.
with open(os.path.join(os.getcwd(), "pickled_data_per_part", "genres_list_06032020.pkl"), 'rb') as handle:
    genres_list = pickle.load(handle)
genres_list

#### #2 Prune the movie reviews (keep only the first review for each movie)

In [None]:
# Number of reviews stored for each movie.
dataset_frequent_genres['reviews_length'] = dataset_frequent_genres['reviews'].apply(len)

In [None]:
# How many movies have exactly one review?
reviews_length = dataset_frequent_genres['reviews_length']
len(reviews_length[reviews_length == 1])

# Since we don't want to lose 3326 movies, we will keep only the first review for each movie.

In [None]:
# Keep only the first review (element 0 of the reviews list) of every movie.
# NOTE(review): x[0] raises IndexError on an empty list — assumes every movie has at least one review; confirm.
dataset_frequent_genres.loc[:, 'reviews_pruned'] = dataset_frequent_genres.reviews.apply(lambda x: x[0])

In [None]:
"""
We observed that the plain text of a review, like that of a plot summary, contains a lot of stop-words, punctuation and "noisy" words
that could spoil the results of a text classification model.
"""
# Print the raw review and the raw plot of the movie at position 7, separated by blank lines.
sample_position = 7
sample_review = dataset_frequent_genres.reviews_pruned.iloc[sample_position]
sample_plot = dataset_frequent_genres['plot'].iloc[sample_position]
print("Raw text of a movie review:", sample_review)
print('\n')
print("Raw text of a plot summary: ", sample_plot)

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

#### #3 Unify (join) the columns of Actors and Reviews in order to achieve a dataframe cell with a single TEXT (corpus) and not a LIST of texts

In [None]:
def clean_single_letter_actors(cast_list):
    """Return the entries of ``cast_list`` that are at least four characters long.

    The relative order of the kept entries is preserved and the input is not modified.
    """
    minimum_name_length = 4
    kept_names = []
    for name in cast_list:
        if len(name) < minimum_name_length:
            continue
        kept_names.append(name)

    return kept_names
    
def apply_clean_single_letter_actors(column_name, dataset):
    """Add an 'actors_cleaned' column to ``dataset`` holding the filtered cast lists.

    Each value of ``dataset[column_name]`` is passed through
    ``clean_single_letter_actors`` while a tqdm progress bar is shown.
    ``dataset`` is modified in place; nothing is returned.
    """
    tqdm.pandas()  # makes ``progress_apply`` available on pandas objects

    raw_casts = dataset.loc[:, column_name]
    dataset.loc[:, 'actors_cleaned'] = raw_casts.progress_apply(clean_single_letter_actors)

In [None]:
# Build the 'actors_cleaned' column from the raw 'actors' lists.
apply_clean_single_letter_actors(column_name="actors", dataset=dataset_frequent_genres)

In [None]:
# Shape check after adding the 'actors_cleaned' column.
dataset_frequent_genres.shape

In [None]:
# The cleaned lists no longer contain names of one, two or three characters.
# Spot-check: show the movies that have at least one actor name of exactly 6 characters.
actor_name_lengths = dataset_frequent_genres['actors_cleaned'].explode().str.len()
mask = actor_name_lengths == 6
matching_rows = np.unique(mask[mask].index)
res = dataset_frequent_genres.loc[matching_rows, ['title', 'actors_cleaned']]
display(res)

In [None]:
# Function 1: Actors
def unify_actors(row):
    """Join the 'actors_cleaned' list of ``row`` with commas and trim the ends of the result."""
    comma_joined = ','.join(row['actors_cleaned'])
    return comma_joined.strip()

# Function 2: Reviews
def unify_reviews(row):
    """Concatenate every text in ``row['reviews']`` into one string, separated by ', '."""
    separator = ', '
    review_texts = row['reviews']
    return separator.join(review_texts)

In [None]:
# Collapse the list-valued actor and review columns into single strings, row by row.
dataset_frequent_genres['actors_unified'] = dataset_frequent_genres.apply(unify_actors, axis=1)
dataset_frequent_genres['reviews_unified'] = dataset_frequent_genres.apply(unify_reviews, axis=1)

# Show the first movie before and after the transformation.
print(f"Actors before: {dataset_frequent_genres.actors.iloc[0]}")
print(f"\nActors after: {dataset_frequent_genres.actors_unified.iloc[0]}\n")

print(f"\nReviews before: {dataset_frequent_genres.reviews.iloc[0]}")
print(f"\nReviews after: {dataset_frequent_genres.reviews_unified.iloc[0]}")

#### #4 Functions

In [None]:
"""
Functions used across the whole notebook.
These functions are explicitly used to pre-process the raw text input.
"""
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 1
# Load the spaCy 'en_core_web_md' pipeline once at cell level; preprocess_text_movie_content
# calls `nlp(...)` and reads each token's part-of-speech tag (token.pos_).
nlp=spacy.load('en_core_web_md')

# Ordered (pattern, replacement) rules used by ``decontracted``. The order matters:
# specific rules must run before the general ones they overlap with
# (e.g. "won't" before "n't", "III" before "II", "world war ii" before "world war i").
_DECONTRACTION_RULES = (
    # specific contractions & number warnings
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"multimillion", "multi million"),
    (r"multibillion", "multi billion"),
    (r"trillion", "1000000000000"),
    (r"billion", "1000000000"),
    (r"crore", "10000000"),
    (r"mln", "1000000"),
    (r"III", "3"),
    (r"II", "2"),
    (r"iii", "3"),
    (r"world war ii", "world war 2"),
    # \b keeps "world war in ..." from being turned into "world war 1n ..."
    (r"world war i\b", "world war 1"),

    # specific phrases mismatched as NUM part of speech
    (r"HEADER", "The movie"),
    # \b keeps names such as "named Victoria" from being cut down to "ictoria"
    (r"named V\b", ""),
    # the parentheses form a regex group, so only the words are removed, not literal "(" / ")"
    (r"(die fetten jahre sind vorbei)", ""),
    (r"thiry", "Thiry"),
    (r"Kirsten deLohr Helland", "Kirsten Helland"),
    # dots escaped: the unescaped "a.k.a" matched any "a?k?a" run, e.g. "Jackman" -> "Jn"
    (r"a\.k\.a", ""),
    # insert a space after . , or " when it is directly followed by a non-space character
    (r'(?<=[.,"])(?=[^\s])', " "),

    # general
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)


def decontracted(phrase):
    """Expand contractions and rewrite a few troublesome words in ``phrase``.

    Applies the substitutions of ``_DECONTRACTION_RULES`` one after another:
    dataset-specific rewrites first (number words to digits, roman numerals,
    phrases handled specially because of NUM part-of-speech mismatches), then a
    rule that puts a space after '.', ',' or '"' when text follows immediately,
    and finally the general English contractions ("n't", "'re", "'s", ...).

    Parameters
    ----------
    phrase : str
        Raw text.

    Returns
    -------
    str
        The text with every rule applied.

    Notes
    -----
    Every "'s" is expanded to " is", so possessives are rewritten too.
    """
    for pattern, replacement in _DECONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

def correct_words_with_punctuation(phrase):
    """Replace three hyphenated terms with plain-word equivalents.

    "sci-fi" -> "science fiction", "N-bomb" -> "nuclear bomb", "U-boat" -> "submarine".
    The match is case-sensitive; the rewritten text is returned.
    """
    hyphenated_terms = (
        (r"sci-fi", "science fiction"),
        (r"N-bomb", "nuclear bomb"),
        (r"U-boat", "submarine"),
    )
    for pattern, spelled_out in hyphenated_terms:
        phrase = re.sub(pattern, spelled_out, phrase)

    return phrase
    
def preprocess_text_movie_content(raw_text): #Movie Content aka column name: movie_features
    """Clean one movie-content text (plot or combined features) for vectorisation.

    Pipeline: ASCII-fold accents -> expand contractions and hyphenated terms ->
    replace punctuation with spaces -> drop words containing digits -> drop
    number words (tokens spaCy tags as NUM that word2number can convert) ->
    drop stop words -> lemmatise -> lowercase.

    Parameters
    ----------
    raw_text : str
        The raw text of one movie.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.

    Notes
    -----
    A token tagged NUM that word2number cannot convert used to abort the whole
    run with a ValueError; such a token is now kept as an ordinary word.
    """
    # 1.Convert Accented Characters to ASCII
    ascii_text = unidecode.unidecode(raw_text)

    #------------------------------------------------

    # 2.Expand contractions, then spell out hyphenated terms (sci-fi, N-bomb, U-boat)
    expanded_text = correct_words_with_punctuation(decontracted(ascii_text))

    #------------------------------------------------

    # 3./4.Replace punctuation with spaces inside every space-separated chunk and
    # strip the surrounding white space of each chunk
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    chunks = [punctuation_pattern.sub(' ', chunk).strip() for chunk in expanded_text.split(' ')]

    #------------------------------------------------

    # 5.Remove numbers
    # 5.1 Drop every word that contains a digit (dates, years, "3rd", ...)
    digit_free_text = re.sub(r'\w*\d\w*', "", ' '.join(chunks)).strip()

    # 5.2 Drop number words ("three", "hundred", ...)
    words = []
    for token in nlp(digit_free_text):
        if token.pos_ == 'NUM' and token.text not in ['N', 'm', 'V']:
            try:
                w2n.word_to_num(token.text)
            except ValueError:
                # spaCy tagged a non-number as NUM: keep it as an ordinary word
                pass
            else:
                # a genuine number word: remove it from the text
                continue
        word = token.text.strip()
        if word and not word.isnumeric():
            words.append(word)

    #------------------------------------------------

    # 6.Remove stop words
    stop_words = text.ENGLISH_STOP_WORDS.union(["book"])

    # the words are joined again because the lemmatizing approach below re-tokenizes the text
    no_stopword_text = ' '.join(word for word in words if word.lower() not in stop_words)

    #------------------------------------------------

    # 7.Lemmatization
    # The first letter of each POS tag selects the WordNet part of speech when it is
    # 'a', 'n' or 'v'; every other tag falls back to the lemmatizer's default.
    # NOTE(review): NLTK's default tagger presumably labels adjectives 'JJ*', in which
    # case the 'a' branch never fires — confirm before relying on adjective lemmas.
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = []
    for word, tag in pos_tag(word_tokenize(no_stopword_text)):
        tag_initial = tag[0].lower()
        if tag_initial in ['a', 'n', 'v']:
            lemmatized_text.append(lemmatizer.lemmatize(word, tag_initial))
        else:
            lemmatized_text.append(lemmatizer.lemmatize(word))

    #------------------------------------------------

    # 8.Lowercase text
    return ' '.join(word.lower() for word in lemmatized_text)

def preprocess_text_reviews(raw_text):
    """Clean one review text for vectorisation.

    Pipeline: ASCII-fold accents -> expand contractions -> replace punctuation
    with spaces -> drop words containing digits -> drop stop words -> lemmatise
    -> lowercase. Returns the cleaned tokens joined by single spaces.
    """
    # 1.Convert Accented Characters to ASCII, 2.Expand Contractions
    expanded_text = decontracted(unidecode.unidecode(raw_text))

    #------------------------------------------------

    # 3.Remove punctuation and 4.Strip white space, chunk by chunk
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    cleaned_chunks = []
    for chunk in expanded_text.split(' '):
        cleaned_chunks.append(punctuation_pattern.sub(' ', chunk).strip())

    #------------------------------------------------

    # 5.Remove numbers
    # 5.1 Drop every word that contains a digit (dates, years, ...)
    without_digit_words = re.sub(r'\w*\d\w*', "", ' '.join(cleaned_chunks)).strip()

    # 5.2 Split on single spaces again; discard numeric and empty pieces
    candidate_words = []
    for piece in without_digit_words.split(' '):
        piece = piece.strip()
        if piece and not piece.isnumeric():
            candidate_words.append(piece)

    #------------------------------------------------

    # 6.Remove stop words
    stop_words = text.ENGLISH_STOP_WORDS.union(["book"])

    # joined once more because the lemmatizing approach below re-tokenizes the text
    no_stopword_text = ' '.join(word for word in candidate_words if word.lower() not in stop_words)

    #------------------------------------------------

    # 7.Lemmatization
    # The first letter of each POS tag selects the WordNet part of speech when it is
    # 'a', 'n' or 'v'; every other tag falls back to the lemmatizer's default.
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, tag in pos_tag(word_tokenize(no_stopword_text)):
        tag_initial = tag[0].lower()
        if tag_initial in ['a', 'n', 'v']:
            lemmas.append(lemmatizer.lemmatize(word, tag_initial))
        else:
            lemmas.append(lemmatizer.lemmatize(word))

    #------------------------------------------------

    # 8.Lowercase text
    return ' '.join(lemma.lower() for lemma in lemmas)

#----------------------------------------------------------------------------------------

def _add_transformed_column(dataset, source_column, target_column, transform):
    """Store ``transform`` applied to every value of ``dataset[source_column]`` in ``dataset[target_column]``."""
    tqdm.pandas()  # makes ``progress_apply`` (apply with a progress bar) available

    dataset.loc[:, target_column] = dataset.loc[:, source_column].progress_apply(transform)


def transform_actors(column_name, dataset):
    """Add 'clean_actors': the lowercased text of ``dataset[column_name]``.

    Meant for a column of single strings (e.g. "actors_unified"), not of lists,
    because the transformation is applied to each cell as one string.
    ``dataset`` is modified in place; nothing is returned.
    """
    _add_transformed_column(dataset, column_name, 'clean_actors', lambda value: value.lower())

def transform_plot(column_name, dataset):
    """Add 'clean_plot_summary': ``preprocess_text_movie_content`` applied to ``dataset[column_name]`` (in place)."""
    _add_transformed_column(dataset, column_name, 'clean_plot_summary', preprocess_text_movie_content)

def transform_features(column_name, dataset):
    """Add 'clean_combined_features': ``preprocess_text_movie_content`` applied to ``dataset[column_name]`` (in place)."""
    _add_transformed_column(dataset, column_name, 'clean_combined_features', preprocess_text_movie_content)

def transform_reviews(column_name, dataset):
    """Add 'clean_reviews': ``preprocess_text_reviews`` applied to ``dataset[column_name]`` (in place)."""
    _add_transformed_column(dataset, column_name, 'clean_reviews', preprocess_text_reviews)

def transform_movie_title(column_name, dataset): # added on 12.07.2020 in an attempt to plot similar movies
    """Add 'clean_movie_title': the lowercased text of ``dataset[column_name]`` (in place)."""
    _add_transformed_column(dataset, column_name, 'clean_movie_title', lambda value: value.lower())

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

#Function 2

def split_dataset(method, labels, dataset, split_ratio):
    """Shuffle and split the cleaned feature columns and the labels into train and test parts.

    Parameters
    ----------
    method : str
        "stratified" passes the labels to ``train_test_split`` as strata; any
        other value gives a plain random split. The stratified variant was
        reported not to work for this data, which is why a second function
        based on sklearn's StratifiedShuffleSplit was created.
    labels : array-like
        Target values, aligned with the rows of ``dataset``.
    dataset : pandas.DataFrame
        Must contain the cleaned text columns selected below.
    split_ratio : float or int
        Forwarded as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the split is seeded with 123.
    """
    # As mentioned earlier, "reduced_genres" is used and NOT the column "genres"
    feature_columns = ['title', 'clean_actors', 'clean_plot_summary', 'clean_combined_features',
                       'clean_reviews', 'clean_movie_title', 'reduced_genres']
    X = dataset[feature_columns]

    y = labels

    # stratify=None is train_test_split's default, i.e. an unstratified split
    strata = y if method == "stratified" else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_ratio, random_state=123, shuffle=True, stratify=strata)

    return X_train, X_test, y_train, y_test

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Function 3

def keras_tokenization(variable, maximum_words, dataset, x_train, x_test):
    """
    Fit a Keras Tokenizer on one text column and convert the train/test texts to integer sequences.

    The tokenizer is fitted on the given column of `dataset`. Its word index is then cut down to the
    entries with an index <= maximum_words, and the Out-of-Vocabulary token '<OOV>' is mapped to
    maximum_words + 1. The resulting sequences are written into a new column of `x_train` and
    `x_test` (both are modified in place).

    Parameters
    ----------
    variable : str
        One of "actors", "plot", "features", "reviews", "movie title".
    maximum_words : int
        Highest word index kept in the word index.
    dataset : pandas.DataFrame
        Frame holding the clean text column the tokenizer is fitted on.
    x_train, x_test : pandas.DataFrame
        Frames holding the same clean text column; they receive the sequence column.

    Returns
    -------
    (int, Tokenizer)
        The size of the final word index and the fitted tokenizer.

    Raises
    ------
    ValueError
        If `variable` is not one of the supported names.
    """
    # variable -> (clean text column, sequence column to create, token delimiter)
    column_specs = {
        "actors": ("clean_actors", "actors_seqs", ","),
        "plot": ("clean_plot_summary", "plot_summary_seqs", " "),
        "features": ("clean_combined_features", "combined_features_seqs", " "),
        "reviews": ("clean_reviews", "reviews_seqs", " "),
        "movie title": ("clean_movie_title", "movie_title_seqs", " "),
    }

    if variable not in column_specs:
        raise ValueError("Unknown variable {0!r}; expected one of: {1}".format(variable, ", ".join(column_specs)))

    text_column, sequence_column, delimiter = column_specs[variable]

    #The tokenizer class has some main bugs when the word index mapping is created. So the function is assembled based on this GitHub post https://github.com/keras-team/keras/issues/8092#issuecomment-372833486
    # The delimiter is used both as the only filtered character and as the split character.
    tokenizer = Tokenizer(num_words=maximum_words + 1, filters=delimiter, lower=True, split=delimiter, oov_token='<OOV>')

    tokenizer.fit_on_texts(list(dataset.loc[:, text_column]))

    unique_tokens = len(tokenizer.word_index)

    print("Maximum length of unique tokens is: {0}".format(unique_tokens))

    words_to_tokenize = int(np.ceil(unique_tokens * 0.95))

    # The sentence wording differs slightly per input in the original log output and is kept as is.
    rest_wording = "tokens are not tokenized" if variable == "plot" else "is not tokenized"
    summary_template = "Number of words to be tokenized is the 95% of those unique tokens, equal to: {0}\nThe rest 5% or {1} " + rest_wording

    print(summary_template.format(words_to_tokenize, unique_tokens - words_to_tokenize))

    # Keep only the entries with an index <= maximum_words and strip surrounding whitespace from the tokens.
    trimmed_word_index = {word.strip(): index for word, index in tokenizer.word_index.items() if index <= maximum_words}

    # Re-map the OOV token to the first index after the kept vocabulary.
    trimmed_word_index[tokenizer.oov_token] = maximum_words + 1

    tokenizer.word_index = trimmed_word_index

    print("Number of words mapped: {0}".format(len(tokenizer.word_index)))

    x_train.loc[:, sequence_column] = tokenizer.texts_to_sequences(x_train.loc[:, text_column])

    x_test.loc[:, sequence_column] = tokenizer.texts_to_sequences(x_test.loc[:, text_column])

    vocabulary_size_frequent_words = len(tokenizer.word_index)

    # Report (but do not fail) when the vocabulary is smaller/larger than requested.
    if vocabulary_size_frequent_words != maximum_words:
        rather_wording = "but rather is equal to" if variable == "actors" else "but rather equal to"
        error_template = "ERROR: The length of the vocabulary is not equal to the number of word_index dictionary, " + rather_wording + ": {0}\n\nCorrect length: {1}"
        print(error_template.format(vocabulary_size_frequent_words, maximum_words))

    return vocabulary_size_frequent_words, tokenizer

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 4

def mean(numbers):
    """Return the arithmetic mean of `numbers` rounded up to the next integer (0 for an empty input)."""
    return int(np.ceil(float(sum(numbers)) / max(len(numbers), 1)))

def padding_sequnce_length(variable, x_train, x_test):
    """
    Choose the length to which the sequences of one input column are padded/cropped.

    The 95th percentile of the sequence lengths is computed separately for the train and the
    test set (and truncated to an integer). The returned value is the mean of those two
    percentiles, rounded up. So for example, if 95% of the sequences have at most 20 tokens
    in both sets, all sequences will later be cropped or extended to 20 integers.

    Parameters
    ----------
    variable : str
        One of "actors", "plot", "features", "reviews", "movie title".
    x_train, x_test : pandas.DataFrame
        Frames holding the matching "<...>_seqs" column of token sequences.

    Returns
    -------
    int
        The padding length for the chosen column.

    Raises
    ------
    ValueError
        If `variable` is not one of the supported names.
    """
    # variable -> (sequence column, label used in the printed message)
    sequence_columns = {
        "actors": ("actors_seqs", "Actors"),
        "plot": ("plot_summary_seqs", "Plot Summary"),
        "features": ("combined_features_seqs", "Movie Features"),
        "reviews": ("reviews_seqs", "Movie Reviews"),
        "movie title": ("movie_title_seqs", "Movie Title"),
    }

    if variable not in sequence_columns:
        raise ValueError("Unknown variable {0!r}; expected one of: {1}".format(variable, ", ".join(sequence_columns)))

    sequence_column, label = sequence_columns[variable]

    all_train_lengths = list(x_train[sequence_column].apply(len))
    all_test_lengths = list(x_test[sequence_column].apply(len))

    maxlen_train = int(np.percentile(all_train_lengths, q=95))
    maxlen_test = int(np.percentile(all_test_lengths, q=95))

    # mean() of two equal values returns that value, so no special case is needed.
    maxlen_value = mean([maxlen_train, maxlen_test])

    print("Max Length of the pad sequence for {0}: {1}".format(label, maxlen_value))

    return maxlen_value

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

# Function 5

# The input data for a deep learning model must be a single tensor (of shape e.g. (batch_size, 6, vocab_size)),
# so samples that are shorter than the longest item need to be padded with some placeholder value.

#url https://www.tensorflow.org/guide/keras/masking_and_padding

def padding_sequence(variable, x_train, x_test, y_train, y_test, maxlen):
    """
    Pad (or crop) the token sequences of one input column to a fixed length.

    The padding length is normally the value returned by padding_sequnce_length.
    Zeros are appended at the end of short sequences (padding='post'); how longer
    sequences are cropped is left to the pad_sequences default.

    Parameters
    ----------
    variable : str
        One of "actors", "plot", "features", "reviews", "movie title".
    x_train, x_test : pandas.DataFrame
        Frames holding the matching "<...>_seqs" column of token sequences.
    y_train, y_test : sized
        Targets; only their length is used, to check that no row was lost.
    maxlen : int
        Length of every padded sequence.

    Returns
    -------
    (x_train_seq, x_test_seq)
        The padded train and test sequences as returned by pad_sequences.

    Raises
    ------
    ValueError
        If `variable` is not one of the supported names.
    """
    sequence_columns = {
        "actors": "actors_seqs",
        "plot": "plot_summary_seqs",
        "features": "combined_features_seqs",
        "reviews": "reviews_seqs",
        "movie title": "movie_title_seqs",
    }

    if variable not in sequence_columns:
        raise ValueError("Unknown variable {0!r}; expected one of: {1}".format(variable, ", ".join(sequence_columns)))

    sequence_column = sequence_columns[variable]

    x_train_seq = pad_sequences(x_train.loc[:, sequence_column], padding='post', maxlen=maxlen)

    x_test_seq = pad_sequences(x_test.loc[:, sequence_column], padding='post', maxlen=maxlen)

    # Every padded row must still have its label.
    assert len(x_train_seq) == len(y_train)

    assert len(x_test_seq) == len(y_test)

    return x_train_seq, x_test_seq

In [None]:
"""
Previously we experinced an error using the stratified sampling. Below we printed the number of genre sequences that are assigned to only one movie.
For those 131 movies the stratified sampling is failing to complete.
Thus, we should find their indexes and remove them. The final dataset should contain 49122-131=48991
"""
list_of_movies_to_remove=None
print("Number of movies that are assigned to only 1 sequence of genres: ", len(dataset_frequent_genres["reduced_genres"].apply(tuple).value_counts()[dataset_frequent_genres["reduced_genres"].apply(tuple).value_counts()==1]), '\n')
list_of_movies_to_remove=dataset_frequent_genres["reduced_genres"].apply(tuple).value_counts()[dataset_frequent_genres["reduced_genres"].apply(tuple).value_counts()==1].index.tolist()
list_of_movies_to_remove=[list(x) for x in list_of_movies_to_remove]
assert type(list_of_movies_to_remove[0]) is list

In [None]:
"""
Below are the indexes of rows that should be removed from the dataset. In total 131 indexes.
With those final 48991 rows of the dataset, the stratified sampling will be successfully completed.
"""
indexes_to_remove=dataset_frequent_genres['reduced_genres'].map(lambda x: 1 if list(x) in list_of_movies_to_remove else 0)[dataset_frequent_genres['reduced_genres'].map(lambda x: 1 if x in list_of_movies_to_remove else 0)==1].index.tolist()
dataset_frequent_genres=dataset_frequent_genres[~dataset_frequent_genres.index.isin(indexes_to_remove)]
dataset_frequent_genres=dataset_frequent_genres.reset_index(drop=True)
dataset_frequent_genres.shape

In [None]:
# Serialise the release year of every remaining movie.
year_list=dataset_frequent_genres['year'].values.tolist()

# os.path.join keeps the path valid on every operating system (a hard-coded "\\" only works on Windows).
year_list_path = os.path.join('pickled_data_per_part', 'year_list_{0}.pkl'.format(version_data_control))

with open(year_list_path, 'wb') as f:
    pickle.dump(year_list, f)

In [None]:
# Serialise the title of every remaining movie.
movie_title_list=dataset_frequent_genres['title'].values.tolist()

# os.path.join keeps the path valid on every operating system (a hard-coded "\\" only works on Windows).
movie_title_list_path = os.path.join('pickled_data_per_part', 'movie_title_list_{0}.pkl'.format(version_data_control))

with open(movie_title_list_path, 'wb') as f:
    pickle.dump(movie_title_list, f)

In [None]:
"""
This code shell may not be executed since it's already pickled after the transformation functions have been applied to each
column, which will be later used as model input.
"""
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Transfrom the columns:
# -> Actors
# -> Plot summary
# -> Movie Features
# -> Reviews
# -> Movie Title

print("Start Execution")
begin_time=time.time()
print("---------------------------------------------------------------------------------\n")
print("Transfrom the column of the actors")
start_time_one=time.time()
transform_actors("actors_unified", dataset_frequent_genres) # function 3: transform_actors
print("Finished the actors transformation after: {0}\n".format(format_timespan(time.time()-start_time_one)))

print("Transfrom the column of the plot summary")
start_time_two=time.time()
transform_plot("plot", dataset_frequent_genres) # function 3: transform_plot
print("Finished the plot transformation after: {0}\n".format(format_timespan(time.time()-start_time_two)))

print("Transfrom the column of the movie features")
start_time_three=time.time()
transform_features("movie_features", dataset_frequent_genres) # function 3: transform_features
print("Finished the movie_features transformation after: {0}\n".format(format_timespan(time.time()-start_time_three)))

print("Transfrom the column of the movie reviews")
start_time_four=time.time()
transform_reviews("reviews_pruned", dataset_frequent_genres) # function 3: transform_reviews
print("Finished the reviews_pruned transformation after: {0}".format(format_timespan(time.time()-start_time_four)))

print("Transfrom the column of the movie title")
start_time_five=time.time()
transform_movie_title("title", dataset_frequent_genres)
print("Finished the movie title transformation after: {0}".format(format_timespan(time.time()-start_time_five)))

print("---------------------------------------------------------------------------------\n")
print("Finished Execution after: {0}".format(format_timespan(time.time()-begin_time)))

# Total time to transform the columns: 30 minutes and 35.95 seconds on CPU: i7 9th Generation and GPU: NVidia 1660Ti

In [None]:
# Sanity check after the actors transformation: names of 1, 2 or 3 letters should be gone.
# Show the movies that have at least one actor token of exactly 4 characters.
actor_tokens = dataset_frequent_genres['clean_actors'].str.split(",").explode()
mask = actor_tokens.str.len() == 4
rows_with_match = np.unique(mask[mask].index)  # one (sorted) label per movie
res = dataset_frequent_genres.loc[rows_with_match, ['title', 'clean_actors']]
display(res)

In [None]:
"""
Before pre-processing the raw text of the first review about Toy Story. Text has been many stop words, punctuations and words in many different tense!
"""
dataset_frequent_genres['reviews'].iloc[0][0]

In [None]:
"""
After pre-processing the raw text of the first review about Toy Story. Text has been lemmatized and cleaned off most of the noise!
"""
dataset_frequent_genres['clean_reviews'].iloc[0]

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Option 1: Stratified Shuffle Split using the train_test_split function (train, test)

In [None]:
"""
This is the first way to split the dataset, by using the random shuffle split of the Train_test_split function offered by sklearn module
This version was the first to be developed and followed, however we decided to try a second more robust option.
The second option refers to the data separation into train, validation and test set using the StratifiedShuffleSPlit function developed and mainted by sklearn module.

In cases of imbalanced datasets and specifically for classification models, the stratification comes in handy because it ensures that the data will be splitted uniformly and both the train and test sets, will enclude all the categorical variables.
"""
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Split the dataset into train & set sets
print("\n---------------------------------------------------------------------------------")
print("\nSplit the dataset into train & test sets (stratified shuffle split)\n")
start_time=time.time()
X_train, X_test, y_train, y_test = split_dataset("stratify", dataset_frequent_genres.iloc[:, 13:30], dataset_frequent_genres, 0.2)
print("Finished the plot transformation after: {0}".format(format_timespan(time.time()-start_time)))

In [None]:
"""
The shape of the X_train, X_test, y_train, y_test splitted and shuffled randomly
"""
print("X_train shape:{}".format(X_train.shape))
print("X_test shape:{}".format(X_test.shape))
print("y_train shape:{}".format(y_train.shape))
print("y_test shape:{}".format(y_test.shape))

In [None]:
# Share (in %) of every genre over all movies of the dataset
dataset_frequent_genres['reduced_genres'].explode().value_counts(normalize=True).mul(100).round(3)

In [None]:
"""
The stratification worked!
"""
round(X_train.reduced_genres.explode().value_counts(normalize=True)*100,3)

In [None]:
"""
The stratification worked!
"""
round(X_test.reduced_genres.explode().value_counts(normalize=True)*100,3)

In [None]:
"""
The below cell serialises the X_train, X_test, y_train, y_test inputs created by the stratified split.
"""
X_train.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\X_train_all_inputs_{0}.pkl'.format(version_data_control)))
X_test.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\X_test_all_inputs_{0}.pkl'.format(version_data_control)))
y_train.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\y_train_all_inputs_{0}.pkl'.format(version_data_control)))
y_test.to_pickle(os.path.join(os.getcwd(), 'pickled_data_per_part\\y_test_all_inputs_{0}.pkl'.format(version_data_control)))

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [None]:
# Reload the serialised train/test inputs and labels.
# Build the paths with os.path.join so they are valid on every operating system
# (a hard-coded "\\" separator only works on Windows).
# NOTE: unpickling executes code stored in the file - only load pickles produced by this notebook.
pickled_data_dir = os.path.join(os.getcwd(), 'pickled_data_per_part')

X_train=pd.read_pickle(os.path.join(pickled_data_dir, 'X_train_all_inputs_{0}.pkl'.format(version_data_control)))
X_test=pd.read_pickle(os.path.join(pickled_data_dir, 'X_test_all_inputs_{0}.pkl'.format(version_data_control)))
y_train=pd.read_pickle(os.path.join(pickled_data_dir, 'y_train_all_inputs_{0}.pkl'.format(version_data_control)))
y_test=pd.read_pickle(os.path.join(pickled_data_dir, 'y_test_all_inputs_{0}.pkl'.format(version_data_control)))

In [None]:
"""
Now that the data is splitted, we separated each column of interest to a different X_train and X_test
Those train and text X sets will be later used for tokenization and padding
"""
# Separate each different input column (actors, plot, features, reviews, title)

X_train_actors = X_train[["title", "clean_actors", "reduced_genres"]]
X_train_plot = X_train[["title", "clean_plot_summary", "reduced_genres"]]
X_train_features = X_train[["title", "clean_combined_features", "reduced_genres"]]
X_train_reviews = X_train[["title", "clean_reviews", "reduced_genres"]]
X_train_title = X_train[["title", "clean_movie_title", "reduced_genres"]]

# In X_train and X_test I also use columns "title" and "genres" since they will be both used later for making inference with predictions
assert X_train_actors.shape==X_train_plot.shape==X_train_features.shape==X_train_reviews.shape==X_train_title.shape

X_test_actors = X_test[["title", "clean_actors", "reduced_genres"]]
X_test_plot = X_test[["title", "clean_plot_summary", "reduced_genres"]]
X_test_features = X_test[["title", "clean_combined_features", "reduced_genres"]]
X_test_reviews = X_test[["title", "clean_reviews", "reduced_genres"]]
X_test_title = X_test[["title", "clean_movie_title", "reduced_genres"]]

assert X_test_actors.shape==X_test_plot.shape==X_test_features.shape==X_test_reviews.shape==X_test_title.shape

In [None]:
# Sanity check on the test set: names of 1, 2 or 3 letters should be gone.
# Show the test movies that have at least one actor token of exactly 4 characters.
actor_tokens = X_test_actors['clean_actors'].str.split(",").explode()
mask = actor_tokens.str.len() == 4
rows_with_match = np.unique(mask[mask].index)  # one (sorted) label per movie
res = X_test_actors.loc[rows_with_match, ['title', 'clean_actors']]
display(res)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Token Frequency to determine best value for MAX_FREQUENCY_WORDS used later in word tokenization

In [None]:
%%time
"""
Find the most frequent words among the actor names
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
-> Probably the infrequent tokens will make a better classification
"""
def actors_split(s):
    return s.split(',')

corpus_actors=dataset_frequent_genres['clean_actors'].values.tolist()
c_vectorizer=CountVectorizer(tokenizer=actors_split, min_df=1) #keep this to 1 to include all the words/tokens

c_vectorizer.fit(corpus_actors)
print("Vocabulary length of CountVectorizer of the actors corpus: {0}".format(len(c_vectorizer.vocabulary_)))

X=c_vectorizer.fit_transform(corpus_actors)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_actors=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_actors.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
actors_frequency_dictionary=dict(zip(tokens_list, count_list))
d_actors = dict((k, v) for k, v in actors_frequency_dictionary.items() if v >= 80) # v = popularity (frequency) or number of movies played

# sorted(d_actors.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# Report how many actors fall inside / outside the 95% share of the vocabulary
# of the most recently fitted CountVectorizer.
total_actors = len(c_vectorizer.vocabulary_)
print("The total number of actors that exist in the dataset is: {}".format(total_actors))
actors_tokenized = int(np.ceil(total_actors * 0.95))
actors_left_out = total_actors - actors_tokenized
print("The 95% ({0}) of the actors will be tokenized and the rest 5% ({1}) of the actors will be removed due to sparsity".format(actors_tokenized, actors_left_out))

In [None]:
%%time
"""
Find the most frequent words among the movie plots
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def plot_split(s):
    return s.split(' ')

corpus_plot=dataset_frequent_genres['clean_plot_summary'].values.tolist()
c_vectorizer=CountVectorizer(tokenizer=plot_split, min_df=1)

c_vectorizer.fit(corpus_plot)
print("Vocabulary length of CountVectorizer of the plot corpus: {0}".format(len(c_vectorizer.vocabulary_)))

X=c_vectorizer.fit_transform(corpus_plot)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df_plot=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df_plot.reset_index(drop=True)
token_frequency_df_plot=token_frequency_df_plot.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df_plot.shape)
#token_frequency_df_pruned_plot=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_plot.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
plot_frequency_dictionary=dict(zip(tokens_list, count_list))
d_plot = dict((k, v) for k, v in plot_frequency_dictionary.items() if v >= 1)

In [None]:
# Report how many plot tokens fall inside / outside the 95% share of the vocabulary
# of the most recently fitted CountVectorizer.
print("The total number of plot tokens that exist in the dataset is: {}".format(len(c_vectorizer.vocabulary_)))
plot_words_tokenized=int(np.ceil(len(c_vectorizer.vocabulary_)*0.95))
# The message previously said "of the actors" (copy-paste from the actors cell).
print("The 95% ({0}) of the plot summary tokens will be tokenized and the rest 5% ({1}) of the plot summary tokens will be removed due to sparsity".format(plot_words_tokenized, (len(c_vectorizer.vocabulary_)-plot_words_tokenized)))

In [None]:
%%time
"""
Find the most frequent words among the movie features
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def movie_features_split(s):
    return s.split(' ')

corpus_features=dataset_frequent_genres['clean_combined_features'].values.tolist()
c_vectorizer=CountVectorizer(tokenizer=movie_features_split, min_df=2)

c_vectorizer.fit(corpus_features)
print("Vocabulary length of CountVectorizer of the features corpus: {0}".format(len(c_vectorizer.vocabulary_)))

X=c_vectorizer.fit_transform(corpus_features)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_features=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_features.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
features_frequency_dictionary=dict(zip(tokens_list, count_list))
d_features = dict((k, v) for k, v in features_frequency_dictionary.items() if v >= 90)

# sorted(d_features.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# Total number of occurrences of the token 'drama' in the movie-features corpus
features_frequency_dictionary['drama']

In [None]:
# Report how many feature tokens fall inside / outside the 95% share of the vocabulary
# of the most recently fitted CountVectorizer.
print("The total number of feature tokens that exist in the dataset is: {}".format(len(c_vectorizer.vocabulary_)))
features_words_tokenized=int(np.ceil(len(c_vectorizer.vocabulary_)*0.95))
# The message previously said "of the actors" (copy-paste from the actors cell).
print("The 95% ({0}) of the feature tokens will be tokenized and the rest 5% ({1}) of the feature tokens will be removed due to sparsity".format(features_words_tokenized, (len(c_vectorizer.vocabulary_)-features_words_tokenized)))

In [None]:
%%time
"""
Find the most frequent words among the movie reviews
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def reviews_split(s):
    return s.split(' ')

# Token-frequency statistics for the cleaned movie reviews.
corpus_reviews=dataset_frequent_genres['clean_reviews'].values.tolist()
c_vectorizer=CountVectorizer(tokenizer=reviews_split, min_df=1)

# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass; calling fit() first would only repeat the same work on the corpus.
X=c_vectorizer.fit_transform(corpus_reviews)
print("Vocabulary length of CountVectorizer of the reviews corpus: {0}".format(len(c_vectorizer.vocabulary_)))
X_words=c_vectorizer.inverse_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(c_vectorizer, 'get_feature_names_out'):
    tokens_list=c_vectorizer.get_feature_names_out().tolist()
else:
    tokens_list=c_vectorizer.get_feature_names()
# Column sums of the document-term matrix = total occurrences of each token.
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': tokens_list, 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_reviews=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_reviews.shape)

# Token -> total count for every review token, plus the subset of tokens that
# occur at least 100 times.
reviews_frequency_dictionary=dict(zip(tokens_list, count_list))
d_reviews = dict((k, v) for k, v in reviews_frequency_dictionary.items() if v >= 100)

# sorted(d_reviews.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# Spot-check: show the count stored for the token 'film'.
reviews_frequency_dictionary['film']

In [None]:
# Size the tokenizer vocabulary for the reviews corpus: 95% of the distinct
# tokens (rounded up) are kept, the remaining 5% are dropped.
# reviews_frequency_dictionary holds one entry per vocabulary token, so its
# length equals the reviews vocabulary size. Unlike c_vectorizer, it is not
# overwritten by the movie-title cell, so re-running this cell later still
# reports the reviews numbers.
reviews_vocabulary_size = len(reviews_frequency_dictionary)
print("The total number of review tokens that exist in the dataset is: {}".format(reviews_vocabulary_size))
reviews_words_tokenized = int(np.ceil(reviews_vocabulary_size * 0.95))
print("The 95% ({0}) of the review tokens will be tokenized and the rest 5% ({1}) of the review tokens will be removed due to sparsity".format(reviews_words_tokenized, reviews_vocabulary_size - reviews_words_tokenized))

In [None]:
%%time
"""
Find the most frequent words among the movie titles
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def movie_title_split(s):
    """Cut a cleaned movie title into tokens on single space characters.

    Passed as the ``tokenizer`` callback of the CountVectorizer built below.
    Consecutive spaces produce empty-string tokens, exactly as
    ``str.split(' ')`` does.
    """
    tokens = s.split(sep=' ')
    return tokens

# Token-frequency statistics for the cleaned movie titles.
corpus_title=dataset_frequent_genres['clean_movie_title'].values.tolist()
# min_df=2: a token has to appear in at least two titles to enter the vocabulary.
c_vectorizer=CountVectorizer(tokenizer=movie_title_split, min_df=2)

# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass; calling fit() first would only repeat the same work on the corpus.
X=c_vectorizer.fit_transform(corpus_title)
print("Vocabulary length of CountVectorizer of the title corpus: {0}".format(len(c_vectorizer.vocabulary_)))
X_words=c_vectorizer.inverse_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(c_vectorizer, 'get_feature_names_out'):
    tokens_list=c_vectorizer.get_feature_names_out().tolist()
else:
    tokens_list=c_vectorizer.get_feature_names()
# Column sums of the document-term matrix = total occurrences of each token.
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': tokens_list, 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_title=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_title.shape)

# Token -> total count for every title token, plus the subset of tokens that
# occur at least 100 times.
title_frequency_dictionary=dict(zip(tokens_list, count_list))
d_title = dict((k, v) for k, v in title_frequency_dictionary.items() if v >= 100)

In [None]:
# Size the tokenizer vocabulary for the movie-title corpus: 95% of the
# distinct tokens (rounded up) are kept, the remaining 5% are dropped.
# title_frequency_dictionary holds one entry per vocabulary token, so its
# length equals the title vocabulary size without depending on whichever
# corpus the shared c_vectorizer variable was last fitted on.
title_vocabulary_size = len(title_frequency_dictionary)
print("The total number of movie title tokens that exist in the dataset is: {}".format(title_vocabulary_size))
title_words_tokenized = int(np.ceil(title_vocabulary_size * 0.95))
print("The 95% ({0}) of the movie title tokens will be tokenized and the rest 5% ({1}) of the movie title tokens will be removed due to sparsity".format(title_words_tokenized, title_vocabulary_size - title_words_tokenized))

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [None]:
# Make sure the output folder for this data version exists before any cell
# saves into it.
# NOTE(review): the "\\" separator is Windows-style; it is kept unchanged
# because the save cells of this notebook build their paths with the same literal.
tokenization_output_dir = os.path.join(os.getcwd(), "80-20 split_non-balanced\\text_tokenization_padded_sequences_{0}".format(version_data_control))
if os.path.exists(tokenization_output_dir):
    print("Folder already exists!\n")
else:
    print("Folder not found!\n")
    # makedirs also creates a missing parent folder, where mkdir would raise
    # FileNotFoundError.
    os.makedirs(tokenization_output_dir, exist_ok=True)
    print("Folder is created!\n")

In [None]:
"""
Data tokenization is one of the most important parts when dealing with text data.
Since I am going to deploy keras models, I use the python api of Keras Tokenizer,
more details about its use on: https://keras.io/preprocessing/text/
"""
def _tokenize_and_pickle(title, corpus_name, num_words, X_train_part, X_test_part, file_stem, finish_label):
    """Run keras_tokenization for one text column and pickle the tokenizer it returns.

    Parameters
    ----------
    title : str
        Heading used in the progress messages, e.g. "Actors".
    corpus_name : str
        First argument forwarded to keras_tokenization, e.g. "actors".
    num_words : int
        Second argument forwarded to keras_tokenization (the ``*_tokenized``
        word count of the column); also embedded in the pickle file name.
    X_train_part, X_test_part
        Train/test objects forwarded unchanged to keras_tokenization.
    file_stem : str
        Prefix of the pickle file name (``<file_stem>_tokenizer_...pkl``).
    finish_label : str
        Corpus wording used in the closing timing message.

    Returns
    -------
    tuple
        The two values returned by keras_tokenization, in the same order.
    """
    step_start = time.time()
    print("------------\n{0} corpus\n------------".format(title))
    vocabulary_size, tokenizer = keras_tokenization(corpus_name, num_words, dataset_frequent_genres, X_train_part, X_test_part) # function 5: keras_tokenization
    print("{0} tokenized with maximum number of words: {1}".format(title, vocabulary_size))

    # Same file-name pattern for every column; only the stem and the word count vary.
    pickle_path = os.path.join(os.getcwd(), '80-20 split_non-balanced\\text_tokenization_padded_sequences_{0}\\{3}_tokenizer_{1}_{2}.pkl'.format(version_data_control, str(num_words), version_data_control, file_stem))
    with open(pickle_path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Finished the {0} corpus tokenization after: {1}\n".format(finish_label, format_timespan(time.time()-step_start)))
    return vocabulary_size, tokenizer


# Tokenize the dataset (using the keras tokenizer), one trainable text column at a time.
print("\n---------------------------------------------------------------------------------")
print("\nTokenize the dataset (using the keras tokenizer class)\n")
begin_time=time.time()

# Actors
vocabulary_size_frequent_words_actors, tokenizer_actors = _tokenize_and_pickle("Actors", "actors", actors_tokenized, X_train_actors, X_test_actors, "actors", "actors")

# Plot summary
vocabulary_size_frequent_words_plot, tokenizer_plot = _tokenize_and_pickle("Plot Summary", "plot", plot_words_tokenized, X_train_plot, X_test_plot, "plot", "plot")

# Movie features
vocabulary_size_frequent_words_features, tokenizer_features = _tokenize_and_pickle("Movie Features", "features", features_words_tokenized, X_train_features, X_test_features, "features", "movie features")

# Movie reviews
vocabulary_size_frequent_words_reviews, tokenizer_reviews = _tokenize_and_pickle("Movie Reviews", "reviews", reviews_words_tokenized, X_train_reviews, X_test_reviews, "reviews", "movie reviews")

# Movie title
vocabulary_size_frequent_words_title, tokenizer_title = _tokenize_and_pickle("Movie Title", "movie title", title_words_tokenized, X_train_title, X_test_title, "title", "movie title")

print("Finished tokenization of all 5 trainable columns after: {0}".format(format_timespan(time.time()-begin_time)))

In [None]:
# Stand-alone run of the actors step only: tokenize the actors column, report
# the vocabulary size, pickle the tokenizer and print the elapsed time.
start_time_one = time.time()
print("------------\nActors corpus\n------------")
vocabulary_size_frequent_words_actors, tokenizer_actors = keras_tokenization(
    "actors",
    actors_tokenized,
    dataset_frequent_genres,
    X_train_actors,
    X_test_actors,
)  # function 5: keras_tokenization
print("Actors tokenized with maximum number of words: {}".format(vocabulary_size_frequent_words_actors))

# Pickle the Actors Tokenizer
actors_tokenizer_path = os.path.join(
    os.getcwd(),
    '80-20 split_non-balanced\\text_tokenization_padded_sequences_{0}\\actors_tokenizer_{1}_{2}.pkl'.format(version_data_control, str(actors_tokenized), version_data_control),
)
with open(actors_tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer_actors, handle, protocol=pickle.HIGHEST_PROTOCOL)
elapsed_actors = time.time() - start_time_one
print("Finished the actors corpus tokenization after: {0}\n".format(format_timespan(elapsed_actors)))

In [None]:
# Collect the number of words kept for each trainable text column in a single
# dictionary (keys in the order actors, plot, features, reviews, title).
words_tokenized_per_trainable_feature = {
    'actors_tokenized': actors_tokenized,
    'plot_words_tokenized': plot_words_tokenized,
    'features_words_tokenized': features_words_tokenized,
    'reviews_words_tokenized': reviews_words_tokenized,
    'title_words_tokenized': title_words_tokenized,
}
words_tokenized_per_trainable_feature

In [None]:
# Store the per-column word counts next to the pickled tokenizers.
words_tokenized_path = os.path.join(
    os.getcwd(),
    '80-20 split_non-balanced\\text_tokenization_padded_sequences_{0}\\words_tokenized_{0}.pkl'.format(version_data_control),
)
with open(words_tokenized_path, 'wb') as handle:
    pickle.dump(words_tokenized_per_trainable_feature, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Inspect the 'reviews_seqs' column of the training reviews frame.
X_train_reviews.loc[:, 'reviews_seqs']

#### Comment: The three code blocks below were executed once and their outputs were then pickled!

In [None]:
# Step 1: determine the padding length (maxlen) of every trainable text column.
print(
    "\n---------------------------------------------------------------------------------",
    "\nSpecify the length of the maxlen variable (length is a parameter for the optimal padding execution)\n",
    sep="\n",
)

maxlen_actors = padding_sequnce_length(
    "actors", X_train_actors, X_test_actors
)
maxlen_plot = padding_sequnce_length(
    "plot", X_train_plot, X_test_plot
)
maxlen_features = padding_sequnce_length(
    "features", X_train_features, X_test_features
)
maxlen_reviews = padding_sequnce_length(
    "reviews", X_train_reviews, X_test_reviews
)
maxlen_title = padding_sequnce_length(
    "movie title", X_train_title, X_test_title
)

# Step 2: build the padded train/test sequences of every column with the
# lengths computed in step 1.
print(
    "\n---------------------------------------------------------------------------------",
    "\nCreate the padding sequence of texts\n",
    sep="\n",
)

X_train_seq_actors, X_test_seq_actors = padding_sequence(
    "actors", X_train_actors, X_test_actors, y_train, y_test, maxlen_actors
)
print("\nActors padded sequences created\n")

X_train_seq_plot, X_test_seq_plot = padding_sequence(
    "plot", X_train_plot, X_test_plot, y_train, y_test, maxlen_plot
)
print("Plot padded sequences created\n")

X_train_seq_features, X_test_seq_features = padding_sequence(
    "features", X_train_features, X_test_features, y_train, y_test, maxlen_features
)
print("Movie Features padded sequences created\n")

X_train_seq_reviews, X_test_seq_reviews = padding_sequence(
    "reviews", X_train_reviews, X_test_reviews, y_train, y_test, maxlen_reviews
)
print("Movie Reviews padded sequences created\n")

X_train_seq_title, X_test_seq_title = padding_sequence(
    "movie title", X_train_title, X_test_title, y_train, y_test, maxlen_title
)
print("Movie Title padded sequences created")

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### #1st case of data: 80-20 split and non-balanced dataset!

* X_train & X_test with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [None]:
# Report the shape of every padded sequence array of the 80-20 split:
# first the training arrays, then a blank line, then the test arrays.
_train_sequences = [
    ("X_train_seq_actors", X_train_seq_actors),
    ("X_train_seq_plot", X_train_seq_plot),
    ("X_train_seq_features", X_train_seq_features),
    ("X_train_seq_reviews", X_train_seq_reviews),
    ("X_train_seq_title", X_train_seq_title),
]
_test_sequences = [
    ("X_test_seq_actors", X_test_seq_actors),
    ("X_test_seq_plot", X_test_seq_plot),
    ("X_test_seq_features", X_test_seq_features),
    ("X_test_seq_reviews", X_test_seq_reviews),
    ("X_test_seq_title", X_test_seq_title),
]

for _label, _sequences in _train_sequences:
    print("{0} shape:{1}".format(_label, _sequences.shape))
print()

for _label, _sequences in _test_sequences:
    print("{0} shape:{1}".format(_label, _sequences.shape))

* y_train & y_test with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [None]:
# Report the shape of the target arrays of the 80-20 split.
for _label, _targets in (("y_train", y_train), ("y_test", y_test)):
    print("{0} shape:{1}".format(_label, _targets.shape))

In [None]:
# Save every padded sequence array and both target arrays of the 80-20
# non-balanced split into the versioned output folder.
# Placeholders: {0} = data version, {1} = number of tokenized words, {2} = file stem.
_sequence_path_template = "80-20 split_non-balanced\\text_tokenization_padded_sequences_{0}\\{2}_80-20_non-balanced_{1}_{0}"
# Placeholders: {0} = data version, {1} = file stem.
_target_path_template = "80-20 split_non-balanced\\text_tokenization_padded_sequences_{0}\\{1}_80-20_non-balanced_{0}"

_sequences_to_save = [
    ("x_train_seq_actors", actors_tokenized, X_train_seq_actors),
    ("x_train_seq_plot", plot_words_tokenized, X_train_seq_plot),
    ("x_train_seq_features", features_words_tokenized, X_train_seq_features),
    ("x_train_seq_reviews", reviews_words_tokenized, X_train_seq_reviews),
    ("x_train_seq_title", title_words_tokenized, X_train_seq_title),
    ("x_test_seq_actors", actors_tokenized, X_test_seq_actors),
    ("x_test_seq_plot", plot_words_tokenized, X_test_seq_plot),
    ("x_test_seq_features", features_words_tokenized, X_test_seq_features),
    ("x_test_seq_reviews", reviews_words_tokenized, X_test_seq_reviews),
    ("x_test_seq_title", title_words_tokenized, X_test_seq_title),
]
for _stem, _num_words, _sequences in _sequences_to_save:
    _relative_path = _sequence_path_template.format(version_data_control, str(_num_words), _stem)
    np.save(os.path.join(os.getcwd(), _relative_path), _sequences)

for _stem, _targets in (("y_train", y_train), ("y_test", y_test)):
    _relative_path = _target_path_template.format(version_data_control, _stem)
    np.save(os.path.join(os.getcwd(), _relative_path), _targets)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Pickle the "dataset_frequent_genres" with the added cleaned columns of actors, plot, features and reviews

#### Pickle the X_test dataset for use in part 3.2

In [None]:
# List the columns of dataset_frequent_genres before it is pickled.
dataset_frequent_genres.columns

In [None]:
# Pickle the full dataset_frequent_genres frame under the current data version.
dataset_pickle_path = os.path.join(
    os.getcwd(),
    "pickled_data_per_part\\dataset_part_3.1_{0}.pkl".format(version_data_control),
)
dataset_frequent_genres.to_pickle(dataset_pickle_path)

In [None]:
# Pickle the X_test frame into the versioned tokenization output folder.
x_test_pickle_path = os.path.join(
    os.getcwd(),
    "80-20 split_non-balanced\\text_tokenization_padded_sequences_{0}\\x_test_{0}.pkl".format(version_data_control),
)
X_test.to_pickle(x_test_pickle_path)

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### THIS IS THE END OF PART 3.1 - Where tokenization, cleaning and balancing of the data took place.