## Part 3.1 - Data Tokenization & Sequence padding

Set the version data control parameter (to save the outputs of this notebook at their latest date)

In [22]:
version_data_control="22022021"
datasets_path="C://Users//spano//Desktop//nlp_github//datasets" #change this path based on your local folder

#### Import the libraries

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from tabulate import tabulate
import re
import os
import time
from humanfriendly import format_timespan
import random

# Module to serialize the content produced from the execution of the code
import joblib

# Module to monitor the progress of a python for loop
from tqdm import tqdm

# Module to manipulate text in python - NLTK package
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

import spacy
import unidecode
from word2number import w2n

# Module to compute word vectorizers and compute the cosine distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction import text

import string
import itertools

from scipy import stats

#Import matplotlib for data visualization
%matplotlib inline
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from pylab import rcParams
from packaging import version

# from IPython.core.display import display,HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
#!python -m spacy download en_core_web_md

#### Keras Tokenization and Plotting Libraries

In [3]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import MultiLabelBinarizer

#--------------------------------------------------------------
import tensorflow as tf
from tensorflow import keras

print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).release[0] >= 2, "This notebook requires TensorFlow 2.0 or above."

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

TensorFlow version:  2.4.1
Version:  2.4.1
Eager mode:  True
GPU is available


In [4]:
from IPython.display import HTML
import random

def hide_toggle(for_next=False):
    this_cell = """$('div.cell.code_cell.rendered.selected')"""
    next_cell = this_cell + '.next()'

    toggle_text = 'Toggle show/hide'  # text shown on toggle link
    target_cell = this_cell  # target cell to control with toggle
    js_hide_current = ''  # bit of JS to permanently hide code in current cell (only when toggling next cell)

    if for_next:
        target_cell = next_cell
        toggle_text += ' next cell'
        js_hide_current = this_cell + '.find("div.input").hide();'

    js_f_name = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    html = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """.format(
        f_name=js_f_name,
        cell_selector=target_cell,
        js_hide_current=js_hide_current, 
        toggle_text=toggle_text
    )

    return HTML(html)

#### -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

In [5]:
"""Import the dataset corrected and enriched from part 2.1. This dataset will be tokenized and transformed in order to meet Tensorflow guidelines for NLP applications & research."""
dataset=joblib.load(f"{datasets_path}//dataset_part_3.1_22022021_light.pkl")

In [6]:
"""
Multi-hot encoding is a good practice to transform the value y into a data structure appropriate for multi-label text calssification.
Basically it creates a single column per genre with 0,1 binary values if the movie has the specific genre tag or not.
"""
mlb = MultiLabelBinarizer()
dataset_nlp_tokenization=dataset.join(pd.DataFrame(mlb.fit_transform(dataset["reduced_genres"]),
                                                                     columns=mlb.classes_,
                                                                     index=dataset.index))
dataset_nlp_tokenization.shape

(48834, 36)

In [7]:
"""Create the list of genres and serialize it. The genres of this list will be the dependent variable of the model predictor. The value the model classifier will try to predict."""
genres_list=dataset_nlp_tokenization["reduced_genres"].explode().value_counts(normalize=True).index.to_list()
genres_list.sort()
print(genres_list)
joblib.dump(genres_list,f"general_data_samples//genres_list_{version_data_control}.pkl")

['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


['genres_list_22022021.pkl']

In [8]:
def clean_single_letter_actors(cast_list):
    """
    Purpose: We observed that a portion of actors had names consisting of one, two, or three words. We decided to remove such actors from the cast of each movie. 
    This action improved a lot the result of the data cleaning and tokenization.
    Arguments: Actors list per movie (row) of the dataset.
    Output: The actors list with short actor names deleted.
    """
    cleaned_actors=[actor for actor in cast_list if len(actor)>=4]
    return cleaned_actors
tqdm.pandas()
dataset_nlp_tokenization["actors_cleaned"]=dataset_nlp_tokenization["actors"].progress_apply(lambda x: clean_single_letter_actors(x))

100%|████████████████████████████████████████████████████████████████████████| 48834/48834 [00:00<00:00, 428153.21it/s]


In [9]:
"""Check the movies with actor names equal to length N, where N=any positive number. For example below, we check the movies with at leat one actor with length 4"""
mask=dataset_nlp_tokenization["actors_cleaned"].explode().str.len().eq(3)
res=dataset_nlp_tokenization[['title', 'actors_cleaned']].loc[np.unique(mask.loc[mask].index)]
res

Unnamed: 0,title,actors_cleaned


In [10]:
"""Create a function that will unify the actors into a single corpus text"""
def unify_actors(row):
    """
    Purpuse: Join the actor names into a single line
    Arguments: The row of the dataset
    Output: The list of actors per movie into a single line of text separated by commas. It's important to remember the comma separator because it will be used later in actors tokenization.
    """
    return ",".join(row["actors_cleaned"]).strip()
tqdm.pandas()
dataset_nlp_tokenization["actors_unified"]=dataset_nlp_tokenization.progress_apply(unify_actors, axis=1)

100%|█████████████████████████████████████████████████████████████████████████| 48834/48834 [00:00<00:00, 82905.52it/s]


In [99]:
"""
Functions used across the whole notebook.
Those functions are explisetely used to pre-process the raw data input of texts
"""
from text2digits import text2digits
# -------------------------------------------------------------------------------------
# Function 1
nlp=spacy.load('en_core_web_md')
def decontracted(phrase):
    """
    Purpose: Expand contractions and abbreviations in the text. Also replace the text format of a number(i.e million) with its equivallent numeric format (i.e 1000000)
    Argument: The text related to movie content such as Plot summary, Reviews
    Output: The corpus text cleaned off abbreviations and contractions
    """
    # General contractions
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r'(?<=[.,"])(?=[^\s])', "", phrase)
    phrase = phrase.replace("L. A.", "Los Angeles")\
                   .replace("U. S.", "United States")\
                   .replace("U. K.", "United Kindom")\
                   .replace("a.k.a", "")\
                   .replace("a. k. a", "")\
                   .replace("sci-fi", "science fiction")\
                   .replace("U-boat", "submarine")\
                   .replace("N-bomb", "nuclear bomb")\
                   .replace("S&amp;M", "")\
                   .replace("XIX-th", "19")\
                   .replace("Twice-orphaned", "two times orphaned")\
                   .replace("(wΔz = Cov (w,z) = βwzVz)", "")\
                   .replace("(wDz = Cov (w,z) = bwzVz)", "")
    return phrase
#----------------------------------------------------------------------------------------

# Function 2
def correct_abbreviations(phrase):
    # Word to numbers and correct abbreviations
    phrase=phrase.replace("multimillion", "multi million")\
                 .replace("multibillion", "multi billion")\
                 .replace("multi-million", "multi million")\
                 .replace("multi-billion", "multi billion")\
                 .replace("trillion", "1000000000000")\
                 .replace("billion", "1000000000")\
                 .replace("crore", "10000000")\
                 .replace("mln", "1000000")\
                 .replace("bln", "1000000000")\
                 .replace("III", "3")\
                 .replace("II", "2")\
                 .replace("iii", "3")\
                 .replace("world war ii", "world war 2")\
                 .replace("world war i", "world war 1")\
                 .replace("HEADER", "The movie")\
                 .replace("(die fetten jahre sind vorbei)", "")\
                 .replace("named V", "")\
                 .replace("thiry", "Thiry")\
                 .replace("Kirsten deLohr Helland", "Kirsten Helland")\
                 .replace("circa", "around")\
                 .replace("xXx", "")\
                 .replace("XXX", "")\
                 .replace('tomboy "M"', "tomboy")\
                 .replace("slew", "slayed")\
                 .replace("Crore","ten million")\
                 .replace("crore","ten million")\
                 .replace("twentyfive","25")\
                 .replace("FLicKeR", "the movie")\
                 .replace("IMAX", "Space Station 3D")
    return phrase
#----------------------------------------------------------------------------------------

# Function 3
def preprocess_movie_content(raw_text, process_column, debug_mode=None): #Movie Content aka column name: movie_features
    """
    Purpose: Clean the columns of movie content (plot summary, movie features and reviews) from any textual noise that could spoil the prediction classifier and result. Also apply lemmatization instead of stemming on each word.
             This step is from the most important steps of NLP applications because it gives a unique form and meaning in the words of the dataset. Upon those cleaned words the NLP algorithm will learn the schemantics
             that categorize a movie to a specific genre tag. Eight(8) steps are applied below. Each step is based on the researcher's intuition and are subjective to the experiment dataset. 
             Thus, it is advised to take parts from here with caution as they may not be applicable to other datasets.
    Arguments: The plot summary or review per movie and an indicator(process_column) whether or not the text is plot or review.
    Output: The cleaned plot summary, movie features and reviews.
    """
    # 1.Remove punctuation
    raw_text_decontracted=re.sub(" +", " ",unidecode.unidecode(decontracted(raw_text)).translate(str.maketrans(string.punctuation+"–", " "*len(string.punctuation+"–"))))
    if debug_mode==True:
        print("1",raw_text_decontracted)
    
    # 2.Expand Contractions and abbreviations
    raw_text_no_abbreviations=correct_abbreviations(re.sub(" +", " ",raw_text_decontracted))
    if debug_mode==True:
        print("\n2",raw_text_no_abbreviations)
    #------------------------------------------------
    
    # 3.Remove numbers
    # 3.1 Strip Dates
    stripped_date_string=re.sub(r'\w*\d\w*', "", raw_text_no_abbreviations).strip()
    
    # 3.2 From text to numeric form. Then delete the text in numeric form.
    non_numeric_instances=["N","m","V","IMAX","IndieFEST","TWICE","InAPPropriate","Twice","zucchinis"]
    doc=nlp(stripped_date_string)
    if debug_mode==True:
        print([(w.text, w.pos_) for w in doc if w.pos_=="NUM"]) #only for rnd when an error is generated
    if process_column=="process_plot":
        tokens=[w2n.word_to_num(token.text) if token.pos_=='NUM' and token.text not in non_numeric_instances else token for token in doc]
        stripped=[i.text if not str(i).isnumeric() else str(i) for i in tokens]
        tokens_white_space_stripped_again=[w.strip() for w in stripped]
    else:
        t2d=text2digits.Text2Digits()
        tokens=t2d.convert(stripped_date_string)
        tokens=tokens.split(' ')
        tokens_white_space_stripped_again=[w.strip() for w in tokens]
    stripped_no_numbers=[i for i in tokens_white_space_stripped_again if not i.isnumeric()]
    stripped_no_numbers=list(filter(None, stripped_no_numbers))
    if debug_mode==True:
        print("\n3",stripped_no_numbers)
    #------------------------------------------------
    
    # 4.Remove stop words
    stop_words=text.ENGLISH_STOP_WORDS.union(["book"])
    no_stopword_text=[word for word in stripped_no_numbers if not word.lower() in stop_words]
    no_stopword_text=' '.join(no_stopword_text) #join the text once more because a new lemmatizing approach is implemented below
    if debug_mode==True:
        print("\n4",no_stopword_text)
    #------------------------------------------------
    
    # 5.Lemmatization text in its lowercase() format
    lemmatizer=WordNetLemmatizer()
    lemmatized_text=list(set([lemmatizer.lemmatize(i.lower(),j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i.lower()) for i,j in pos_tag(word_tokenize(no_stopword_text))]))
    if debug_mode==True:
        print("\n5",lemmatized_text)
    #------------------------------------------------
    
    # 6.Join the words together to create the final text
    cleaned_text=' '.join(lemmatized_text)
    #------------------------------------------------
    
    return cleaned_text
#----------------------------------------------------------------------------------------

# Function 4
def transform_columns(column_name_to_clean, column_name_cleaned, dataset, process_column=None, debug_mode=None):
    """
    Purpose: Apply the previous function 'preprocess_movie_content()' on the columns with movie content. Also lower-case every actor and movie tilte in the dataset.
    Arguments: column_name_to_clean: Column in the dataset we desire to clean.
               column_name_cleaned: New column name with the cleaned rows of the column cleaned.
               dataset: Dataset to apply the cleaning.
               process_column: In which column to apply the function preprocess_movie_content().
               debug_mode: Whethere or not to print some specific statements to find erronious code lines if an exception is raised.
    Output: The cleaned column specified in the arguments. Every row of the five input columns with movie content is cleaned using this function. The five movie content columns are: Title, Actors, Plot, Features, Reviews.
    """
    tqdm.pandas()
    if column_name_to_clean=="actors_unified":
        dataset.loc[:, column_name_cleaned]=dataset.loc[:, column_name_to_clean].progress_apply(lambda x: x.lower())
    elif column_name_to_clean=="title":
        dataset.loc[:, column_name_cleaned]=dataset.loc[:, column_name_to_clean].progress_apply(lambda x: re.sub(" +", " ", x.translate(str.maketrans(string.punctuation+"–", " "*len(string.punctuation+"–"))).lower().strip()))
    else:
        dataset.loc[:, column_name_cleaned]=dataset.loc[:, column_name_to_clean].progress_apply(lambda x: preprocess_movie_content(x, process_column, debug_mode))
#----------------------------------------------------------------------------------------

# Function 5
def split_dataset(method, labels, dataset, split_ratio):
    """
    Purpose: Stratify split the dataset because of the imbalance of multi-label genre movies. We demonstrated in part 2 that genres were not equally distributed among the movies of the dataset. 
             This means that some genres are more frequent than others. In order to preserve the same frequency among the training, validation and test samples, it's imperative to apply stratified split.
             In case the imbalance in the dataset is fixed in later versions of the notebook, then a simple ramdom split will be applied. However, uptil now the dependent variable(y) has significant imbalance
             and thus stratification is applied.
    Arguments: method: Stratified or random split
               labels: Genre tags which represent the dependent variable y. Recall that the genre tags were extracted and serialized earlier in the notebook.
               dataset: Movies dataset with the dependent and independent variables.
               split_ratio: Percentage of rows for the training sample and 1-%training sample for the test sample.
    Output: Stratified or random splitted training and test samples.           
    """
    X=dataset[["title","clean_actors","clean_plot_summary","clean_combined_features","clean_reviews","clean_movie_title","reduced_genres"]]
    y=labels
    if method=="stratified":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=123, shuffle=True, stratify=y)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=123, shuffle=True)
    return X_train, X_test, y_train, y_test
#----------------------------------------------------------------------------------------

# Function 6
def keras_tokenization(variable, maximum_words, dataset, column_name_tokenized, x_train, x_test, x_validation, column_sequence):
    """
    Purpose: The keras tokenization method that will transform a sentence of text to a sequence of tokens, mapping each token to an index. An Out-of-Vocabulary token is also created, to map words not having an integer index.
             Data tokenization is a fundamental step of NLP applications, because mathematical algorithms cannot understand text content but only numbers. Thus, every text in the dataset should be represented by a single token.
    Arguments: variable: Dependent column to tokenize. At the moment five are the dependent predictors (actors, plot, featurs, reviews, title)
               maximum_words: Words to tokenize per column sequence.
               dataset: Movies dataset with movie content.
               column_name_tokenized: Column of which the rows will be tokenized.The columns the function is applied are the five content inputs.
               x_train: Training sample
               x_test: Test sample
               column_sequence: New column name in x_train, x_test with the tokenized rows per input.
    Outputs: vocabulary_sized_frequent_words: Number of words tokenized out of the total corpus of a column. In this experiment we tokized the 95% of the total words per column.
             tokenizer: Tensorflow tokenizer that tokenized an input column. Note that as input we refer to the columns used in the Input layer of an NLP classifier (aka dependent variables).
    """
    #The tokenizer class has some main buggs when the word index mapping is created. So the function is assembled based on this GitHub post https://github.com/keras-team/keras/issues/8092#issuecomment-372833486
    if variable=="actors":
        tokenizer=Tokenizer(num_words=maximum_words,  filters=",", lower=True, split=",", oov_token='<OOV>')
    else:
        tokenizer=Tokenizer(num_words=maximum_words, filters=" ", lower=True, split=" ", oov_token='<OOV>')

    tokenizer.fit_on_texts(list(dataset.loc[:, column_name_tokenized]))
    unique_tokens=len(tokenizer.word_index)
    print(f"Maximum length of unique tokens is: {unique_tokens}")

    words_to_tokenize=int(round(unique_tokens*0.95,0)) #-1 because OOV token is include. While in the Count Vectorizer the OOV token is not included in the 95% so we should substract one index from the total length
    print(f"Number of words to be tokenized is the 95% of those unique tokens, equal to: {words_to_tokenize}\nThe rest 5% or {unique_tokens-words_to_tokenize} is not tokenized.")
    
    if maximum_words==words_to_tokenize:
        print("\nKeras Tokenizer result is equal to Count Vectorizer result!")
    else:
        print("\nKeras Tokenizer result is not equal to Count Vectorizer result!")
    tokenizer.word_index={e:i for e,i in tokenizer.word_index.items() if i <= words_to_tokenize}
    tokenizer.word_index={x.strip(): v for x, v in tokenizer.word_index.items()}
    tokenizer.word_index[tokenizer.oov_token]=words_to_tokenize+1 #this assignes the last index to the OOV token
    print(f"Number of words mapped: {words_to_tokenize-1}. The extra 1 index represents the OVV token, which is not included in the CountVectorizer.")

    x_train.loc[:, column_sequence]=tokenizer.texts_to_sequences(x_train.loc[:, column_name_tokenized])
    x_test.loc[:, column_sequence]=tokenizer.texts_to_sequences(x_test.loc[:, column_name_tokenized])
    x_validation.loc[:, column_sequence]=tokenizer.texts_to_sequences(x_validation.loc[:, column_name_tokenized])

    vocabulary_size_frequent_words=len(tokenizer.word_index)

    try:
        assert words_to_tokenize==maximum_words #+1 because we added the OOV token in the last index
    except AssertionError:
        print(f"ERROR: The length of the vocabulary is not equal to the number of word_index dictionary, but rather is equal to: {len(tokenizer.word_index)}\nCorrect length: {maximum_words}.")

    return vocabulary_size_frequent_words, tokenizer
#----------------------------------------------------------------------------------------

# Function 7
def mean(numbers):
    """
    Purpose: Get the maximum length per sequence. Note here that every sequence of indexes that represents a plot summary, a review, a title, a cast or movie features should be of equal length with their peers.
             For example, we know that plot summaries have different lengths. Some they have 25 words, others 10 or less. This is not acceptable by tensorflow algorithms.
             So every sequence of plot summary of all the movies should have the same sequence length.
    """
    return int(np.ceil(float(sum(numbers)) / max(len(numbers), 1)))
#----------------------------------------------------------------------------------------

# Function 8
def padding_sequence_length(column_sequences, percentile, x_train, x_test, x_validation):
    """
    Purpose: Find the maximum length of the sequences belonging to a column. The maximum length of the column's sequence is equal to the 95% length of all the sequences. So for example if the dataset has 10,000 sequences and 9,500
    of them have length 20 then all the sequences will either be cropped or extended to 20 integers. Then use the maximum_length per column upon every sequence of that column. The result is a batch of sequences with same length.
    Arguments: variable: Dependent variable to apply the function. Recall that since five(5) are the inputs of movie content, five are also the dependent variables.
               x_train: Training sample
               x_test: Test sample
               Each sample has five columns. One column per predictor.
    Output: The maximum length per input column.
    """
    all_train_lengths=list(x_train[column_sequences].apply(len))
    all_test_lengths=list(x_test[column_sequences].apply(len))
    all_validation_lenghts=list(x_validation[column_sequences].apply(len))
    
    maxlen_train=int(np.percentile(all_train_lengths, q=percentile)) #all samples (train, test, validation) should have the same length in sequences.
    maxlen_test=int(np.percentile(all_test_lengths, q=percentile))
    maxlen_validation=int(np.percentile(all_validation_lenghts, q=percentile))
    
    if maxlen_train!=maxlen_test!=maxlen_validation:
        maxlen_value=mean([maxlen_train, maxlen_test, maxlen_validation]) #maxlen per sequence will be euqal to the mean value between two maxlen value per sample (train, test).
    else:
        maxlen_value=maxlen_train

    print(f"Max Length of the pad sequence for {column_sequences}: {maxlen_value}")

    return maxlen_value
#----------------------------------------------------------------------------------------

# Function 9
# The input data for a deep learning model must be a single tensor (of shape e.g. (batch_size, 6, vocab_size), samples that are shorter than the longest item need to be padded with some placeholder value.
# For more check here: https://www.tensorflow.org/guide/keras/masking_and_padding
def padding_sequence(column_sequence, x_train, x_test, x_validation, y_train, y_test, y_validation, maxlen):
    """
    Purpose: Apply padding based on the maximum length per input column.
    Arguments: column_sequence: Name of the column to apply the padding.
               x_train: Training sample with the dependent variables (actors, plot, features, reviews, title).
               x_test: Test sample with the dependent variables.
               y_train: Training sample with the independent variable (genre tags).
               y_test: Test sample with the independent variable (genre tags).
               maxlen: Maximum length to be applied in all the sequences of the column. The maximum length is the output of the previous function.
    Output: Training and test sequence per dependent predictor.
    """
    x_train_seq=pad_sequences(x_train.loc[:, column_sequence], padding='post', maxlen=maxlen)
    x_test_seq=pad_sequences(x_test.loc[:, column_sequence], padding='post', maxlen=maxlen)
    x_validation_seq=pad_sequences(x_validation.loc[:, column_sequence], padding='post', maxlen=maxlen)
    
    assert len(x_train_seq)==len(y_train)
    assert len(x_test_seq)==len(y_test)
    assert len(x_validation_seq)==len(y_validation)
    
    return x_train_seq, x_test_seq, x_validation_seq

In [12]:
"""
Previously we experinced an error using the stratified sampling. Below we printed the number of genre sequences that are assigned to only one movie.
For those 131 movies the stratified sampling is failing to complete. Because those 131 movies have a unique sequence of genre assigned.
Thus, we should find their indexes and remove them. The final dataset should contain 48834-131=48703
"""
list_of_movies_to_remove=None
calculated_rows=len(dataset_nlp_tokenization["reduced_genres"].apply(tuple).value_counts()[dataset_nlp_tokenization["reduced_genres"].apply(tuple).value_counts()==1])
print(f"Number of movies that are assigned to only 1 sequence of genres: {calculated_rows}")
list_of_movies_to_remove=dataset_nlp_tokenization["reduced_genres"].apply(tuple).value_counts()[dataset_nlp_tokenization["reduced_genres"].apply(tuple).value_counts()==1].index.tolist()
list_of_movies_to_remove=[list(x) for x in list_of_movies_to_remove]
assert type(list_of_movies_to_remove[0]) is list

Number of movies that are assigned to only 1 sequence of genres: 131


In [13]:
"""
Below are the indexes of rows that should be removed from the dataset. In total 131 indexes.
With those final 48991 rows of the dataset, the stratified sampling will be successfully completed.
"""
indexes_to_remove=dataset_nlp_tokenization['reduced_genres'].map(lambda x: 1 if list(x) in list_of_movies_to_remove else 0)[dataset_nlp_tokenization['reduced_genres'].map(lambda x: 1 if x in list_of_movies_to_remove else 0)==1].index.tolist()
dataset_nlp_tokenization=dataset_nlp_tokenization[~dataset_nlp_tokenization.index.isin(indexes_to_remove)]
dataset_nlp_tokenization=dataset_nlp_tokenization.reset_index(drop=True)
dataset_nlp_tokenization.shape

(48703, 38)

In [14]:
"""Serialize a list of movie's year of release. This list will be used extensively in Part 5"""
year_list=dataset_nlp_tokenization['year'].values.tolist()
with open(f"general_data_samples//year_list_{version_data_control}.pkl","wb") as f:
    joblib.dump(year_list, f)

In [15]:
"""Serialize a list of the movie titles. This list will be used in many of the notebooks to come."""
movie_title_list=dataset_nlp_tokenization['title'].values.tolist()
with open(f"general_data_samples//movie_title_list_{version_data_control}.pkl","wb") as f:
    joblib.dump(movie_title_list, f)

#---------------------------------------------------------------------------------------------------
#### R&D section for testing the function 4 - used for demo

In [None]:
# dataset_rnd=dataset_nlp_tokenization.iloc[0:].copy()
# dataset_rnd=dataset_rnd.reset_index(drop=True)

#transform_columns("plot_overview", "clean_plot_summary", dataset_rnd.iloc[46280:46281], "process_plot", True)

#### R&D section for testing the function 4 - used for demo
#---------------------------------------------------------------------------------------------------

In [16]:
"""
Apply the function 3. The function executed will extensively clean the corpus text per input variable by calling functions 1 and 2.
"""
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Transfrom the columns:
# -> Actors
# -> Plot summary
# -> Movie Features
# -> Reviews
# -> Movie Title

print("Start Execution")
begin_time=time.time()
print("---------------------------------------------------------------------------------\n")
print("Transfrom the column of the actors")
start_time_one=time.time()
transform_columns("actors_unified", "clean_actors", dataset_nlp_tokenization, "process_actors", False)
print(f"Finished the actors transformation after: {format_timespan(time.time()-start_time_one)}\n")

print("Transfrom the column of the plot summary")
start_time_two=time.time()
transform_columns("plot_overview", "clean_plot_summary", dataset_nlp_tokenization, "process_plot", False)
print(f"Finished the plot transformation after: {format_timespan(time.time()-start_time_two)}\n")

print("Transfrom the column of the movie reviews")
start_time_four=time.time()
transform_columns("reviews_enriched", "clean_reviews", dataset_nlp_tokenization, "process_reviews", False)
print(f"Finished the reviews_pruned transformation after: {format_timespan(time.time()-start_time_four)}\n")

print("Transfrom the column of the movie title")
start_time_five=time.time()
transform_columns("title", "clean_movie_title", dataset_nlp_tokenization, "process_title", False)
print(f"Finished the movie title transformation after: {format_timespan(time.time()-start_time_five)}")

print("---------------------------------------------------------------------------------\n")
print(f"Finished Execution after: {format_timespan(time.time()-begin_time)}")

100%|████████████████████████████████████████████████████████████████████████| 48703/48703 [00:00<00:00, 666733.65it/s]
  0%|                                                                                        | 0/48703 [00:00<?, ?it/s]

Start Execution
---------------------------------------------------------------------------------

Transfrom the column of the actors
Finished the actors transformation after: 0.08 seconds

Transfrom the column of the plot summary


100%|████████████████████████████████████████████████████████████████████████████| 48703/48703 [09:55<00:00, 81.82it/s]
  0%|                                                                              | 2/48703 [00:00<1:04:07, 12.66it/s]

Finished the plot transformation after: 9 minutes and 55.31 seconds

Transfrom the column of the movie reviews


100%|██████████████████████████████████████████████████████████████████████████| 48703/48703 [1:35:20<00:00,  8.51it/s]
 40%|████████████████████████████▊                                           | 19465/48703 [00:00<00:00, 192726.28it/s]

Finished the reviews_pruned transformation after: 1 hour, 35 minutes and 20.49 seconds

Transfrom the column of the movie title


100%|████████████████████████████████████████████████████████████████████████| 48703/48703 [00:00<00:00, 191746.47it/s]


Finished the movie title transformation after: 0.3 seconds
---------------------------------------------------------------------------------

Finished Execution after: 1 hour, 45 minutes and 16.19 seconds


In [17]:
"""Check how many actors from the cleaned actors column, have a name equal to string length 1"""
mask=dataset_nlp_tokenization["clean_actors"].str.split(",").explode().str.len().eq(1)
res=dataset_nlp_tokenization[["title", "clean_actors"]].loc[np.unique(mask.loc[mask].index)]
display(res)

Unnamed: 0,title,clean_actors


In [18]:
"""Create the column Combined Features again with the plot_overview column"""
def combine_features(row):
    """
    Purpose: This function will create the column 'movie_features'. The column will combine text content from multiple columns of the dataframe.
             Specifically the text combined derives from the following columns:
             1) Title,
             2) Actors,
             3) Director,
             4) Plot summary,
             5) Genres
    Argument: Dataset row
    Output: The unified text of five columns into one column.
    """
    return row["clean_movie_title"].lower() + " " + " ".join(row["clean_actors"].split(",")).lower() + " " + row["director"].lower() + " " + row["clean_plot_summary"].lower() + " " + " ".join(row["reduced_genres"]).lower()

dataset_nlp_tokenization["clean_combined_features"]=dataset_nlp_tokenization.apply(combine_features, axis=1)
dataset_nlp_tokenization["clean_combined_features"].iloc[0]

'toy story tom hanks tim allen don rickles jim varney wallace shawn john ratzenberger annie potts john morris erik von detten laurie metcalf r. lee ermey sarah freeman penn jillette jack angel spencer aste john lasseter led difference room heart happily buzz afraid eventually bring toy andy aside owner losing plot lightyear learn circumstance live scene separate place woody birthday duo adventure animation children'

In [19]:
"""
Before pre-processing the raw text of the first review about Toy Story. Text has been many stop words, punctuations and words in many different tense!
"""
dataset_nlp_tokenization['reviews_enriched'].iloc[0]

'Andy\'s toys live a reasonable life of fun and peace, their only worries are birthdays and Christmases, when new toys could easily replace those already there. One such birthday Andy\'s top toy, Woody the cowboy, finds himself in direct competition with Andy\'s new Buzz Lightyear doll. When rivalries boil over Woody tries to hide Buzz down the side of the bed but accidentally pushes him out the window, the other tops expel Woody, and he leaves with no choice but to find Buzz and return him to the house. But with only two days before Andy moves house, time is of the essence. Given how often the same mix of animation, wit, jokes and kids humour has been used since Toy Story (Ice Age, Monsters Inc, Bugs Life) it is easy to forget how refreshing it was when it first came out. I have just watched it again and it is dating a little in comparison to more recent twists on the formula. It seems each one has to be sharper and have more references etc in the background. However it is still very 

In [20]:
"""
After pre-processing the raw text of the first review about Toy Story. Text has been lemmatized and cleaned off most of the noise!
"""
dataset_nlp_tokenization['clean_combined_features'].iloc[0]

'toy story tom hanks tim allen don rickles jim varney wallace shawn john ratzenberger annie potts john morris erik von detten laurie metcalf r. lee ermey sarah freeman penn jillette jack angel spencer aste john lasseter led difference room heart happily buzz afraid eventually bring toy andy aside owner losing plot lightyear learn circumstance live scene separate place woody birthday duo adventure animation children'

In [24]:
"""
Having cleaned the dataset appropriately it is imperative to serialize and save a version of it.
This action will save us approximately 2 hours of re-applying again the cleaning functions
"""
joblib.dump(dataset_nlp_tokenization,f"{datasets_path}//dataset_part_3.1_inputs_cleaned_{version_data_control}.pkl")

['C://Users//spano//Desktop//nlp_github//datasets//dataset_part_3.1_inputs_cleaned_22022021.pkl']

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Option 1: Stratified Shuffle Split using the train_test_split function (train, test)

In [25]:
"""
Firstly, apply stratified shuffle split in the dataset to split it into training and test samples.
Secondly, apply stratified shuffle split in the training sample to split it into validation and training sample.

Split ratio of the first split: 80% training - 20% sample
Split ratio of the second split: 80% training - 20% sample
For imbalanced datasets and specifically for classification models, the stratification comes in handy because it ensures that the data will be splitted uniformly in both the train, test and validation samples.
"""
# Split the dataset into train & set sets
print("\n---------------------------------------------------------------------------------")
print("\nSplit the dataset into train & test sets (stratified shuffle split)\n")
start_time=time.time()
X_train, X_test, y_train, y_test=split_dataset("stratify", dataset_nlp_tokenization.iloc[:, 19:36], dataset_nlp_tokenization, 0.2)
assert X_train.shape[0]==y_train.shape[0]
assert X_test.shape[0]==y_test.shape[0]
print("Finished train-test split after: {0}".format(format_timespan(time.time()-start_time)))

print("\n---------------------------------------------------------------------------------")
print("\nSplit the dataset into train & validation sets (stratified shuffle split)\n")
start_time=time.time()
X_train, X_validation, y_train, y_validation=split_dataset("stratify", y_train, X_train, 0.2)
assert X_train.shape[0]==y_train.shape[0]
assert X_validation.shape[0]==y_validation.shape[0]

assert X_train.shape[0]+X_validation.shape[0]+X_test.shape[0]==dataset_nlp_tokenization.shape[0]
assert y_train.shape[0]+y_validation.shape[0]+y_test.shape[0]==dataset_nlp_tokenization.shape[0]

print("Finished train-test split after: {0}".format(format_timespan(time.time()-start_time)))


---------------------------------------------------------------------------------

Split the dataset into train & test sets (stratified shuffle split)

Finished train-test split after: 0.09 seconds

---------------------------------------------------------------------------------

Split the dataset into train & validation sets (stratified shuffle split)

Finished train-test split after: 0.03 seconds


In [26]:
"""
The shape of the X_train, X_test, y_train, y_test splitted and shuffled randomly
"""
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_validation shape: {X_validation.shape}\n")

print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"y_validation shape: {y_validation.shape}")

X_train shape: (31169, 7)
X_test shape: (9741, 7)
X_validation shape: (7793, 7)

y_train shape: (31169, 17)
y_test shape: (9741, 17)
y_validation shape: (7793, 17)


In [27]:
round(dataset_nlp_tokenization["reduced_genres"].explode().value_counts(normalize=True)*100,3)

Drama          25.372
Comedy         15.956
Action          7.580
Romance         7.222
Thriller        7.024
Horror          5.734
Crime           5.423
Documentary     4.316
Adventure       4.281
Sci-Fi          3.010
Mystery         2.721
Children        2.520
Animation       2.365
Fantasy         2.288
War             1.685
Western         1.355
Musical         1.145
Name: reduced_genres, dtype: float64

In [28]:
"""
The stratification worked!
"""
print(round(X_train["reduced_genres"].explode().value_counts(normalize=True)*100,3),"\n")
print(round(X_test["reduced_genres"].explode().value_counts(normalize=True)*100,3),"\n")
print(round(X_validation["reduced_genres"].explode().value_counts(normalize=True)*100,3),"\n")

Drama          25.404
Comedy         15.997
Action          7.539
Romance         7.243
Thriller        7.041
Horror          5.708
Crime           5.438
Documentary     4.308
Adventure       4.282
Sci-Fi          3.012
Mystery         2.800
Children        2.480
Fantasy         2.298
Animation       2.287
War             1.670
Western         1.361
Musical         1.133
Name: reduced_genres, dtype: float64 

Drama          25.155
Comedy         16.098
Action          7.711
Romance         7.176
Thriller        6.990
Horror          5.756
Crime           5.424
Documentary     4.562
Adventure       4.157
Sci-Fi          3.013
Mystery         2.704
Children        2.484
Animation       2.473
Fantasy         2.140
War             1.707
Western         1.295
Musical         1.155
Name: reduced_genres, dtype: float64 

Drama          25.514
Comedy         15.615
Action          7.584
Romance         7.199
Thriller        6.996
Horror          5.814
Crime           5.366
Adventure       4.43

In [29]:
"""
The below cell serialises the training, validation and test samples created by the stratified split.
"""
joblib.dump(X_train,os.path.join(os.getcwd(), f"train_test_validation_all_inputs//X_train_all_inputs_{version_data_control}.pkl"))
joblib.dump(X_test,os.path.join(os.getcwd(), f"train_test_validation_all_inputs//X_test_all_inputs_{version_data_control}.pkl"))
joblib.dump(X_validation,os.path.join(os.getcwd(), f"train_test_validation_all_inputs//X_validation_all_inputs_{version_data_control}.pkl"))

joblib.dump(y_train,os.path.join(os.getcwd(), f"train_test_validation_all_inputs//y_train_all_inputs_{version_data_control}.pkl"))
joblib.dump(y_test,os.path.join(os.getcwd(), f"train_test_validation_all_inputs//y_test_all_inputs_{version_data_control}.pkl"))
joblib.dump(y_validation,os.path.join(os.getcwd(), f"train_test_validation_all_inputs//y_validation_all_inputs_{version_data_control}.pkl"))

['C:\\Users\\spano\\Desktop\\nlp_github\\NLP_Applications\\multi-label-text-classification-on-movies\\train_test_validation_all_inputs//y_validation_all_inputs_22022021.pkl']

#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [30]:
"""
Read from the local disk the serialized training, validation and test samples.
"""
X_train=joblib.load(os.path.join(os.getcwd(), f"train_test_validation_all_inputs//X_train_all_inputs_{version_data_control}.pkl"))
X_test=joblib.load(os.path.join(os.getcwd(), f"train_test_validation_all_inputs//X_test_all_inputs_{version_data_control}.pkl"))
X_validation=joblib.load(os.path.join(os.getcwd(), f"train_test_validation_all_inputs//X_validation_all_inputs_{version_data_control}.pkl"))

y_train=joblib.load(os.path.join(os.getcwd(), f"train_test_validation_all_inputs//y_train_all_inputs_{version_data_control}.pkl"))
y_test=joblib.load(os.path.join(os.getcwd(), f"train_test_validation_all_inputs//y_test_all_inputs_{version_data_control}.pkl"))
y_validation=joblib.load(os.path.join(os.getcwd(), f"train_test_validation_all_inputs//y_validation_all_inputs_{version_data_control}.pkl"))

In [31]:
"""
Now that the data is splitted, we separated each column of interest to a different X_train and X_test
Those train and text X sets will be later used for tokenization and padding
"""
# Separate each different input column (actors, plot, features, reviews, title)

X_train_actors=X_train[["title", "clean_actors", "reduced_genres"]]
X_train_plot=X_train[["title", "clean_plot_summary", "reduced_genres"]]
X_train_features=X_train[["title", "clean_combined_features", "reduced_genres"]]
X_train_reviews=X_train[["title", "clean_reviews", "reduced_genres"]]
X_train_title=X_train[["title", "clean_movie_title", "reduced_genres"]]
assert X_train_actors.shape==X_train_plot.shape==X_train_features.shape==X_train_reviews.shape==X_train_title.shape

X_test_actors=X_test[["title", "clean_actors", "reduced_genres"]]
X_test_plot=X_test[["title", "clean_plot_summary", "reduced_genres"]]
X_test_features=X_test[["title", "clean_combined_features", "reduced_genres"]]
X_test_reviews=X_test[["title", "clean_reviews", "reduced_genres"]]
X_test_title=X_test[["title", "clean_movie_title", "reduced_genres"]]
assert X_test_actors.shape==X_test_plot.shape==X_test_features.shape==X_test_reviews.shape==X_test_title.shape

X_validation_actors=X_validation[["title", "clean_actors", "reduced_genres"]]
X_validation_plot=X_validation[["title", "clean_plot_summary", "reduced_genres"]]
X_validation_features=X_validation[["title", "clean_combined_features", "reduced_genres"]]
X_validation_reviews=X_validation[["title", "clean_reviews", "reduced_genres"]]
X_validation_title=X_validation[["title", "clean_movie_title", "reduced_genres"]]
assert X_validation_actors.shape==\
       X_validation_plot.shape==\
       X_validation_features.shape==\
       X_validation_reviews.shape==\
       X_validation_title.shape

In [32]:
# Now the movies do not contain actors with names of single, 2, or 3 letters
mask=X_validation_actors.clean_actors.str.split(",").explode().str.len().eq(5)
res=X_validation_actors[['title', 'clean_actors']].loc[np.unique(mask.loc[mask].index)]
display(res)

Unnamed: 0,title,clean_actors
28,The City of Lost Children,"ron perlman,daniel emilfork,judith vittet,dominique pinon,jean-claude dreyfus,geneviève brunet,odile mallet,mireille mossé,serge merlin,rufus,ticky holgado,joseph lucien,mapi galán,briac barthélémy,pierre-quentin faesch"
97,Heidi Fleiss: Hollywood Madam,"nick broomfield,nina xining zuo,madam alex,corinne bohrer,mike brambles,cookie,elisa fleiss,heidi fleiss,jason fleiss,jesse fleiss,kim fleiss,paul fleiss,shannon fleiss,gabby,daryl gates"
716,The Arrival,"charlie sheen,lindsay crouse,richard schiff,shane,ron silver,teri polo,phyllis applegate,alan coates,leon rippy,buddy joe hooker,javier morga,tony t. johnson,catalina botello,georg lillitsch,david villalpando"
890,Lost Horizon,"ronald colman,jane wyatt,edward everett horton,john howard,thomas mitchell,margo,isabel jewell,h.b. warner,sam jaffe"
1093,Delicatessen,"pascal benezech,dominique pinon,marie-laure dougnac,jean-claude dreyfus,karin viard,ticky holgado,anne-marie pisani,boban janevski,mikael todde,edith ker,rufus,jacques mathou,howard vernon,chick ortega,silvie laguna"
...,...,...
47496,Iracema,"paulo césar peréio,edna de cássia,lúcio dos santos,elma martins,natal,fernando neves,wilmar nunes,sidney piñon,rose rodrigues,conceição senna"
47646,Moonlight on the Prairie,"dick foran,smoke,sheila bromley,george e. stone,joe sawyer,joe king,robert barrat,dickie jones,bill elliott,herbert heywood,raymond brown,richard carle,milton kibbee"
48468,Krishnanum Radhayum,"souparnika,rupa jith,devika,ajit,ajayan,hanifa,prathyush,navajyot pandit,varsha pandit,sherij,liji,navindran,sunil,santhosh pandit"
48485,Uzhaippali,"s.s. chandran,charlie,goundamani,kavitha,mayilsamy,prathapachandran,rajinikanth,nizhalgal ravi,radha ravi,roja,srividya,sujatha,vijayakumar,visu,vivek"


### -------------------------------------------------------------------------------------------------------------------------------------
Actors-CountVectorizer

In [33]:
%%time
"""
Find the most frequent words among the actor names
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
-> Probably the infrequent tokens will make a better classification
"""
def actors_split(s):
    return s.split(',')

corpus_actors=dataset_nlp_tokenization['clean_actors'].values.tolist()
c_vectorizer=CountVectorizer(encoding='utf-8', tokenizer=actors_split, max_df=1.0, min_df=1) #keep this to 1 to include all the words/tokens
c_vectorizer.fit(corpus_actors)
print(f"Vocabulary length of CountVectorizer of the actors corpus: {len(c_vectorizer.vocabulary_)}")

X=c_vectorizer.fit_transform(corpus_actors)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_actors=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_actors.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
actors_frequency_dictionary=dict(zip(tokens_list, count_list))
d_actors = dict((k, v) for k, v in actors_frequency_dictionary.items() if v >= 80) # v = popularity (frequency) or number of movies played
# sorted(d_actors.items(), key=lambda kv: kv[1], reverse=True)



Vocabulary length of CountVectorizer of the actors corpus: 260913
(260913, 2)
Wall time: 8.78 s


In [34]:
percent_tokenized=0.95
print(f"The total number of actors that exist in the dataset is: {len(c_vectorizer.vocabulary_)}")
countvectorizer_actors_tokenized=int(len(c_vectorizer.vocabulary_)*percent_tokenized)
print(f"The 95% ({countvectorizer_actors_tokenized}) of the actors will be tokenized and the rest 5% ({len(c_vectorizer.vocabulary_)-countvectorizer_actors_tokenized}) of the actors will be removed due to sparsity")

The total number of actors that exist in the dataset is: 260913
The 95% (247867) of the actors will be tokenized and the rest 5% (13046) of the actors will be removed due to sparsity


### -------------------------------------------------------------------------------------------------------------------------------------
Plot summary-CountVectorizer

In [35]:
"""
Find the most frequent words among the movie plots
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def plot_split(s):
    return s.split(' ')

corpus_plot=dataset_nlp_tokenization['clean_plot_summary'].values.tolist()
c_vectorizer=CountVectorizer(encoding='utf-8', tokenizer=plot_split, max_df=1.0, min_df=1) #keep this to 1 to include all the words/tokens

c_vectorizer.fit(corpus_plot)
print(f"Vocabulary length of CountVectorizer of the plot corpus: {len(c_vectorizer.vocabulary_)}")

X=c_vectorizer.fit_transform(corpus_plot)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df_plot=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df_plot.reset_index(drop=True)
token_frequency_df_plot=token_frequency_df_plot.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df_plot.shape)
#token_frequency_df_pruned_plot=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_plot.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
plot_frequency_dictionary=dict(zip(tokens_list, count_list))
d_plot = dict((k, v) for k, v in plot_frequency_dictionary.items() if v >= 1)



Vocabulary length of CountVectorizer of the plot corpus: 63033
(63033, 2)
Wall time: 7.4 s


In [36]:
percent_tokenized=0.95
print(f"The total number of plot tokens that exist in the dataset is: {len(c_vectorizer.vocabulary_)}")
countvectorizer_plot_words_tokenized=int(len(c_vectorizer.vocabulary_)*percent_tokenized)
print(f"The 95% ({countvectorizer_plot_words_tokenized}) of the plot summary tokens will be tokenized and the rest 5% ({len(c_vectorizer.vocabulary_)-countvectorizer_plot_words_tokenized}) of the plot words will be removed due to sparsity")

The total number of plot tokens that exist in the dataset is: 63033
The 95% (59881) of the plot summary tokens will be tokenized and the rest 5% (3152) of the plot words will be removed due to sparsity


### -------------------------------------------------------------------------------------------------------------------------------------
Features-CountVectorizer

In [37]:
%%time
"""
Find the most frequent words among the movie features
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def movie_features_split(s):
    return s.split(' ')
### -------------------------------------------------------------------------------------------------------------------------------------
Actors-CountVectorizer
corpus_features=dataset_nlp_tokenization['clean_combined_features'].values.tolist()
c_vectorizer=CountVectorizer(encoding='utf-8', tokenizer=movie_features_split, max_df=1.0, min_df=1) #keep this to 1 to include all the words/tokens

c_vectorizer.fit(corpus_features)
print(f"Vocabulary length of CountVectorizer of the features corpus: {len(c_vectorizer.vocabulary_)}")

X=c_vectorizer.fit_transform(corpus_features)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_features=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_features.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
features_frequency_dictionary=dict(zip(tokens_list, count_list))
d_features = dict((k, v) for k, v in features_frequency_dictionary.items() if v >= 90)
# sorted(d_features.items(), key=lambda kv: kv[1], reverse=True)



Vocabulary length of CountVectorizer of the features corpus: 191341
(191341, 2)
Wall time: 10.7 s


In [38]:
percent_tokenized=0.95
print(f"The total number of feature tokens that exist in the dataset is: {len(c_vectorizer.vocabulary_)}")
countvectorizer_features_words_tokenized=int(len(c_vectorizer.vocabulary_)*percent_tokenized)
print(f"The 95% ({countvectorizer_features_words_tokenized}) of the feature tokens will be tokenized and the rest 5% ({len(c_vectorizer.vocabulary_)-countvectorizer_features_words_tokenized}) of the words will be removed due to sparsity")

The total number of feature tokens that exist in the dataset is: 191341
The 95% (181773) of the feature tokens will be tokenized and the rest 5% (9568) of the words will be removed due to sparsity


### -------------------------------------------------------------------------------------------------------------------------------------
Reviews-CountVectorizer

In [39]:
"""
Find the most frequent words among the movie reviews
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def reviews_split(s):
    return s.split(' ')

corpus_reviews=dataset_nlp_tokenization['clean_reviews'].values.tolist()
c_vectorizer=CountVectorizer(encoding='utf-8', tokenizer=reviews_split, max_df=1.0, min_df=1) #keep this to 1 to include all the words/tokens

c_vectorizer.fit(corpus_reviews)
print(f"Vocabulary length of CountVectorizer of the reviews corpus: {len(c_vectorizer.vocabulary_)}")

X=c_vectorizer.fit_transform(corpus_reviews)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_reviews=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_reviews.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
reviews_frequency_dictionary=dict(zip(tokens_list, count_list))
d_reviews = dict((k, v) for k, v in reviews_frequency_dictionary.items() if v >= 100)
# sorted(d_reviews.items(), key=lambda kv: kv[1], reverse=True)



Vocabulary length of CountVectorizer of the reviews corpus: 187348
(187348, 2)
Wall time: 21.7 s


In [40]:
"""Get the frequency of word 'film' (i.e in how many reviews this word is used)"""
reviews_frequency_dictionary['film']

44351

In [41]:
percent_tokenized=0.95
print(f"The total number of review tokens that exist in the dataset is: {len(c_vectorizer.vocabulary_)}")
countvectorizer_reviews_words_tokenized=int(len(c_vectorizer.vocabulary_)*percent_tokenized)
print(f"The 95% ({countvectorizer_reviews_words_tokenized}) of the review tokens will be tokenized and the rest 5% ({len(c_vectorizer.vocabulary_)-countvectorizer_reviews_words_tokenized}) of the words will be removed due to sparsity")

The total number of review tokens that exist in the dataset is: 187348
The 95% (177980) of the review tokens will be tokenized and the rest 5% (9368) of the words will be removed due to sparsity


### -------------------------------------------------------------------------------------------------------------------------------------
Title-CountVectorizer

In [98]:
"""
Find the most frequent words among the movie titles
In the end the number of rows will be equal to the number of maximum features tokenized by the Tokenizer.
"""
def movie_title_split(s):
    return s.split(' ')

corpus_title=dataset_nlp_tokenization['clean_movie_title'].values.tolist()
c_vectorizer=CountVectorizer(encoding='utf-8', tokenizer=movie_title_split, max_df=1.0, min_df=1) #keep this to 1 to include all the words/tokens

c_vectorizer.fit(corpus_title)
print(f"Vocabulary length of CountVectorizer of the title corpus: {len(c_vectorizer.vocabulary_)}")

X=c_vectorizer.fit_transform(corpus_title)
X_words=c_vectorizer.inverse_transform(X)

tokens_list=c_vectorizer.get_feature_names()
count_list = np.asarray(X.sum(axis=0)).ravel().tolist()

token_frequency_df=pd.DataFrame({'term': c_vectorizer.get_feature_names(), 'token_frequency': count_list})
token_frequency_df=token_frequency_df.sort_values(by='token_frequency', ascending=False)
print(token_frequency_df.shape)
#token_frequency_df_pruned_title=token_frequency_df[token_frequency_df['token_frequency']>=3]
#print(token_frequency_df_pruned_title.shape)

# The below code sample creates a dictionary that shows only the n-frequent actors
title_frequency_dictionary=dict(zip(tokens_list, count_list))
d_title = dict((k, v) for k, v in title_frequency_dictionary.items() if v >= 100)



Vocabulary length of CountVectorizer of the title corpus: 23462
(23462, 2)


In [43]:
percent_tokenized=0.95
print(f"The total number of movie title tokens that exist in the dataset is: {len(c_vectorizer.vocabulary_)}")
countvectorizer_title_words_tokenized=int(len(c_vectorizer.vocabulary_)*0.95)
print(f"The 95% ({countvectorizer_title_words_tokenized}) of the movie title tokens will be tokenized and the rest 5% ({len(c_vectorizer.vocabulary_)-countvectorizer_title_words_tokenized}) of the words will be removed due to sparsity")

The total number of movie title tokens that exist in the dataset is: 23462
The 95% (22288) of the movie title tokens will be tokenized and the rest 5% (1174) of the words will be removed due to sparsity


In [44]:
"""95% of the words per input that will be tokenized"""
print(f"actors_tokenized: {countvectorizer_actors_tokenized}")
print(f"plot_words_tokenized: {countvectorizer_plot_words_tokenized}")
print(f"features_words_tokenized: {countvectorizer_features_words_tokenized}")
print(f"reviews_words_tokenized: {countvectorizer_reviews_words_tokenized}")
print(f"title_words_tokenized: {countvectorizer_title_words_tokenized}")

actors_tokenized: 247867
plot_words_tokenized: 59881
features_words_tokenized: 181773
reviews_words_tokenized: 177980
title_words_tokenized: 22288


In the code blocks above we implemented the CountVectorizer way to calculate the unique tokens per input
### -------------------------------------------------------------------------------------------------------------------------------------

In [100]:
"""
Data tokenization is one of the most important parts when dealing with text data.
Since I am going to deploy keras models, I use the python api of Keras Tokenizer,
more details about its use on: https://keras.io/preprocessing/text/
"""
print("\nTokenize the dataset (using the keras tokenizer class)\n")
begin_time=time.time()
start_time_one=time.time()
print("------------\nActors corpus\n------------")
vocabulary_size_frequent_words_actors, tokenizer_actors=keras_tokenization("actors", countvectorizer_actors_tokenized, dataset_nlp_tokenization, "clean_actors", X_train_actors, X_test_actors, X_validation_actors, "actors_seq")
print(f"Actors tokenized with maximum number of words: {vocabulary_size_frequent_words_actors}")

# Serialize the Actors Tokenizer
joblib.dump(tokenizer_actors, f"word_tokenizers//actors_tokenizer_{vocabulary_size_frequent_words_actors}_{version_data_control}.pkl")
print(f"Finished the actors corpus tokenization after: {format_timespan(time.time()-start_time_one)}")
#----------------------------------------------------------------------------------------------------------------------

start_time_two=time.time()
print("\n------------\nPlot Summary corpus\n------------")
vocabulary_size_frequent_words_plot, tokenizer_plot=keras_tokenization("plot", countvectorizer_plot_words_tokenized, dataset_nlp_tokenization, "clean_plot_summary", X_train_plot, X_test_plot, X_validation_plot, "plot_seq")
print(f"Plot Summary tokenized with maximum number of words: {vocabulary_size_frequent_words_plot}")

# Serialize the Plot Tokenizer
joblib.dump(tokenizer_plot, f"word_tokenizers//plot_tokenizer_{vocabulary_size_frequent_words_plot}_{version_data_control}.pkl")
print(f"Finished the plot corpus tokenization after: {format_timespan(time.time()-start_time_two)}")
#----------------------------------------------------------------------------------------------------------------------

start_time_three=time.time()
print("\n------------\nMovie Features corpus\n------------")
vocabulary_size_frequent_words_features, tokenizer_features=keras_tokenization("features", countvectorizer_features_words_tokenized, dataset_nlp_tokenization, "clean_combined_features", X_train_features, X_test_features, X_validation_features, "features_seq")
print(f"Movie Features tokenized with maximum number of words: {vocabulary_size_frequent_words_features}")

# Serialize the Movie Features Tokenizer
joblib.dump(tokenizer_features, f"word_tokenizers//features_tokenizer_{vocabulary_size_frequent_words_features}_{version_data_control}.pkl")
print(f"Finished the movie features corpus tokenization after: {format_timespan(time.time()-start_time_three)}")
#----------------------------------------------------------------------------------------------------------------------

start_time_four=time.time()
print("\n------------\nMovie Reviews corpus\n------------")
vocabulary_size_frequent_words_reviews, tokenizer_reviews=keras_tokenization("reviews", countvectorizer_reviews_words_tokenized, dataset_nlp_tokenization, "clean_reviews", X_train_reviews, X_test_reviews, X_validation_reviews, "reviews_seq")
print(f"Movie Reviews tokenized with maximum number of words: {vocabulary_size_frequent_words_reviews}")

# Serialize the Reviews Tokenizer
joblib.dump(tokenizer_reviews, f"word_tokenizers//reviews_tokenizer_{vocabulary_size_frequent_words_reviews}_{version_data_control}.pkl")
print(f"Finished the movie reviews corpus tokenization after: {format_timespan(time.time()-start_time_four)}")
#----------------------------------------------------------------------------------------------------------------------

start_time_five=time.time()
print("\n------------\nMovie Title corpus\n------------")
vocabulary_size_frequent_words_title, tokenizer_title=keras_tokenization("movie_title", countvectorizer_title_words_tokenized, dataset_nlp_tokenization, "clean_movie_title", X_train_title, X_test_title,  X_validation_title, "title_seq")
print(f"Movie Title tokenized with maximum number of words: {vocabulary_size_frequent_words_title}")

# Serialize the Title Tokenizer
joblib.dump(tokenizer_title, f"word_tokenizers//title_tokenizer_{vocabulary_size_frequent_words_title}_{version_data_control}.pkl")
print(f"Finished the movie title corpus tokenization after: {format_timespan(time.time()-start_time_five)}")
#----------------------------------------------------------------------------------------------------------------------

assert len(tokenizer_actors.word_index)==vocabulary_size_frequent_words_actors
assert len(tokenizer_plot.word_index)==vocabulary_size_frequent_words_plot
assert len(tokenizer_features.word_index)==vocabulary_size_frequent_words_features
assert len(tokenizer_reviews.word_index)==vocabulary_size_frequent_words_reviews
assert len(tokenizer_title.word_index)==vocabulary_size_frequent_words_title

print(f"\nFinished tokenization of all 5 trainable columns after: {format_timespan(time.time()-begin_time)}")


Tokenize the dataset (using the keras tokenizer class)

------------
Actors corpus
------------
Maximum length of unique tokens is: 260913
Number of words to be tokenized is the 95% of those unique tokens, equal to: 247867
The rest 5% or 13046 is not tokenized.

Keras Tokenizer result is equal to Count Vectorizer result!
Number of words mapped: 247866. The extra 1 index represents the OVV token, which is not included in the CountVectorizer.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Actors tokenized with maximum number of words: 247867
Finished the actors corpus tokenization after: 7.09 seconds

------------
Plot Summary corpus
------------
Maximum length of unique tokens is: 63034
Number of words to be tokenized is the 95% of those unique tokens, equal to: 59882
The rest 5% or 3152 is not tokenized.

Keras Tokenizer result is not equal to Count Vectorizer result!
Number of words mapped: 59881. The extra 1 index represents the OVV token, which is not included in the CountVectorizer.
ERROR: The length of the vocabulary is not equal to the number of word_index dictionary, but rather is equal to: 59882
Correct length: 59881.
Plot Summary tokenized with maximum number of words: 59882
Finished the plot corpus tokenization after: 3.11 seconds

------------
Movie Features corpus
------------
Maximum length of unique tokens is: 191341
Number of words to be tokenized is the 95% of those unique tokens, equal to: 181774
The rest 5% or 9567 is not tokenized.

Keras Tokenizer 

In [74]:
"""Serialize in a dictionary the number of words tokenized per input column"""
words_tokenized_per_trainable_feature={}
words_tokenized_per_trainable_feature['actors_tokenized']=vocabulary_size_frequent_words_actors
words_tokenized_per_trainable_feature['plot_words_tokenized']=vocabulary_size_frequent_words_plot
words_tokenized_per_trainable_feature['features_words_tokenized']=vocabulary_size_frequent_words_features
words_tokenized_per_trainable_feature['reviews_words_tokenized']=vocabulary_size_frequent_words_reviews
words_tokenized_per_trainable_feature['title_words_tokenized']=vocabulary_size_frequent_words_title
words_tokenized_per_trainable_feature

{'actors_tokenized': 247867,
 'plot_words_tokenized': 59882,
 'features_words_tokenized': 181773,
 'reviews_words_tokenized': 177982,
 'title_words_tokenized': 22288}

In [75]:
joblib.dump(words_tokenized_per_trainable_feature, f"word_tokenizers//words_tokenized_{version_data_control}.pkl")

['word_tokenizers//words_tokenized_22022021.pkl']

In [79]:
"""
Calculate the maximum length per sequence for each of the five inputs.
Then pad the sequences of that input to the maximum length calculated.
"""
print("\n---------------------------------------------------------------------------------")
print("\nSpecify the length of the maxlen variable (length is a parameter for the optimal padding execution)\n")

maxlen_actors=padding_sequence_length("actors_seq", 95, X_train_actors, X_test_actors, X_validation_actors)
maxlen_plot=padding_sequence_length("plot_seq", 95, X_train_plot, X_test_plot, X_validation_plot)
maxlen_features=padding_sequence_length("features_seq", 95, X_train_features, X_test_features, X_validation_features)
maxlen_reviews=padding_sequence_length("reviews_seq", 95, X_train_reviews, X_test_reviews, X_validation_reviews)
maxlen_title=padding_sequence_length("title_seq", 95, X_train_title, X_test_title, X_validation_title)

print("\n---------------------------------------------------------------------------------")
print("\nCreate the padding sequence of texts\n")

X_train_seq_actors, X_test_seq_actors, X_validation_seq_actors=padding_sequence("actors_seq", X_train_actors, X_test_actors, X_validation_actors, y_train, y_test, y_validation, maxlen_actors)
print("\nActors padded sequences created\n")

X_train_seq_plot, X_test_seq_plot, X_validation_seq_plot=padding_sequence("plot_seq", X_train_plot, X_test_plot, X_validation_plot, y_train, y_test, y_validation, maxlen_plot)
print("Plot padded sequences created\n")

X_train_seq_features, X_test_seq_features, X_validation_seq_features=padding_sequence("features_seq", X_train_features, X_test_features, X_validation_features, y_train, y_test, y_validation, maxlen_features)
print("Movie Features padded sequences created\n")

X_train_seq_reviews, X_test_seq_reviews, X_validation_seq_reviews=padding_sequence("reviews_seq", X_train_reviews, X_test_reviews, X_validation_reviews, y_train, y_test, y_validation, maxlen_reviews)
print("Movie Reviews padded sequences created\n")

X_train_seq_title, X_test_seq_title, X_validation_seq_title=padding_sequence("title_seq", X_train_title, X_test_title, X_validation_title, y_train, y_test, y_validation, maxlen_title)
print("Movie Title padded sequences created")


---------------------------------------------------------------------------------

Specify the length of the maxlen variable (length is a parameter for the optimal padding execution)

Max Length of the pad sequence for actors_seq: 15
Max Length of the pad sequence for plot_seq: 54
Max Length of the pad sequence for features_seq: 91
Max Length of the pad sequence for reviews_seq: 442
Max Length of the pad sequence for title_seq: 6

---------------------------------------------------------------------------------

Create the padding sequence of texts


Actors padded sequences created

Plot padded sequences created

Movie Features padded sequences created

Movie Reviews padded sequences created

Movie Title padded sequences created


#### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

* **Data split:** 80-20 split
* **Non-balanced** dataset

* X_train, X_test, X_validation with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [80]:
print(f"X_train_seq_actors shape: {X_train_seq_actors.shape}")
print(f"X_train_seq_plot shape: {X_train_seq_plot.shape}")
print(f"X_train_seq_features shape: {X_train_seq_features.shape}")
print(f"X_train_seq_reviews shape: {X_train_seq_reviews.shape}")
print(f"X_train_seq_title shape: {X_train_seq_title.shape}\n")

print(f"X_test_seq_actors shape: {X_test_seq_actors.shape}")
print(f"X_test_seq_plot shape: {X_test_seq_plot.shape}")
print(f"X_test_seq_features shape: {X_test_seq_features.shape}")
print(f"X_test_seq_reviews shape: {X_test_seq_reviews.shape}")
print(f"X_test_seq_title shape: {X_test_seq_title.shape}\n")

print(f"X_validation_seq_actors shape: {X_validation_seq_actors.shape}")
print(f"X_validation_seq_plot shape: {X_validation_seq_plot.shape}")
print(f"X_validation_seq_features shape: {X_validation_seq_features.shape}")
print(f"X_validation_seq_reviews shape: {X_validation_seq_reviews.shape}")
print(f"X_validation_seq_title shape: {X_validation_seq_title.shape}")

X_train_seq_actors shape: (31169, 15)
X_train_seq_plot shape: (31169, 54)
X_train_seq_features shape: (31169, 91)
X_train_seq_reviews shape: (31169, 442)
X_train_seq_title shape: (31169, 6)

X_test_seq_actors shape: (9741, 15)
X_test_seq_plot shape: (9741, 54)
X_test_seq_features shape: (9741, 91)
X_test_seq_reviews shape: (9741, 442)
X_test_seq_title shape: (9741, 6)

X_validation_seq_actors shape: (7793, 15)
X_validation_seq_plot shape: (7793, 54)
X_validation_seq_features shape: (7793, 91)
X_validation_seq_reviews shape: (7793, 442)
X_validation_seq_title shape: (7793, 6)


* y_train & y_test with <b>80-20</b> split and <b>non-balanced genre</b> tags

In [81]:
print(f"y_train shape:{y_train.shape}")
print(f"y_test shape:{y_test.shape}")
print(f"y_validation shape:{y_validation.shape}")

y_train shape:(31169, 17)
y_test shape:(9741, 17)
y_validation shape:(7793, 17)


In [82]:
"""Serialized the tokenized and padded sequences per input features. Those sequences will be the main input layer for the NLP classifier on which the model with trained, validated and finally tested"""
split_ratio="80_20"
data_balance="non_balanced"
#Training sample
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_train_seq_actors_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_actors}_{version_data_control}"), X_train_seq_actors)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_train_seq_plot_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_plot}_{version_data_control}"), X_train_seq_plot)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_train_seq_features_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_features}_{version_data_control}"), X_train_seq_features)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_train_seq_reviews_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_reviews}_{version_data_control}"), X_train_seq_reviews)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_train_seq_title_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_title}_{version_data_control}"), X_train_seq_title)

#Test sample
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_test_seq_actors_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_actors}_{version_data_control}"), X_test_seq_actors)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_test_seq_plot_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_plot}_{version_data_control}"), X_test_seq_plot)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_test_seq_features_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_features}_{version_data_control}"), X_test_seq_features)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_test_seq_reviews_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_reviews}_{version_data_control}"), X_test_seq_reviews)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_test_seq_title_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_title}_{version_data_control}"), X_test_seq_title)

#Validation sample
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_validation_seq_actors_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_actors}_{version_data_control}"), X_validation_seq_actors)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_validation_seq_plot_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_plot}_{version_data_control}"), X_validation_seq_plot)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_validation_seq_features_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_features}_{version_data_control}"), X_validation_seq_features)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_validation_seq_reviews_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_reviews}_{version_data_control}"), X_validation_seq_reviews)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//x_validation_seq_title_{split_ratio}_{data_balance}_{vocabulary_size_frequent_words_title}_{version_data_control}"), X_validation_seq_title)

#Dependent variable
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//y_train_{split_ratio}_{data_balance}_{version_data_control}"), y_train)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//y_test_{split_ratio}_{data_balance}_{version_data_control}"), y_test)
np.save(os.path.join(os.getcwd(), f"text_tokenized_padded_sequences//y_validation_{split_ratio}_{data_balance}_{version_data_control}"), y_validation)

### -------------------------------------------------------------------------------------------------------------------------------------

#### (1) Serialize the dataset with the added cleaned columns of actors, plot, features, reviews and title for use in part 4 & 5.
#### (2) X_test dataset is already serialized for use in part 3.2.

In [103]:
joblib.dump(dataset_nlp_tokenization, f"{datasets_path}//dataset_part_3.2_and_4_{version_data_control}.pkl")

['C://Users//spano//Desktop//nlp_github//datasets//dataset_part_3.2_and_4_22022021.pkl']

### -------------------------------------------------------------------------------------------------------------------------------------

#### THIS IS THE END OF PART 3.1 - Data Tokenization & Sequence padding