## Part 2 - Transforming and Cleaning the columns of the final dataset

In [None]:
# Date tag (DDMMYYYY) that is formatted into the file name of the pickle written at the end of this part.
version_data_control = "20072020"

#### Import the libraries

In [None]:
import pandas as pd
import numpy as np
import os

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook

import re

def set_pandas_display_options() -> None:
    """Raise pandas' display limits for this notebook session.

    Sets the maximum number of displayed columns and rows to 1000, the maximum
    column width to 199 characters, and the display width to ``None``.
    """
    settings = {
        "display.max_columns": 1000,
        "display.max_rows": 1000,
        "display.max_colwidth": 199,
        "display.width": None,
        # "display.precision": 2,  # set as needed
    }
    for option_name, option_value in settings.items():
        pd.set_option(option_name, option_value)


set_pandas_display_options()

# `display` and `HTML` are part of the public IPython.display API; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that stretches the `.container` element to the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Import the dataset

In [None]:
# dataset_part_1_25012020 and final_dataset_49393_movies_25012020 are the same
# table files produced from Part 1.
#
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that were produced by this project.
#
# The path components are passed separately to os.path.join so the separator is
# chosen by the operating system instead of being hard-coded as a backslash.
dataset = pd.read_pickle(
    os.path.join(os.getcwd(), 'pickled_data_per_part', 'dataset_part_1_25012020.pkl')
)

# Replace whatever index was stored in the pickle with a fresh 0..n-1 index.
dataset = dataset.reset_index(drop=True)
dataset.shape

In [None]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

### Section 2.1: Cleaning the columns

##### Section 2.1.1: Create a separate column for the year the movie was published

In [None]:
# Four digits wrapped in parentheses, e.g. "(1995)"; the digits are the capture group.
# Written as a raw string: in a normal string literal "\(" and "\d" are invalid
# escape sequences and trigger a warning on current Python versions.
pattern = r'\((\d{4})\)'
dataset['year'] = dataset['title'].str.extract(pattern, expand=False)  # expand=False returns a Series
dataset['year'].iloc[0]

# - - - - - - - - -  - - - - - - - - - - - - - - - - - -- - - - - - - - - - - - - - - - - -

##### Section 2.1.2: Remove the year (e.g. "(2001)") from each title - 25.10.2019

In [None]:
# Match exactly "(dddd)". The previous pattern ended in "." (any character), so a
# title such as "Name (1995–1996)" lost only "(1995–" and kept a dangling "1996)".
# Requiring the closing parenthesis keeps this step consistent with the pattern
# used to extract the 'year' column.
exp = r'\(\d{4}\)'

dataset['title'] = dataset['title'].apply(lambda x: re.sub(exp, "", x).strip())

In [None]:
# Preview the first five rows after the year has been removed from the titles.
dataset.head(n=5)

# - - - - - - - - -  - - - - - - - - - - - - - - - - - -- - - - - - - - - - - - - - - - - -

##### Section 2.1.3: Move a trailing article (", The", ", A", ", An") from the end of the title to the front, and remove any text inside parentheses

In [None]:
# Show every row whose director is Christopher Nolan, to inspect the title format.
dataset.loc[dataset['director'] == "Christopher Nolan"]

#### Comment: As we can see "Dark Knight Rises, The" is not correct and should be corrected to "The Dark Knight Rises".

In [None]:
# Move a trailing ", The" / ", A" / ", An" / ", Les" to the front of the title and
# drop everything from the first "(...)" group onwards.
#
# - regex=True is required: since pandas 2.0 `Series.str.replace` treats the
#   pattern as a literal string by default, which would leave every title unchanged.
# - The comma in front of the article is mandatory. With an optional comma, titles
#   that merely end in a capital "A" were rewritten ("Plan A" -> "A Plan",
#   "USA" -> "A US").
# - The pattern is anchored with ^...$ so each title produces exactly one match.
#
# Titles without an article come out with a leading space (from the "\2 \1"
# template); surrounding whitespace still has to be stripped afterwards.
dataset['title'] = dataset['title'].str.replace(
    r'^(.*?)(?:,\s*(The|A|An|Les))?\s*(?:\(.*\).*)?$',
    r'\2 \1',
    regex=True,
)

In [None]:
# Trim leading and trailing whitespace from every title.
dataset['title'] = dataset.title.str.strip()

In [None]:
# Sanity check: "Dark Knight, The" should now read "The Dark Knight".
# The row is looked up by its title instead of a hard-coded position (12422),
# which only points at the intended movie for one particular row ordering.
dataset[dataset['title'] == "The Dark Knight"]

# - - - - - - - - -  - - - - - - - - - - - - - - - - - -- - - - - - - - - - - - - - - - - -

##### Step 2.1.4: Clean some punctuation mistakes in columns plot summary and user reviews.

In [None]:
def _tidy_plot(text):
    """Normalise one plot summary string.

    Steps, in order: strip surrounding whitespace, insert a space after every
    ',', '.', '?' and '!', remove the "See full summary" suffix, strip trailing
    whitespace, and collapse runs of spaces into a single space.
    """
    text = text.strip()
    for mark in (',', '.', '?', '!'):
        text = text.replace(mark, mark + ' ')
    text = text.replace('\n                    See full summary\xa0»', '')
    return re.sub(' +', ' ', text.rstrip())


dataset['plot'] = dataset['plot'].apply(_tidy_plot)

In [None]:
def _tidy_review(text):
    """Normalise one user review string.

    Steps, in order: strip surrounding whitespace, insert a space after every
    ',', '.', '?' and '!', strip trailing whitespace, and collapse runs of
    spaces into a single space.
    """
    text = text.strip()
    for mark in (',', '.', '?', '!'):
        text = text.replace(mark, mark + ' ')
    return re.sub(' +', ' ', text.rstrip())


# Each cell of 'reviews' is iterated as a collection of review strings.
dataset['reviews'] = dataset['reviews'].apply(
    lambda user_reviews: [_tidy_review(user_review) for user_review in user_reviews]
)

In [None]:
# Show the cleaned plot text of the movie titled exactly "Come Look at Me".
# (A substring match via dataset['title'].str.contains(...) is the looser alternative.)
display(dataset.loc[dataset['title'] == "Come Look at Me", 'plot'])

# - - - - - - - - -  - - - - - - - - - - - - - - - - - -- - - - - - - - - - - - - - - - - -

##### Step 2.1.5: Remove empty summaries

In [None]:
# Count the movies whose plot is only the 'Add a Plot »' placeholder.
# The original run reported 176 such movies; they carry no usable plot summary
# and should be removed.
int((dataset['plot'] == 'Add a Plot »').sum())

In [None]:
# Keep only the movies that have a real plot summary.
# .copy() makes the result an independent frame, so adding or modifying columns
# afterwards does not raise SettingWithCopyWarning or write into a temporary slice.
dataset = dataset.loc[dataset['plot'] != 'Add a Plot »'].copy()
dataset.shape

# - - - - - - - - -  - - - - - - - - - - - - - - - - - -- - - - - - - - - - - - - - - - - -

##### Step 2.1.6: Create the positive/negative column

In [None]:
# Binary label derived from the rating: 1 when rating >= 2.5, 0 when rating < 2.5.
# Rows matching neither comparison (e.g. a missing rating) keep the copied rating value.
#
# Each assignment is a single .loc[row_mask, column] call. The previous chained
# form (dataset.loc[:, col].loc[mask] = value) assigned into an intermediate
# Series, which is not guaranteed to write back into `dataset` and is a no-op
# when pandas copy-on-write is enabled.
dataset['sentiment_value'] = dataset['rating']
dataset.loc[dataset['rating'] >= 2.5, 'sentiment_value'] = 1
dataset.loc[dataset['rating'] < 2.5, 'sentiment_value'] = 0

# - - - - - - - - -  - - - - - - - - - - - - - - - - - -- - - - - - - - - - - - - - - - - -

##### Step 2.1.7 Create the column Combined Features (29.10.2019)

A very important column, since it combines the content of the most important columns of the dataframe.
Specifically the text of:
* Title,
* Actors,
* Director,
* Plot summary,
* Genres

In [None]:
def combine_features(row):
    """Build the single text field that describes one movie.

    Joins, separated by single spaces and in this order: the title, the actors
    (each converted with ``str``), the director, the plot and the genres.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'title', 'actors', 'director', 'plot' and 'genres'.

    Returns
    -------
    str
        The concatenated feature text.
    """
    actor_text = ' '.join(str(actor) for actor in row['actors'])
    genre_text = ' '.join(row['genres'])
    parts = [row['title'], actor_text, row['director'], row['plot'], genre_text]
    return " ".join(parts)

# Build the combined text feature row by row (axis="columns" passes each row to the function).
dataset["movie_features"] = dataset.apply(combine_features, axis="columns")

In [None]:
# Show the combined feature text of the first row.
dataset["movie_features"].iat[0]

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Pickle the dataset (16.02.2020)

Old version on 22.04.2020 <br>
Old version on 13.07.2020 <br>
Latest version on 20.07.2020 (added the 'year' of movie release column)

In [None]:
# Persist the cleaned dataset for the next part. The version tag is embedded in
# the file name. Path components are passed separately to os.path.join so the
# separator is chosen by the operating system instead of being a hard-coded backslash.
dataset.to_pickle(
    os.path.join(
        os.getcwd(),
        'pickled_data_per_part',
        'dataset_part_2_{0}.pkl'.format(version_data_control),
    )
)

#### End of part 2 (Transforming the columns of the final dataset)