In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

In [8]:
# Load the merged movie table from the working directory and report its
# (rows, columns) shape as a quick sanity check.
master_dataset = pd.read_csv(filepath_or_buffer='master_dataset.csv')
print(master_dataset.shape)

(47072, 29)


In [9]:
def safe_literal_eval(data):
    """
    Best-effort conversion of a cell value into a Python literal.

    - A string is parsed with ``ast.literal_eval``; if parsing fails with
      ``ValueError`` or ``SyntaxError`` an empty list is returned instead.
    - A list or dict is returned unchanged.
    - Any other value (e.g. NaN, None, numbers) yields an empty list.
    """
    if isinstance(data, str):
        try:
            parsed = ast.literal_eval(data)
        except (ValueError, SyntaxError):
            # Malformed text is treated as "no data" rather than an error.
            parsed = []
        return parsed
    return data if isinstance(data, (list, dict)) else []

# These columns were read from CSV as text; turn each cell back into the
# Python object it represents (unparseable cells become []).
for literal_column in ('cast', 'crew', 'keywords'):
    parsed_cells = master_dataset[literal_column].apply(safe_literal_eval)
    master_dataset[literal_column] = parsed_cells

In [10]:
def get_director(x):
    """
    Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew entries; each is expected to carry 'job' and 'name' keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no director can be found.
        This includes input that is not iterable (e.g. NaN) and entries
        that are not dicts or lack the 'job' / 'name' keys, which are
        skipped instead of raising.
    """
    try:
        crew_members = iter(x)
    except TypeError:
        # Scalars such as NaN/None carry no crew information.
        return np.nan

    for member in crew_members:
        # .get() tolerates partially filled crew records.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

In [11]:
def _names_of(entries):
    """Return the 'name' value of every item in ``entries``; [] if it is not a list."""
    if not isinstance(entries, list):
        return []
    return [entry['name'] for entry in entries]


# Cast: collect every name first, then keep only the first three.
master_dataset['cast'] = master_dataset['cast'].apply(lambda members: _names_of(members)[:3])

# Keywords: keep all names.
master_dataset['keywords'] = master_dataset['keywords'].apply(_names_of)

# Director: first crew entry whose job is 'Director' (NaN when absent).
master_dataset['director'] = master_dataset['crew'].apply(get_director)

In [12]:
# Collapse each cast name into one lowercase token without spaces, so that
# e.g. "Tom Hanks" becomes "tomhanks".
master_dataset['cast'] = master_dataset['cast'].apply(
    lambda names: [name.replace(" ", "").lower() for name in names]
)

# Keep the unmodified director value (name string or NaN) in its own column.
master_dataset['main_director'] = master_dataset['director']

# Turn the director into a list holding the normalised token three times.
# Missing directors (NaN) become an empty list: casting them with
# astype('str') would produce the literal token 'nan', which every
# director-less movie would then share.
master_dataset['director'] = master_dataset['director'].apply(
    lambda name: [name.replace(" ", "").lower()] * 3 if isinstance(name, str) else []
)

In [13]:
# Flatten the per-movie keyword lists into one long Series (one keyword per
# row) and count occurrences. explode() does this directly; building a
# pd.Series per row with DataFrame.apply(axis=1) and stacking the resulting
# wide frame yields the same counts at far higher cost.
# Empty lists explode to NaN, which is dropped here.
s = master_dataset['keywords'].explode().dropna()
s.name = 'keyword'
s = s.value_counts()
print(s[:5])

keyword
woman director      3144
independent film    1958
murder              1322
based on novel       849
musical              734
Name: count, dtype: int64


In [14]:
# Keep only keywords that occur more than once.
# NOTE(review): `s` is not read again in the cells visible here - confirm
# whether a later step was meant to use it.
s = s.loc[s.gt(1)]

In [15]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')


def _normalise_keywords(keywords):
    """Stem each keyword longer than one character, then strip spaces and lowercase it."""
    # NOTE(review): `len(keyword) > 1` tests the keyword's string length; the
    # frequency table `s` is not consulted. Confirm whether filtering on
    # `keyword in s` was the intent.
    return [
        stemmer.stem(keyword).replace(" ", "").lower()
        for keyword in keywords
        if len(keyword) > 1
    ]


master_dataset['keywords'] = master_dataset['keywords'].apply(_normalise_keywords)


In [16]:
# Preview the normalised keyword lists of the first three movies.
master_dataset.loc[:, 'keywords'].head(n=3)

0    [jealousi, toy, boy, friendship, friend, rival...
1    [boardgam, disappear, basedonchildren'sbook, n...
2       [fish, bestfriend, duringcreditssting, oldmen]
Name: keywords, dtype: object

In [17]:
def _genre_names(raw):
    """
    Convert a raw 'genres' cell into a list of genre names.

    The cell is parsed with ``safe_literal_eval`` (strings are evaluated as
    Python literals, lists pass through). Dict items contribute their
    'name' value; other items are kept as they are. Anything that does not
    end up as a list yields [].
    """
    parsed = safe_literal_eval(raw)
    if not isinstance(parsed, list):
        return []
    names = []
    for genre in parsed:
        if isinstance(genre, dict):
            genre = genre.get('name')
        if genre is not None:
            names.append(genre)
    return names


# 'genres' is never parsed after read_csv, so its cells are not lists yet.
# Parse it first; otherwise the list coercion below replaces every row with
# [] and genres never reach the soup.
master_dataset['genres'] = master_dataset['genres'].apply(_genre_names)

# Guarantee that every soup ingredient is a list so they can be concatenated.
for col in ['keywords', 'cast', 'director', 'genres']:
    master_dataset[col] = master_dataset[col].apply(lambda x: x if isinstance(x, list) else [])

# Row-wise list concatenation: keywords, then cast, then director, then genres.
master_dataset['soup'] = master_dataset['keywords'] + master_dataset['cast'] + master_dataset['director'] + master_dataset['genres']

# Flatten each token list into one space-separated string.
master_dataset['soup'] = master_dataset['soup'].apply(lambda tokens: ' '.join(map(str, tokens)))


In [18]:
# Preview the assembled soup strings of the first three movies.
master_dataset.loc[:, 'soup'].head(n=3)

0    jealousi toy boy friendship friend rivalri boy...
1    boardgam disappear basedonchildren'sbook newho...
2    fish bestfriend duringcreditssting oldmen walt...
Name: soup, dtype: object

In [19]:
# List every column now present, including the ones derived above.
print(master_dataset.keys())

Index(['movieId', 'imdbId', 'id', 'cast', 'crew', 'keywords', 'adult',
       'belongs_to_collection', 'budget', 'genres', 'homepage', 'imdb_id',
       'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'video', 'vote_average', 'vote_count', 'director',
       'main_director', 'soup'],
      dtype='object')


In [20]:
# Persist the enriched table without writing the row index as a column.
master_dataset.to_csv(path_or_buf='master_dataset_new.csv', index=False)