In [575]:
import ast
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import nltk

ModuleNotFoundError: No module named 'nltk'

In [559]:
# Location of the movies metadata CSV, relative to the notebook's directory.
path = '../data/movies_metadata.csv'

# Load the full CSV into a DataFrame.
# NOTE(review): the output recorded below this cell looks like the tail of a
# pandas warning (possibly a mixed-type DtypeWarning) - confirm, and consider
# passing explicit dtypes or low_memory=False if so.
df = pd.read_csv(path)

  interactivity=interactivity, compiler=compiler, result=result)


## Data Preprocessing

In [560]:
# Keep only the four columns used in the rest of the preprocessing,
# in this fixed order.
selected_columns = ['release_date', 'title', 'overview', 'genres']
df = pd.concat([df[column] for column in selected_columns], axis=1)

# Remove rows that exactly repeat an earlier row (the first occurrence is kept).
duplicate_rows = df.loc[df.duplicated()]
df = df.drop(index=duplicate_rows.index)

In [561]:
# Show per-column non-null counts and dtypes to see how many values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45436 entries, 0 to 45465
Data columns (total 4 columns):
release_date    45349 non-null object
title           45430 non-null object
overview        44482 non-null object
genres          45436 non-null object
dtypes: object(4)
memory usage: 1.7+ MB


#### Drop the rows where release_date, title, or overview is NaN

In [562]:
# convert empty string to NaN
df['overview'].replace('', np.nan, inplace=True)
df.dropna(subset=['release_date', 'title', 'overview'], inplace=True)
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44405 entries, 0 to 45465
Data columns (total 4 columns):
release_date    44405 non-null object
title           44405 non-null object
overview        44405 non-null object
genres          44405 non-null object
dtypes: object(4)
memory usage: 1.7+ MB


Unnamed: 0,release_date,title,overview,genres
count,44405,44405,44405,44405
unique,17184,41303,44232,4045
top,2008-01-01,Cinderella,No overview found.,"[{'id': 18, 'name': 'Drama'}]"
freq,135,11,133,4904


#### Drop rows whose overview is a placeholder (e.g. "No overview found.") or blank

In [563]:
# Placeholder overviews start with "not available" or "no overview"
# (matched case-insensitively).
reg_404 = "^not available|^no overview"
overview_not_found = df['overview'].str.contains(reg_404, regex=True, flags=re.IGNORECASE)
# Overviews that consist only of whitespace carry no text either.
overview_blank = df['overview'].str.isspace()

# Both masks are aligned to the same (current) index, so combine them and drop
# once. Dropping in two steps would index the already-shrunk frame with a mask
# built on the old index, which pandas has to reindex (emitting a UserWarning).
rows_to_drop = df[overview_not_found | overview_blank].index
df = df.drop(index=rows_to_drop)
df.describe()

  


Unnamed: 0,release_date,title,overview,genres
count,44253,44253,44253,44253
unique,17159,41160,44223,4042
top,2008-01-01,Cinderella,A few funny little novels about different aspe...,"[{'id': 18, 'name': 'Drama'}]"
freq,132,11,3,4885


In [564]:
# Display the cleaned frame to eyeball the remaining rows.
df

Unnamed: 0,release_date,title,overview,genres
0,1995-10-30,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,1995-12-15,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,1995-12-22,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,1995-12-22,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,1995-02-10,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]"
...,...,...,...,...
45460,1991-05-13,Robin Hood,"Yet another version of the classic epic, with ...","[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name..."
45462,2011-11-17,Century of Birthing,An artist struggles to finish his work while a...,"[{'id': 18, 'name': 'Drama'}]"
45463,2003-08-01,Betrayal,"When one of her hits goes wrong, a professiona...","[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam..."
45464,1917-10-21,Satan Triumphant,"In a small town live two brothers, one a minis...",[]


#### Transform the `genres` column

In [565]:
def extract_genres(genres_str):
    """Return the genre names found in a stringified list of genre dicts.

    Args:
        genres_str: Text such as "[{'id': 18, 'name': 'Drama'}]", i.e. the
            repr of a list of dicts that each have a 'name' key.

    Returns:
        A list with the 'name' value of every element, in the original order
        (an empty list for "[]").

    Raises:
        json.JSONDecodeError: If the text is neither a Python literal nor,
            after swapping single quotes for double quotes, valid JSON.
        KeyError: If an element has no 'name' key.
    """
    try:
        # The text is a Python literal, so parse it as one. Blindly replacing
        # every ' with " breaks on names containing an apostrophe or a double
        # quote (e.g. "Children's").
        genres = ast.literal_eval(genres_str)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. JSON null/true/false): fall back to the
        # quote-swapping JSON parse so previously accepted input still works.
        genres = json.loads(genres_str.replace("'", '"'))
    return [elem['name'] for elem in genres]

In [566]:
# remove rows with no genres, since they don't provide any information
df.drop(df[df['genres'] == '[]'].index, inplace=True)

# transform genres from string to list
temp_genre = df['genres'].apply(extract_genres)

In [567]:
# test conversion to list went ok
g_set = set()
for i, row in df['genres'].iteritems():
    reg = ''
    for genre in temp_genre[i]:
        reg = reg + '(?=.*' + genre + ')'
        g_set.add(genre)
    if not re.search(reg, row) or len(temp_genre[i]) == 0:
        print('FAILED: at i =', i , row)
        print(reg)
        break

In [568]:
# Swap the raw genres strings for the parsed lists of genre names and show
# the resulting frame.
df = df.assign(genres=temp_genre)
df

Unnamed: 0,release_date,title,overview,genres
0,1995-10-30,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]"
1,1995-12-15,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]"
2,1995-12-22,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]"
3,1995-12-22,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]"
4,1995-02-10,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy]
...,...,...,...,...
45458,2000-10-03,The Burkittsville 7,A film archivist revisits the story of Rustin ...,[Horror]
45459,1995-01-01,Caged Heat 3000,It's the year 3000 AD. The world's most danger...,[Science Fiction]
45460,1991-05-13,Robin Hood,"Yet another version of the classic epic, with ...","[Drama, Action, Romance]"
45462,2011-11-17,Century of Birthing,An artist struggles to finish his work while a...,[Drama]


#### Transform genres with one-hot encoding

In [569]:
# Collect the distinct genre names across all movies. A set comprehension
# visits each name once; sum(lists, []) copies the growing list on every
# addition (quadratic in the number of rows), and wrapping the result in
# set() a second time was redundant.
all_genres = {genre for genres in df['genres'] for genre in genres}

In [570]:
# Number of distinct genre names found.
len(all_genres)

20

In [574]:
# all_genres = nltk.FreqDist(all_genres) 

# # create dataframe
# all_genres_df = pd.DataFrame({'Genre': list(all_genres.keys()), 
#                               'Count': list(all_genres.values())})

NameError: name 'nltk' is not defined