In [109]:
import pandas as pd
import json
import numpy as np
from collections import Counter

<h3>Import the datasets</h3>

In [110]:
movie_metadata = pd.read_csv("/Users/mazichang/Desktop/movie_classification/MovieSummaries/movie.metadata.tsv", sep="\t", header=None)
movie_plot = pd.read_csv("/Users/mazichang/Desktop/movie_classification/MovieSummaries/plot_summaries.txt", sep="\t", header=None)

movie_metadata.columns = ['Wiki_ID', 'Freebase_ID', 'name', 'release_date', 'box_office', 
                          'runtime', 'languages', 'contries', 'genres']
movie_plot.columns = ['Wiki_ID', 'plot']

<h3>Check the datasets</h3>

In [111]:
print(movie_metadata.shape)
movie_metadata.head()

(81741, 9)


Unnamed: 0,Wiki_ID,Freebase_ID,name,release_date,box_office,runtime,languages,contries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [112]:
print(movie_plot.shape)
movie_plot.head()

(42303, 2)


Unnamed: 0,Wiki_ID,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


<h3>Merge the datasets on Wiki ID and keep the useful rows</h3>

In [113]:
movie = pd.merge(movie_metadata, movie_plot, how = 'inner', on = 'Wiki_ID')
movie = movie[['Wiki_ID', 'name', 'genres', 'plot']]
movie.head()

Unnamed: 0,Wiki_ID,name,genres,plot
0,975900,Ghosts of Mars,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...
2,261236,A Woman in Flames,"{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,"{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,"{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a..."


<h3>Eliminate the freebase ID in genres</h3>

In [114]:
for index, row in movie.iterrows():
    genres = row["genres"]
    genres_dict = json.loads(genres)
    genres_list = list(genres_dict.values())
    genres_list_str = ",".join(genres_list)
    movie.loc[index,'genres'] = genres_list_str

In [115]:
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib
import matplotlib.pyplot as plt

<h3>Eliminate outliers (plots with less than 5 or more than 40 sentences)</h3>

In [116]:
invalid_record = []
for index, row in movie.iterrows():
    movie_plot = row['plot']
    number_of_sentences = len(sent_tokenize(movie_plot))
    if number_of_sentences <= 5 or number_of_sentences >= 40:
        invalid_record.append(index)
movie.drop(invalid_record, inplace=True)

<h3>Select 16 most popular genres</h3>

In [117]:
genres_parse_dict = dict()

genres_parse_dict['Drama'] = ['Drama']
genres_parse_dict['Comedy-drama'] = ['Comedy', 'Drama']
genres_parse_dict['Romantic drama'] = ['Romance', 'Drama']
genres_parse_dict['Comedy'] = ['Comedy']
genres_parse_dict['Comedy film'] = ['Comedy']
genres_parse_dict['Romantic comedy'] = ['Romance', 'Comedy']
genres_parse_dict['Romance Film'] = ['Romance']
genres_parse_dict['Thriller'] = ['Thriller']
genres_parse_dict['Psychological thriller'] = ['Thriller']
genres_parse_dict['Crime Thriller'] = ['Crime', 'Thriller']
genres_parse_dict['Action'] = ['Action Adventure']
genres_parse_dict['Action/Adventure'] = ['Action Adventure']
genres_parse_dict['Adventure'] = ['Action Adventure']
genres_parse_dict['World cinema'] = ['World cinema']
genres_parse_dict['Crime Fiction'] = ['Crime']
genres_parse_dict['Horror'] = ['Horror']
genres_parse_dict['Family Film'] = ['Family Film']
genres_parse_dict["Children's/Family"] = ['Family Film']
genres_parse_dict['Animation'] = ['Animation']
genres_parse_dict['Science Fiction'] = ['Science Fiction']
genres_parse_dict['Fantasy'] = ['Fantasy']
genres_parse_dict['Mystery'] = ['Mystery']
genres_parse_dict['War film'] = ['War film']
genres_parse_dict['Period piece'] = ['Period piece']
genres_parse_dict['Western'] = ['Western']

<h3>Only keep the movies with the selected genres</h3>

In [118]:
for index,rows in movie.iterrows():
    genres = movie.loc[index, 'genres'].split(",")
    genres_to_keep = []
    
    for x in genres:
        if x in genres_parse_dict.keys():
            genres_to_keep.extend(genres_parse_dict[x])
        else:
            continue
    
    genres_to_keep = list(set(genres_to_keep))
    
    if len(genres_to_keep) != 0:
        movie.loc[index, 'genres'] = ",".join(genres_to_keep)
    else:
        movie.loc[index, 'genres'] = np.nan

In [119]:
movie.dropna(inplace=True)
movie

Unnamed: 0,Wiki_ID,name,genres,plot
0,975900,Ghosts of Mars,"Action Adventure,Science Fiction,Thriller,Horror","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,Thriller,A series of murders of rich young women throug...
2,261236,A Woman in Flames,Drama,"Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,"Family Film,Action Adventure,World cinema,Fantasy","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,"Drama,Comedy,Romance","Adam, a San Francisco-based artist who works a..."
...,...,...,...,...
42194,1918494,State and Main,Comedy,Havoc is wrought on the inhabitants of a small...
42195,664006,Guilty as Sin,"Crime,Thriller",Jennifer Haines is an up-and-coming Chicago a...
42196,3868432,Into the Mirror,Horror,After accidentally causing the death of his pa...
42202,913762,The Super Dimension Fortress Macross II: Lover...,"Animation,Action Adventure,Drama,Science Fiction","The story takes place in the year 2092,The Sup..."


<h3>Sanity check: Ensure there are 16 selected genres</h3>

In [120]:
genres_list = []
for index,rows in movie.iterrows():
    for x in movie.loc[index, 'genres'].split(","):
        genres_list.append(x)

print(len(set(genres_list)))
for genre in set(genres_list):
    print(genre)

16
Action Adventure
Thriller
Western
Crime
Romance
Animation
Family Film
Comedy
War film
Fantasy
Mystery
Science Fiction
Drama
World cinema
Period piece
Horror


<h3>Save cleaned dataset to file</h3>

In [102]:
movie
movie.to_csv("movie_cleaned.csv", sep="\t", index=False)