In [15]:
import pandas as pd
from io import StringIO
import re

# Each row is parsed as MovieID, Year, Name. Only the first two commas are
# treated as field separators (presumably because titles may themselves
# contain commas -- confirm against the raw file), so they are rewritten to
# '|' and the result is read back as a pipe-delimited table.
for_pd = StringIO()
with open('../prize_dataset/movie_titles.csv', encoding = 'ISO-8859-1') as movie_titles:
    for_pd.writelines(
        '|'.join(raw_row.rstrip().split(',', 2)) + '\n'
        for raw_row in movie_titles
    )

# Rewind the in-memory buffer so read_csv starts at the first row.
for_pd.seek(0)

df = pd.read_csv(for_pd, sep='|', names=['MovieID', 'Year', 'Name'], header=None)
df

Unnamed: 0,MovieID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [16]:
import glob
import json
import os
import isodate


# Columns written into `df` for every JSON file; the order must match the
# value list assigned inside the loop below.
IMDB_COLUMNS = ['imdbID', 'AggregateAverageRating', 'NumRating', 'Directors', 'creators',
                'Genre', 'Keywords', 'description', 'duration', 'actors', 'contentRating']


def _join_names(entries):
    """Join the non-empty 'name' values of `entries` with ', '.

    `entries` is normally a list of dicts. A single dict is treated as a
    one-element list so that a lone person is not silently dropped.
    Returns None when `entries` is missing or malformed (best-effort).
    """
    if isinstance(entries, dict):
        entries = [entries]
    try:
        return ', '.join(x['name'] for x in entries if x.get('name', '') != '')
    except Exception:
        return None


def _join_genres(genre):
    """Return the genres as one ', '-separated string, or None if unavailable.

    A plain string is returned unchanged; joining it would otherwise split it
    into single characters ("Music" -> "M, u, s, i, c").
    """
    if isinstance(genre, str):
        return genre
    try:
        return ', '.join(genre)
    except Exception:
        return None


def _duration_minutes(iso_duration):
    """Convert an ISO 8601 duration string to whole minutes, or None on failure."""
    if iso_duration is None:
        return None
    try:
        return int(isodate.parse_duration(iso_duration).total_seconds() / 60)
    except Exception:
        # Unparseable value, or a duration without total_seconds().
        return None


def _rating_field(json_data, field):
    """Return json_data['aggregateRating'][field], or None if it is absent."""
    try:
        return json_data['aggregateRating'][field]
    except Exception:
        return None


# The match key (title text before the first comma) does not depend on the
# file being processed, so compute it once instead of once per JSON file.
title_keys = df['Name'].str.split(',').str[0]

# Sorted so that, when several files map to the same title, the one that
# wins is the same on every run (glob order is OS-dependent).
data = sorted(glob.glob('data/*.json'))
for file in data:
    # The title is the part of the file name before the first '_'; '|' is
    # mapped back to '/' (presumably it stood in for '/' in file names).
    file_name = os.path.basename(file).split('_')[0].replace("|", "/")

    # Explicit encoding: do not depend on the platform default for JSON.
    with open(file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # 'url' is required: a file without it raises KeyError, as before.
    imdbID = json_data['url']

    actors = _join_names(json_data.get('actor'))
    aggregateAvgRating = _rating_field(json_data, 'ratingValue')
    numRatings = _rating_field(json_data, 'ratingCount')
    contentRating = json_data.get('contentRating', '')
    creators = _join_names(json_data.get('creator'))
    description = json_data.get('description', '')
    directors = _join_names(json_data.get('director'))
    duration = _duration_minutes(json_data.get('duration'))  # in minutes
    genre = _join_genres(json_data.get('genre'))
    keywords = json_data.get('keywords')

    # Every row whose title key equals the file's title receives the values.
    df.loc[title_keys == file_name, IMDB_COLUMNS] = \
        [imdbID, aggregateAvgRating, numRatings, directors, creators,
         genre, keywords, description, duration, actors, contentRating]

In [17]:
# Persist the merged table. index=False leaves the positional row index out
# of the file. (A bare `df` before this call was dropped: under Jupyter's
# default settings only a cell's last expression is displayed, so it had no effect.)
df.to_csv('merge4.csv', index=False)

In [18]:
# Display the frame to spot-check the columns filled in by the JSON merge.
df

Unnamed: 0,MovieID,Year,Name,imdbID,AggregateAverageRating,NumRating,Directors,creators,Genre,Keywords,description,duration,actors,contentRating
0,1,2003.0,Dinosaur Planet,/title/tt0389605/,7.7,531.0,,,"Documentary, Animation, Family","feathered dinosaur,natural disaster,cg animati...",A four-episode animated series charting the ad...,,"Christian Slater, Ana Claudia Talancón, Bruno ...",Not Rated
1,2,2004.0,Isle of Man TT 2004 Review,,,,,,,,,,,
2,3,1997.0,Character,/title/tt0119448/,7.7,11026.0,Mike van Diem,"Ferdinand Bordewijk, Laurens Geels, Mike van Diem","Crime, Drama, Mystery","gunfight,street shootout,police shootout,fianc...","Jacob Katadreuffe lives mute with his mother, ...",122.0,"Pavlik Jansen op de Haar, Jan Decleir, Fedja v...",R
3,4,1994.0,Paula Abdul's Get Up & Dance,/title/tt0277226/,,,"Jonathan Dayton, Valerie Faris",,Music,"tap dancing,music video,20th century,1980s,pop...",Paula Abdul&apos;s music video collection with...,35.0,"Paula Abdul, Bill Bohl, Dee Caspary",
4,5,2004.0,The Rise and Fall of ECW,/title/tt0440751/,,,Kevin Dunn,Paul Heyman,"Documentary, Sport","eastern championship wrestling,drinking beer,s...",A documentary on the rise and fall of cult wre...,360.0,"Abdullah the Butcher, Donna Adamo, Bill Alfonso",TV-MA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,,,,,,,,,,,
17766,17767,2004.0,Fidel Castro: American Experience,,,,,,,,,,,
17767,17768,2000.0,Epoch,/title/tt0282466/,7.2,23.0,Vincent Lee,David Tzeng,"Action, Drama, Fantasy",,"The story takes place in the distant future, w...",,"Ho-Sung Pak, T.J. Storm, Robert Alonzo",
17768,17769,2003.0,The Company,/title/tt0335013/,6.2,6695.0,Robert Altman,"Neve Campbell, Barbara Turner","Drama, Music, Romance","snapped achilles trendon,female nudity,nudity,...",A young ballet dancer is poised to become the ...,112.0,"Neve Campbell, James Franco, Malcolm McDowell",PG-13


In [56]:
import subprocess

# AppleScript handler passed to `osascript -e`: argv item 1 is used as the
# notification title and argv item 2 as its text.
CMD = '''
on run argv
  display notification (item 2 of argv) with title (item 1 of argv)
end run
'''

def notify(title, text):
    """Run the AppleScript in CMD through `osascript` with title and text.

    The exit status of the subprocess is ignored and nothing is returned.
    """
    osascript_argv = ['osascript', '-e', CMD, title, text]
    subprocess.call(osascript_argv)

# Example: raise a notification once the preceding work has finished.
notify(title="Your script done", text="Heres an alert")