In [1]:
import json
import pandas as pd
import numpy as np
import re

In [2]:
# Directory prefix for the input files read by this notebook.
fileDir = "resources/"
# Read the Wikipedia JSON with an explicit encoding so parsing does not depend
# on the platform's default text encoding.
with open(f'{fileDir}wikipedia-movies.json', mode='r', encoding='utf-8') as file:
    wikiMoviesRaw = json.load(file)


In [3]:
# Build both CSV paths from the shared directory prefix, then load each file.
metadataPath = f'{fileDir}movies_metadata.csv'
ratingsPath = f'{fileDir}ratings.csv'
# low_memory=False parses the whole file at once rather than in chunks, which
# avoids mixed-type column inference.
kaggleMetadata = pd.read_csv(metadataPath, low_memory=False)
ratings = pd.read_csv(ratingsPath)

In [4]:
# Tabulate the raw Wikipedia records exactly as loaded, before any filtering.
wikiMoviesDF = pd.DataFrame(data=wikiMoviesRaw)

In [5]:
# Keep entries that name a director (under either key), carry an IMDb link,
# and do not have a "No. of episodes" key.
wiki_movies = list(filter(
    lambda entry: ('Director' in entry or 'Directed by' in entry)
    and 'imdb_link' in entry
    and 'No. of episodes' not in entry,
    wikiMoviesRaw,
))
len(wiki_movies)

7076

In [6]:
def clean_movie(movie):
    """Return a cleaned shallow copy of one Wikipedia movie record.

    Two passes are applied to the copy (the input mapping is not modified):

    1. Every alternate-language title key that is present is removed and
       gathered into a single ``'alternate_titles'`` dict (only added when at
       least one such key was found).
    2. Synonymous column names are renamed to one canonical name. Renames run
       in the listed order, so when several source keys map to the same
       target the value of the last one present wins.
    """
    cleaned = dict(movie)  # shallow copy so the caller's record is untouched

    alt_title_keys = (
        'Arabic', 'Cantonese', 'Chinese', 'French', 'Hangul', 'Hebrew',
        'Hepburn', 'Japanese', 'Literally', 'Mandarin', 'McCune-Reischauer',
        'Original title', 'Polish', 'Revised Romanization', 'Romanized',
        'Russian', 'Simplified', 'Traditional', 'Yiddish',
    )
    alternate = {}
    for language in alt_title_keys:
        if language in cleaned:
            alternate[language] = cleaned.pop(language)
    if alternate:
        cleaned['alternate_titles'] = alternate

    # (old name, canonical name) pairs, applied strictly in this order.
    renames = (
        ('Adaptation by', 'Writer(s)'),
        ('Country of origin', 'Country'),
        ('Directed by', 'Director'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Original release', 'Release date'),
        ('Music by', 'Composer(s)'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        # 'Released' is first folded into 'Release Date', which the next
        # entry then folds into 'Release date'.
        ('Released', 'Release Date'),
        ('Release Date', 'Release date'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)'),
    )
    for old_name, new_name in renames:
        if old_name in cleaned:
            cleaned[new_name] = cleaned.pop(old_name)
    return cleaned

In [1]:
# Needs `wiki_movies` and `clean_movie` to exist already; on a fresh kernel
# run the cells that define them first.
cleanMovie = list(map(clean_movie, wiki_movies))
wikiMoviesDF = pd.DataFrame(data=cleanMovie)

NameError: name 'wiki_movies' is not defined

In [None]:
# IMDb ids are "tt" followed by at least seven digits (newer titles have
# eight). Capture the whole digit run so longer ids are not truncated to a
# different, wrong id. expand=False returns a Series for the single group.
wikiMoviesDF['imdb_id'] = wikiMoviesDF['imdb_link'].str.extract(r'(tt\d{7,})', expand=False)


In [None]:
# Keep only the first row seen for each IMDb id (modifies the frame in place).
wikiMoviesDF.drop_duplicates(subset=['imdb_id'], keep='first', inplace=True)


In [None]:
# Keep only the columns that are null in fewer than 90% of the rows.
null_counts = wikiMoviesDF.isnull().sum()
max_nulls = len(wikiMoviesDF) * 0.9
wiki_columns_to_keep = [column for column in wikiMoviesDF.columns if null_counts[column] < max_nulls]
# .copy() makes the trimmed frame an independent object, so columns assigned
# to it later do not raise SettingWithCopyWarning.
wikiMoviesDF = wikiMoviesDF[wiki_columns_to_keep].copy()


In [None]:
# Work only with the rows that actually have a "Box office" value.
boxOffice = wikiMoviesDF.loc[:, "Box office"].dropna()

In [None]:
def isNotAString(x):
    """Return True when x is anything other than a plain str."""
    return type(x) != str

# Show the box-office entries that are not plain strings.
boxOffice[boxOffice.map(isNotAString)]


34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
126                [US$1,531,489, (domestic)]
130                          [US$, 4,803,039]
                        ...                  
6980               [$99.6, million, [4], [5]]
6994                   [$365.6, million, [1]]
6995                         [$53.8, million]
7015                     [$435, million, [7]]
7048                   [$529.3, million, [4]]
Name: Box office, Length: 135, dtype: object

In [None]:
# Form one: "$" + number (optional decimals) + "million"/"billion"; the
# optional second "i" also accepts the spellings "milion"/"bilion".
formOne = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
formOneMatches = boxOffice.str.contains(
    pat=formOne,
    flags=re.IGNORECASE,
    na=False,  # non-string entries count as "no match"
)
formOneMatches.sum()

3833

In [None]:
# Form two: "$" + 1-3 digits followed by one or more groups of three digits,
# each group introduced by a comma or a period.
formTwo = r'\$\s*\d{1,3}(?:[,\.]\d{3})+'
formTwoMatches = boxOffice.str.contains(
    pat=formTwo,
    flags=re.IGNORECASE,
    na=False,  # non-string entries count as "no match"
)
formTwoMatches.sum()

1528

In [None]:
# Entries that match neither of the two dollar formats.
boxOffice[~(formOneMatches | formTwoMatches)]

34      NaN
54      NaN
74      NaN
126     NaN
130     NaN
       ... 
6980    NaN
6994    NaN
6995    NaN
7015    NaN
7048    NaN
Name: Box office, Length: 154, dtype: object

In [None]:
# Some entries are lists of text fragments. The .str accessor yields NaN for
# non-strings, so without this join those values would be silently lost.
# This mirrors how the Budget column is prepared.
boxOffice = boxOffice.map(lambda x: ' '.join(x) if type(x) == list else x)
# Collapse ranges: everything from the "$" through the last dash that is not
# followed by a lowercase letter is replaced by a single "$".
boxOffice = boxOffice.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)


In [None]:
# Preview the extracted dollar strings. Match case-insensitively so this view
# agrees with the extraction that is actually stored in wikiMoviesDF['boxOffice'].
boxOffice.str.extract(f'({formOne}|{formTwo})', flags=re.IGNORECASE)

Unnamed: 0,0
0,$21.4 million
1,$2.7 million
2,"$57,718,089"
3,"$7,331,647"
4,"$6,939,946"
...,...
7070,$19.4 million
7071,$41.9 million
7072,$76.1 million
7073,$38.4 million


In [None]:
def parseDollars(string):
    """Convert a dollar-amount string to a float number of dollars.

    Recognised forms (matched case-insensitively at the start of the string):

    * ``$###.# million`` -> value * 1,000,000
    * ``$###.# billion`` -> value * 1,000,000,000
    * ``$###,###,###``   -> value with thousands separators removed

    Parameters
    ----------
    string : object
        The value to parse. Anything that is not a ``str`` yields NaN.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the input is not a string or
        matches none of the recognised forms.
    """
    # np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.
    if not isinstance(string, str):
        return np.nan
    # $###.# million and billion
    if re.match(r'\$\s*\d+\.?\d*\s*milli?on', string, flags=re.IGNORECASE):
        return float(re.sub(r'\$|\s|[a-zA-Z]', '', string)) * 1000000
    elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', string, flags=re.IGNORECASE):
        return float(re.sub(r'\$|\s|[a-zA-Z]', '', string)) * 1000000000
    #  $###,###,###.##
    elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)', string, flags=re.IGNORECASE):
        digits = re.sub(r'\$|,', '', string)
        # The pattern also accepts "." as a thousands separator. More than one
        # period cannot be a decimal point, so drop them all instead of
        # letting float() raise on e.g. "1.234.567".
        if digits.count('.') > 1:
            digits = digits.replace('.', '')
        return float(digits)
    else:
        return np.nan


In [None]:
# Pull the first dollar expression (either form) out of each entry, then
# convert it to a number of dollars.
boxOfficePattern = f'({formOne}|{formTwo})'
extractedBoxOffice = boxOffice.str.extract(boxOfficePattern, flags=re.IGNORECASE)[0]
wikiMoviesDF['boxOffice'] = extractedBoxOffice.apply(parseDollars)

In [None]:
# Remove the raw text column now that the parsed 'boxOffice' column exists.
wikiMoviesDF.drop(columns='Box office', inplace=True)

In [None]:
# Non-null budgets, with list-valued entries flattened to one space-joined string.
budget = wikiMoviesDF['Budget'].dropna().map(
    lambda entry: entry if type(entry) != list else ' '.join(entry)
)
# Collapse ranges: everything from the "$" through the last dash that is not
# followed by a lowercase letter is replaced by a single "$".
budget = budget.str.replace(pat=r'\$.*[-—–](?![a-z])', repl='$', regex=True)

In [None]:
# Flag budgets that match each dollar format, then count those matching neither.
matchesFormOne = budget.str.contains(pat=formOne, flags=re.IGNORECASE, na=False)
matchesFormTwo = budget.str.contains(pat=formTwo, flags=re.IGNORECASE, na=False)
len(budget[~(matchesFormOne | matchesFormTwo)])

38

In [None]:
# Strip bracketed numeric citation markers such as "[3]" and any whitespace
# that follows them.
budget = budget.str.replace(pat=r'\[\d+\]\s*', repl='', regex=True)

# NOTE: both masks were computed before the markers were stripped, so entries
# that only fit a dollar format after this cleanup are still listed here.
budget[~(matchesFormOne | matchesFormTwo)]

136                         Unknown
204     60 million Norwegian Kroner
478                         Unknown
973                     $34 million
1126                   $120 million
1226                        Unknown
1278                            HBO
1374                     £6,000,000
1397                     13 million
1480                   £2.8 million
1734                   CAD2,000,000
1913     PHP 85 million (estimated)
1948                    102,888,900
1953                   3,500,000 DM
1973                     ₤2,300,874
2281                     $14 milion
2451                     ₤6,350,000
3144                   € 40 million
3360                   $150 million
3418                        $218.32
3802                   £4.2 million
3906                            N/A
3959                    760,000 USD
4470                       19 crore
4641                    £17 million
5034                  $$200 million
5055                   $155 million
5419                    $40 

In [None]:
#wikiMoviesDF['budget']=budget.str.extract(f'({formOne}|{formTwo})', flags=re.IGNORECASE)[0]

In [None]:
# Preview the dollar expression (either form) extracted from each budget entry.
budgetPattern = f'({formOne}|{formTwo})'
budget.str.extract(budgetPattern, flags=re.IGNORECASE)

Unnamed: 0,0
0,$20 million
1,$6 million
2,$35 million
3,$12 million
4,$25 million
...,...
7070,
7071,$42 million
7072,$60 million
7073,$20 million
