# Bring in raw dataset

In [2]:
import pandas as pd

df_movies_raw = pd.read_csv('./Data/movies.csv')

df_movies_raw.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [None]:
# check the shape of the dataframe
df_movies_raw.shape   #7668,15, which matches the shape of the original dataset

(7668, 15)

## Preliminary cleanup, just upon first glance

* clean the release date to be a standard date by separating into
    * release_date
    * country
* create separate field for release year in order to join to the inflation multiplier data
* get all distinct values which could be mistyped in other rows; e.g. Star Robert Redford (8), Star Robbert Redford (1); That would create a problem when grouping
* for the budget and gross where things are UK, figure out if they're using dollars or euros/pounds and adjust accordingly
    * confirmed IMDB data is in US $$
* compare the budget/gross to historic online data to determine if the inflation adjustments were already done
    * "Jaws 3D" shows $88M gross in both dataset and on wikipedia https://en.wikipedia.org/wiki/Jaws_3-D
    * "Things are Tough All Over" shows $21M gross in both dataset and wikipedia https://en.wikipedia.org/wiki/Things_Are_Tough_All_Over
    * This seems to indicate that the dataset has not already been adjusted for inflation, meaning we need to as part of our analysis
* determine prevalence of missing gross/budget data and see if it's worth filling in
 

In [None]:

# split the 'released' column into 'release_date' and 'country'
df_movies_raw[['release_date', 'country']] = df_movies_raw['released'].str.extract(r'^(.*?)(?:\s*\((.*?)\))?$')
df_movies_raw.head

In [14]:
df_movies_raw

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,release_date
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United States,19000000.0,46998772.0,Warner Bros.,146.0,"June 13, 1980"
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,"July 2, 1980"
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0,"June 20, 1980"
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0,"July 2, 1980"
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0,"July 25, 1980"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,United States,7000.0,,,90.0,"October 23, 2020"
7664,Dream Round,,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,United States,,,Cactus Blue Entertainment,90.0,"February 7, 2020"
7665,Saving Mbango,,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,Cameroon,58750.0,,Embi Productions,,"April 27, 2020"
7666,It's Just Us,,Drama,2020,"October 1, 2020 (United States)",,,James Randall,James Randall,Christina Roz,United States,15000.0,,,120.0,"October 1, 2020"


In [None]:
# transform release_date to datetime
df_movies_raw['release_date'] = pd.to_datetime(df_movies_raw['release_date'], errors='coerce')
df_movies_raw.T

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,release_date
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United States,19000000.0,46998772.0,Warner Bros.,146.0,1980-06-13
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,1980-07-02
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0,1980-06-20
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0,1980-07-02
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0,1980-07-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,United States,7000.0,,,90.0,2020-10-23
7664,Dream Round,,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,United States,,,Cactus Blue Entertainment,90.0,2020-02-07
7665,Saving Mbango,,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,Cameroon,58750.0,,Embi Productions,,2020-04-27
7666,It's Just Us,,Drama,2020,"October 1, 2020 (United States)",,,James Randall,James Randall,Christina Roz,United States,15000.0,,,120.0,2020-10-01


In [None]:
# add release year column to cleanly join later
df_movies_raw['release_year'] = df_movies_raw['release_date'].dt.year


In [None]:
# checking added column
df_movies_raw.T

In [39]:
# get adjusted_dollars.csv into a dataframe
df_adjusted_dollars = pd.read_csv('./Data/adjusted_dollars.csv')
df_adjusted_dollars.shape



(113, 2)

In [None]:
# merge df_movies_raw with df_adjusted_dollars on year and add InflationMultiplier field to df_movies_raw
df_movies = pd.merge(df_movies_raw, df_adjusted_dollars[['Year', 'InflationMultiplier']], left_on=df_movies_raw['release_year'], right_on='Year', how='left')
# drop the redundant Year field
df_movies = df_movies.drop(columns=['Year'])


In [37]:
# add a new column for adjusted gross
df_movies['adjusted_gross'] = df_movies['gross'] * df_movies['InflationMultiplier']



In [38]:
df_movies


Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,release_date,release_year,InflationMultiplier,adjusted_gross
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United States,19000000.0,46998772.0,Warner Bros.,146.0,1980-06-13,1980.0,3.89,1.828252e+08
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,1980-07-02,1980.0,3.89,2.289386e+08
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0,1980-06-20,1980.0,3.89,2.094279e+09
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0,1980-07-02,1980.0,3.89,3.246343e+08
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0,1980-07-25,1980.0,3.89,1.550023e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,United States,7000.0,,,90.0,2020-10-23,2020.0,1.24,
7664,Dream Round,,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,United States,,,Cactus Blue Entertainment,90.0,2020-02-07,2020.0,1.24,
7665,Saving Mbango,,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,Cameroon,58750.0,,Embi Productions,,2020-04-27,2020.0,1.24,
7666,It's Just Us,,Drama,2020,"October 1, 2020 (United States)",,,James Randall,James Randall,Christina Roz,United States,15000.0,,,120.0,2020-10-01,2020.0,1.24,


## Data validity checks

### In order to attribute box office $$ to any particular star, we have to ensure that any one star is recorded exactly the same, e.g. "Robert DeNiro" is not also listed as "Robert De Niro". To do this we will get a list of unique star names and do a vector angle analysis to give similarity score with threshhold >=7.5

### We need to do the same for writer, director, country, genre and company as well


In [64]:
def similarity_check(field):
    print(f"\nChecking field: {field}")
    unique_values = df_movies[field].dropna().unique()
    tfidf_matrix = TfidfVectorizer().fit_transform(unique_values)
    vectors = tfidf_matrix.toarray()
    cosine_matrix = cosine_similarity(vectors)
    similar_pairs = []
    for i in range(len(unique_values)):
        for j in range(i + 1, len(unique_values)):
            if cosine_matrix[i, j] > threshold and unique_values[i] != unique_values[j]:
                similar_pairs.append((unique_values[i], unique_values[j], cosine_matrix[i, j]))
    for val1, val2, score in similar_pairs:
        print(f"Similar: '{val1}' and '{val2}' with similarity score: {score:.2f}")

# list of fields to loop through to check for similar entries
fields_to_check = ['star', 'writer', 'director', 'country', 'genre', 'company']

for field in fields_to_check:
    similarity_check(field)


Checking field: star
Similar: 'Jason Scott Lee' and 'Jason Lee' with similarity score: 0.82
Similar: 'Michael Jordan' and 'Michael B. Jordan' with similarity score: 1.00

Checking field: writer
Similar: 'Richard Matheson' and 'Richard Christian Matheson' with similarity score: 0.80
Similar: 'Lawrence J. Block' and 'Lawrence Block' with similarity score: 1.00
Similar: 'Bruce Rubin' and 'Bruce Joel Rubin' with similarity score: 0.83
Similar: 'Robert Katz' and 'A L Katz' with similarity score: 0.85
Similar: 'W.S. Gilbert' and 'David Gilbert' with similarity score: 0.86
Similar: 'Daniel Petrie Jr.' and 'Daniel Petrie' with similarity score: 0.84
Similar: 'William Kennedy' and 'William P. Kennedy' with similarity score: 1.00
Similar: 'Peter Prince' and 'Prince' with similarity score: 0.82
Similar: 'Paul Hogan' and 'P.J. Hogan' with similarity score: 0.83
Similar: 'S.S. Wilson' and 'Michael G. Wilson' with similarity score: 0.83
Similar: 'S.S. Wilson' and 'David C. Wilson' with similarity s

In [63]:

# list of fields to loop through to check for similar entries
fields_to_check = ['star', 'writer', 'director', 'country', 'genre', 'company']

for field in fields_to_check:
    similarity_check(field)
    
threshold = 0.80

for field in fields_to_check:
    print(f"\nChecking field: {field}")
    unique_values = df_movies[field].dropna().unique()
    tfidf_matrix = TfidfVectorizer().fit_transform(unique_values)
    vectors = tfidf_matrix.toarray()
    cosine_matrix = cosine_similarity(vectors)
    similar_pairs = []
    for i in range(len(unique_values)):
        for j in range(i + 1, len(unique_values)):
            if cosine_matrix[i, j] > threshold and unique_values[i] != unique_values[j]:
                similar_pairs.append((unique_values[i], unique_values[j], cosine_matrix[i, j]))
    for val1, val2, score in similar_pairs:
        print(f"Similar: '{val1}' and '{val2}' with similarity score: {score:.2f}")

NameError: name 'similarity_check' is not defined

## analysis of the companies reveals that major studios have subsidiaries that should be folded in with their parents for the purpose of analysis

In [None]:
# create new field in df_movies called 'company_grouped' to hold the company name initially
df_movies['company_grouped'] = df_movies['company']

# update company_grouped = 'Huayi Brothers' where company contains 'Huayi Brothers'
df_movies.loc[df_movies['company'].str.contains('Huayi Brothers', na=False), 'company_grouped'] = 'Huayi Brothers'

# update company_grouped = 'Cannon Films' where company contains 'Cannon'
df_movies.loc[df_movies['company'].str.contains('Cannon', na=False), 'company_grouped'] = 'Cannon Films'

# update company_grouped = 'Warner Brothers' where company contains 'Warner Bro'
df_movies.loc[df_movies['company'].str.contains('Warner Bro', na=False), 'company_grouped'] = 'Warner Brothers'

# update company_grouped = 'lucasfilm' where company contains 'lucasfilm'
df_movies.loc[df_movies['company'].str.contains('Lucasfilm', na=False), 'company_grouped'] = 'Luscasfilm'

# update company_grouped = 'United Artists' where company contains 'lucasfilm'
df_movies.loc[df_movies['company'].str.contains('United Artists', na=False), 'company_grouped'] = 'United Artists'

# update company_grouped = 'Walt Disney' where company contains 'Walt Disney'
df_movies.loc[df_movies['company'].str.contains('Walt Disney', na=False), 'company_grouped'] = 'Walt Disney'

# update company_grouped = 'Samuel Goldwyn' where company contains 'Samuel Goldwyn'
df_movies.loc[df_movies['company'].str.contains('Samuel Goldwyn', na=False), 'company_grouped'] = 'Samual Goldwyn'

# update company_grouped = 'Brownstone Productions' where company contains 'brownstone'
df_movies.loc[df_movies['company'].str.contains('Brownstone', na=False), 'company_grouped'] = 'Brownstone Productions'

# update company_grouped = 'HBO' where company contains 'HBO'
df_movies.loc[df_movies['company'].str.contains('HBO', na=False), 'company_grouped'] = 'HBO'

# update company_grouped = 'Polygram' where company contains 'Polygram'
df_movies.loc[df_movies['company'].str.contains('PolyGram', na=False), 'company_grouped'] = 'Polygram Filmed Entertainment'

# update company_grouped = 'Anapurna' where company contains 'Anapurna'
df_movies.loc[df_movies['company'].str.contains('Anapurna', na=False), 'company_grouped'] = 'Anapurna'

# update company_grouped = 'Penta' where company contains 'Anapurna'
df_movies.loc[df_movies['company'].str.contains('Penta', na=False), 'company_grouped'] = 'Penta'

# update company_grouped = 'Ben-Ami/Leeds Productions' where company contains 'Ben-Ami'
df_movies.loc[df_movies['company'].str.contains('Ben-Ami', na=False), 'company_grouped'] = 'Ben-Ami/Leeds Productions'

# update company_grouped = '21st Century Films' where company contains '21st Century Film'
df_movies.loc[df_movies['company'].str.contains('21st Century Films', na=False), 'company_grouped'] = '21st Century Film'

# update company_grouped = 'Lions Gate' where company contains 'Lions Gate'
df_movies.loc[df_movies['company'].str.contains('Lions Gate', na=False), 'company_grouped'] = 'Lions Gate'

# update company_grouped = 'Alliance' where company contains 'Alliance
df_movies.loc[df_movies['company'].str.contains('Alliance', na=False), 'company_grouped'] = 'Alliance'

# update company_grouped = 'Dreamworks' where company contains 'Dreamworks
df_movies.loc[df_movies['company'].str.contains('Dreamworks', na=False), 'company_grouped'] = 'Dreamworks'

# update company_grouped = 'IFC' where company contains 'IFC
df_movies.loc[df_movies['company'].str.contains('IFC', na=False), 'company_grouped'] = 'IFC'

# update company_grouped = 'Abandon' where company contains 'IFC
df_movies.loc[df_movies['company'].str.contains('Abandon', na=False), 'company_grouped'] = 'Abandon'

# update company_grouped = 'Warp' where company contains 'Warp
df_movies.loc[df_movies['company'].str.contains('Warp', na=False), 'company_grouped'] = 'Warp'

# update company_grouped = 'Filmax' where company contains 'Warp
df_movies.loc[df_movies['company'].str.contains('Filmax', na=False), 'company_grouped'] = 'Filmax'

# update company_grouped = 'Filmax' where company contains 'Warp
df_movies.loc[df_movies['company'].str.contains('Dino De Laurentiis ', na=False), 'company_grouped'] = 'Dino De Laurentiis Company'

# re-do the similarity analisys on company_grouped
fields_to_check = ['company_grouped']
threshold = 0.80

for field in fields_to_check:
    print(f"\nChecking field: {field}")
    #unique_values = df_movies[field].dropna().unique() & ~df_movies[field].str.contains('.')][field].unique()
    unique_values = df_movies[df_movies[field].notna() & ~df_movies[field].astype(str).str.contains('.', regex=False)][field].unique()

    tfidf_matrix = TfidfVectorizer().fit_transform(unique_values)
    vectors = tfidf_matrix.toarray()
    cosine_matrix = cosine_similarity(vectors)
    similar_pairs = []
    for i in range(len(unique_values)):
        for j in range(i + 1, len(unique_values)):
            if cosine_matrix[i, j] > threshold and unique_values[i] != unique_values[j]:
                similar_pairs.append((unique_values[i], unique_values[j], cosine_matrix[i, j]))
    for val1, val2, score in similar_pairs:
        print(f"Similar: '{val1}' and '{val2}' with similarity score: {score:.2f}")


Checking field: company_grouped
Similar: 'Columbia Pictures' and 'Columbia Films' with similarity score: 0.84
Similar: 'Rastar Pictures' and 'Rastar Films' with similarity score: 0.86
Similar: 'Rastar Pictures' and 'Rastar Productions' with similarity score: 0.86
Similar: 'Twentieth Century Fox' and 'Twentieth Century Fox Animation' with similarity score: 0.89
Similar: 'Rastar Films' and 'Rastar Productions' with similarity score: 0.88
Similar: 'Broadway Productions' and 'Broadway Pictures' with similarity score: 0.86
Similar: 'Renaissance Pictures' and 'Renaissance Films' with similarity score: 0.87
Similar: 'Gaumont' and 'Gaumont International' with similarity score: 0.84
Similar: 'Dino De Laurentiis Company' and 'Dino de Laurentiis Communications' with similarity score: 0.83
Similar: 'Embassy Pictures' and 'Embassy International Pictures' with similarity score: 0.86
Similar: 'R&R Films' and 'A&M Films' with similarity score: 1.00
Similar: 'Channel Four Film' and 'Channel Four Films

In [52]:
# show me records from df_movies where company name contains 'Brownstone'
df_movies[df_movies['company'].str.contains('Brownstone', na=False)]



Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,release_date,release_year,InflationMultiplier,adjusted_gross,company_grouped
6048,Pitch Perfect,PG-13,Comedy,2012,"October 5, 2012 (United States)",7.1,285000.0,Jason Moore,Kay Cannon,Anna Kendrick,United States,17000000.0,115350426.0,Brownstone Productions (II),112.0,2012-10-05,2012.0,1.39,160337100.0,Brownstone Productions (II)
6689,Pitch Perfect 2,PG-13,Comedy,2015,"May 15, 2015 (United States)",6.4,146000.0,Elizabeth Banks,Kay Cannon,Anna Kendrick,United States,29000000.0,287144079.0,Brownstone Productions (II),115.0,2015-05-15,2015.0,1.35,387644500.0,Brownstone Productions (II)
7095,Pitch Perfect 3,PG-13,Comedy,2017,"December 22, 2017 (United States)",5.8,58000.0,Trish Sie,Kay Cannon,Anna Kendrick,United States,45000000.0,185400345.0,Brownstone Entertainment (II),93.0,2017-12-22,2017.0,1.31,242874500.0,Brownstone Entertainment (II)
