# Bring in raw dataset

In [2]:
import pandas as pd

df_movies_raw = pd.read_csv('./Data/movies.csv')

df_movies_raw.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [3]:
# check the shape of the dataframe
df_movies_raw.shape   #7668,15, which matches the shape of the original dataset

(7668, 15)

## Preliminary cleanup, just upon first glance

* clean the release date to be a standard date by separating into
    * release_date
    * country
* create separate field for release year in order to join to the inflation multiplier data
* get all distinct values which could be mistyped in other rows; e.g. Star Robert Redford (8), Star Robbert Redford (1); That would create a problem when grouping
* for the budget and gross where things are UK, figure out if they're using dollars or euros/pounds and adjust accordingly
    * confirmed IMDB data is in US $$
* compare the budget/gross to historic online data to determine if the inflation adjustments were already done
    * "Jaws 3D" shows $88M gross in both dataset and on wikipedia https://en.wikipedia.org/wiki/Jaws_3-D
    * "Things are Tough All Over" shows $21M gross in both dataset and wikipedia https://en.wikipedia.org/wiki/Things_Are_Tough_All_Over
    * This seems to indicate that the dataset has not already been adjusted for inflation, meaning we need to as part of our analysis
* determine prevalence of missing gross/budget data and see if it's worth filling in
 

In [4]:

# split the 'released' column into 'release_date' and 'country'
df_movies_raw[['release_date', 'country']] = df_movies_raw['released'].str.extract(r'^(.*?)(?:\s*\((.*?)\))?$')
df_movies_raw.head

<bound method NDFrame.head of                                                 name rating      genre  year  \
0                                        The Shining      R      Drama  1980   
1                                    The Blue Lagoon      R  Adventure  1980   
2     Star Wars: Episode V - The Empire Strikes Back     PG     Action  1980   
3                                          Airplane!     PG     Comedy  1980   
4                                         Caddyshack      R     Comedy  1980   
...                                              ...    ...        ...   ...   
7663                                    More to Life    NaN      Drama  2020   
7664                                     Dream Round    NaN     Comedy  2020   
7665                                   Saving Mbango    NaN      Drama  2020   
7666                                    It's Just Us    NaN      Drama  2020   
7667                                       Tee em el    NaN     Horror  2020   

         

In [5]:
df_movies_raw

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,release_date
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United States,19000000.0,46998772.0,Warner Bros.,146.0,"June 13, 1980"
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,"July 2, 1980"
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0,"June 20, 1980"
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0,"July 2, 1980"
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0,"July 25, 1980"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,More to Life,,Drama,2020,"October 23, 2020 (United States)",3.1,18.0,Joseph Ebanks,Joseph Ebanks,Shannon Bond,United States,7000.0,,,90.0,"October 23, 2020"
7664,Dream Round,,Comedy,2020,"February 7, 2020 (United States)",4.7,36.0,Dusty Dukatz,Lisa Huston,Michael Saquella,United States,,,Cactus Blue Entertainment,90.0,"February 7, 2020"
7665,Saving Mbango,,Drama,2020,"April 27, 2020 (Cameroon)",5.7,29.0,Nkanya Nkwai,Lynno Lovert,Onyama Laura,Cameroon,58750.0,,Embi Productions,,"April 27, 2020"
7666,It's Just Us,,Drama,2020,"October 1, 2020 (United States)",,,James Randall,James Randall,Christina Roz,United States,15000.0,,,120.0,"October 1, 2020"


In [6]:
# transform release_date to datetime
df_movies_raw['release_date'] = pd.to_datetime(df_movies_raw['release_date'], errors='coerce')
df_movies_raw.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7658,7659,7660,7661,7662,7663,7664,7665,7666,7667
name,The Shining,The Blue Lagoon,Star Wars: Episode V - The Empire Strikes Back,Airplane!,Caddyshack,Friday the 13th,The Blues Brothers,Raging Bull,Superman II,The Long Riders,...,Black Wall Street Burning,I Am Fear,Aloha Surf Hotel,Love by Drowning,The Robinsons,More to Life,Dream Round,Saving Mbango,It's Just Us,Tee em el
rating,R,R,PG,PG,R,R,R,R,PG,R,...,R,Not Rated,,R,,,,,,
genre,Drama,Adventure,Action,Comedy,Comedy,Horror,Action,Biography,Action,Biography,...,Drama,Horror,Comedy,Drama,Action,Drama,Comedy,Drama,Drama,Horror
year,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,...,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020
released,"June 13, 1980 (United States)","July 2, 1980 (United States)","June 20, 1980 (United States)","July 2, 1980 (United States)","July 25, 1980 (United States)","May 9, 1980 (United States)","June 20, 1980 (United States)","December 19, 1980 (United States)","June 19, 1981 (United States)","May 16, 1980 (United States)",...,"February 7, 2020 (United States)","March 3, 2020 (United States)","November 5, 2020 (United States)","November 6, 2020 (United States)","November 10, 2020 (United States)","October 23, 2020 (United States)","February 7, 2020 (United States)","April 27, 2020 (Cameroon)","October 1, 2020 (United States)","August 19, 2020 (United States)"
score,8.4,5.8,8.7,7.7,7.3,6.4,7.9,8.2,6.8,7.0,...,6.6,3.4,7.1,,,3.1,4.7,5.7,,5.7
votes,927000.0,65000.0,1200000.0,221000.0,108000.0,123000.0,188000.0,330000.0,101000.0,10000.0,...,35.0,447.0,14.0,,,18.0,36.0,29.0,,7.0
director,Stanley Kubrick,Randal Kleiser,Irvin Kershner,Jim Abrahams,Harold Ramis,Sean S. Cunningham,John Landis,Martin Scorsese,Richard Lester,Walter Hill,...,Marcus Brown,Kevin Shulman,Stefan C. Schaefer,Justin Kreinbrink,Directors,Joseph Ebanks,Dusty Dukatz,Nkanya Nkwai,James Randall,Pereko Mosia
writer,Stephen King,Henry De Vere Stacpoole,Leigh Brackett,Jim Abrahams,Brian Doyle-Murray,Victor Miller,Dan Aykroyd,Jake LaMotta,Jerry Siegel,Bill Bryden,...,Dekoven Riggins,Kevin Shulman,Stefan C. Schaefer,C.E. Poverman,Aleks Alifirenko Jr.,Joseph Ebanks,Lisa Huston,Lynno Lovert,James Randall,Pereko Mosia
star,Jack Nicholson,Brooke Shields,Mark Hamill,Robert Hays,Chevy Chase,Betsy Palmer,John Belushi,Robert De Niro,Gene Hackman,David Carradine,...,Dan Belcher,Kristina Klebe,Augie Tulba,Nicky Whelan,Billy Hartmann,Shannon Bond,Michael Saquella,Onyama Laura,Christina Roz,Siyabonga Mabaso


In [7]:
# add release year column to cleanly join later
df_movies_raw['release_year'] = df_movies_raw['release_date'].dt.year


In [8]:
# checking added column
df_movies_raw.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7658,7659,7660,7661,7662,7663,7664,7665,7666,7667
name,The Shining,The Blue Lagoon,Star Wars: Episode V - The Empire Strikes Back,Airplane!,Caddyshack,Friday the 13th,The Blues Brothers,Raging Bull,Superman II,The Long Riders,...,Black Wall Street Burning,I Am Fear,Aloha Surf Hotel,Love by Drowning,The Robinsons,More to Life,Dream Round,Saving Mbango,It's Just Us,Tee em el
rating,R,R,PG,PG,R,R,R,R,PG,R,...,R,Not Rated,,R,,,,,,
genre,Drama,Adventure,Action,Comedy,Comedy,Horror,Action,Biography,Action,Biography,...,Drama,Horror,Comedy,Drama,Action,Drama,Comedy,Drama,Drama,Horror
year,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,...,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020
released,"June 13, 1980 (United States)","July 2, 1980 (United States)","June 20, 1980 (United States)","July 2, 1980 (United States)","July 25, 1980 (United States)","May 9, 1980 (United States)","June 20, 1980 (United States)","December 19, 1980 (United States)","June 19, 1981 (United States)","May 16, 1980 (United States)",...,"February 7, 2020 (United States)","March 3, 2020 (United States)","November 5, 2020 (United States)","November 6, 2020 (United States)","November 10, 2020 (United States)","October 23, 2020 (United States)","February 7, 2020 (United States)","April 27, 2020 (Cameroon)","October 1, 2020 (United States)","August 19, 2020 (United States)"
score,8.4,5.8,8.7,7.7,7.3,6.4,7.9,8.2,6.8,7.0,...,6.6,3.4,7.1,,,3.1,4.7,5.7,,5.7
votes,927000.0,65000.0,1200000.0,221000.0,108000.0,123000.0,188000.0,330000.0,101000.0,10000.0,...,35.0,447.0,14.0,,,18.0,36.0,29.0,,7.0
director,Stanley Kubrick,Randal Kleiser,Irvin Kershner,Jim Abrahams,Harold Ramis,Sean S. Cunningham,John Landis,Martin Scorsese,Richard Lester,Walter Hill,...,Marcus Brown,Kevin Shulman,Stefan C. Schaefer,Justin Kreinbrink,Directors,Joseph Ebanks,Dusty Dukatz,Nkanya Nkwai,James Randall,Pereko Mosia
writer,Stephen King,Henry De Vere Stacpoole,Leigh Brackett,Jim Abrahams,Brian Doyle-Murray,Victor Miller,Dan Aykroyd,Jake LaMotta,Jerry Siegel,Bill Bryden,...,Dekoven Riggins,Kevin Shulman,Stefan C. Schaefer,C.E. Poverman,Aleks Alifirenko Jr.,Joseph Ebanks,Lisa Huston,Lynno Lovert,James Randall,Pereko Mosia
star,Jack Nicholson,Brooke Shields,Mark Hamill,Robert Hays,Chevy Chase,Betsy Palmer,John Belushi,Robert De Niro,Gene Hackman,David Carradine,...,Dan Belcher,Kristina Klebe,Augie Tulba,Nicky Whelan,Billy Hartmann,Shannon Bond,Michael Saquella,Onyama Laura,Christina Roz,Siyabonga Mabaso


In [9]:
# get adjusted_dollars.csv into a dataframe
df_adjusted_dollars = pd.read_csv('./Data/adjusted_dollars.csv')
df_adjusted_dollars.shape



(113, 2)

In [10]:
# merge df_movies_raw with df_adjusted_dollars on year and add InflationMultiplier field to df_movies_raw
df_movies = pd.merge(df_movies_raw, df_adjusted_dollars[['Year', 'InflationMultiplier']], left_on=df_movies_raw['release_year'], right_on='Year', how='left')
# drop the redundant Year field
df_movies = df_movies.drop(columns=['Year'])


In [11]:
# add a new column for adjusted gross
df_movies['adjusted_gross'] = df_movies['gross'] * df_movies['InflationMultiplier']



## Data validity checks

### In order to attribute box office $$ to any particular star, we have to ensure that any one star is recorded exactly the same, e.g. "Robert DeNiro" is not also listed as "Robert De Niro". To do this we will get a list of unique star names and do a vector angle analysis to give similarity score with threshhold >=7.5

### We need to do the same for writer, director, country, genre and company as well


In [18]:

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

threshold = 0.80

def similarity_check(field):
    print(f"\nChecking field: {field}")
    unique_values = df_movies[field].dropna().unique()
    tfidf_matrix = TfidfVectorizer().fit_transform(unique_values)
    vectors = tfidf_matrix.toarray()
    cosine_matrix = cosine_similarity(vectors)
    similar_pairs = []
    for i in range(len(unique_values)):
        for j in range(i + 1, len(unique_values)):
            if cosine_matrix[i, j] > threshold and unique_values[i] != unique_values[j]:
                similar_pairs.append((unique_values[i], unique_values[j], cosine_matrix[i, j]))
    for val1, val2, score in similar_pairs:
        print(f"Similar: '{val1}' and '{val2}' with similarity score: {score:.2f}")

# list of fields to loop through to check for similar entries
fields_to_check = ['star', 'writer', 'director', 'country', 'genre', 'company']

for field in fields_to_check:
    similarity_check(field)


Checking field: star
Similar: 'Jason Scott Lee' and 'Jason Lee' with similarity score: 0.82
Similar: 'Michael Jordan' and 'Michael B. Jordan' with similarity score: 1.00

Checking field: writer
Similar: 'Richard Matheson' and 'Richard Christian Matheson' with similarity score: 0.80
Similar: 'Lawrence J. Block' and 'Lawrence Block' with similarity score: 1.00
Similar: 'Bruce Rubin' and 'Bruce Joel Rubin' with similarity score: 0.83
Similar: 'Robert Katz' and 'A L Katz' with similarity score: 0.85
Similar: 'W.S. Gilbert' and 'David Gilbert' with similarity score: 0.86
Similar: 'Daniel Petrie Jr.' and 'Daniel Petrie' with similarity score: 0.84
Similar: 'William Kennedy' and 'William P. Kennedy' with similarity score: 1.00
Similar: 'Peter Prince' and 'Prince' with similarity score: 0.82
Similar: 'Paul Hogan' and 'P.J. Hogan' with similarity score: 0.83
Similar: 'S.S. Wilson' and 'Michael G. Wilson' with similarity score: 0.83
Similar: 'S.S. Wilson' and 'David C. Wilson' with similarity s

## analysis of the companies reveals that major studios have subsidiaries that should be folded in with their parents for the purpose of analysis

In [20]:
# create new field in df_movies called 'company_grouped' to hold the company name initially
df_movies['company_grouped'] = df_movies['company']

# update company_grouped = 'Huayi Brothers' where company contains 'Huayi Brothers'
df_movies.loc[df_movies['company'].str.contains('Huayi Brothers', na=False), 'company_grouped'] = 'Huayi Brothers'

# update company_grouped = 'Cannon Films' where company contains 'Cannon'
df_movies.loc[df_movies['company'].str.contains('Cannon', na=False), 'company_grouped'] = 'Cannon Films'

# update company_grouped = 'Warner Brothers' where company contains 'Warner Bro'
df_movies.loc[df_movies['company'].str.contains('Warner Bro', na=False), 'company_grouped'] = 'Warner Brothers'

# update company_grouped = 'lucasfilm' where company contains 'lucasfilm'
df_movies.loc[df_movies['company'].str.contains('Lucasfilm', na=False), 'company_grouped'] = 'Luscasfilm'

# update company_grouped = 'United Artists' where company contains 'lucasfilm'
df_movies.loc[df_movies['company'].str.contains('United Artists', na=False), 'company_grouped'] = 'United Artists'

# update company_grouped = 'Walt Disney' where company contains 'Walt Disney'
df_movies.loc[df_movies['company'].str.contains('Walt Disney', na=False), 'company_grouped'] = 'Walt Disney'

# update company_grouped = 'Samuel Goldwyn' where company contains 'Samuel Goldwyn'
df_movies.loc[df_movies['company'].str.contains('Samuel Goldwyn', na=False), 'company_grouped'] = 'Samual Goldwyn'

# update company_grouped = 'Brownstone Productions' where company contains 'brownstone'
df_movies.loc[df_movies['company'].str.contains('Brownstone', na=False), 'company_grouped'] = 'Brownstone Productions'

# update company_grouped = 'HBO' where company contains 'HBO'
df_movies.loc[df_movies['company'].str.contains('HBO', na=False), 'company_grouped'] = 'HBO'

# update company_grouped = 'Polygram' where company contains 'Polygram'
df_movies.loc[df_movies['company'].str.contains('PolyGram', na=False), 'company_grouped'] = 'Polygram Filmed Entertainment'

# update company_grouped = 'Anapurna' where company contains 'Anapurna'
df_movies.loc[df_movies['company'].str.contains('Anapurna', na=False), 'company_grouped'] = 'Anapurna'

# update company_grouped = 'Penta' where company contains 'Anapurna'
df_movies.loc[df_movies['company'].str.contains('Penta', na=False), 'company_grouped'] = 'Penta'

# update company_grouped = 'Ben-Ami/Leeds Productions' where company contains 'Ben-Ami'
df_movies.loc[df_movies['company'].str.contains('Ben-Ami', na=False), 'company_grouped'] = 'Ben-Ami/Leeds Productions'

# update company_grouped = '21st Century Films' where company contains '21st Century Film'
df_movies.loc[df_movies['company'].str.contains('21st Century Films', na=False), 'company_grouped'] = '21st Century Film'

# update company_grouped = 'Lions Gate' where company contains 'Lions Gate'
df_movies.loc[df_movies['company'].str.contains('Lions Gate', na=False), 'company_grouped'] = 'Lions Gate'

# update company_grouped = 'Alliance' where company contains 'Alliance
df_movies.loc[df_movies['company'].str.contains('Alliance', na=False), 'company_grouped'] = 'Alliance'

# update company_grouped = 'Dreamworks' where company contains 'Dreamworks
df_movies.loc[df_movies['company'].str.contains('Dreamworks', na=False), 'company_grouped'] = 'DreamWorks'

# update company_grouped = 'IFC' where company contains 'IFC
df_movies.loc[df_movies['company'].str.contains('IFC', na=False), 'company_grouped'] = 'IFC'

# update company_grouped = 'Abandon' where company contains 'IFC
df_movies.loc[df_movies['company'].str.contains('Abandon', na=False), 'company_grouped'] = 'Abandon'

# update company_grouped = 'Warp' where company contains 'Warp
df_movies.loc[df_movies['company'].str.contains('Warp', na=False), 'company_grouped'] = 'Warp'

# update company_grouped = 'Filmax' where company contains 'Filmax
df_movies.loc[df_movies['company'].str.contains('Filmax', na=False), 'company_grouped'] = 'Filmax'

# update company_grouped = 'Dino De Laurentis' where company contains 'Dino de Laurentiis
df_movies.loc[df_movies['company'].str.contains('Dino De Laurentiis ', na=False), 'company_grouped'] = 'Dino De Laurentis Company'

# update company_grouped = '21st Century' where company contains '21st Century'
df_movies.loc[df_movies['company'].str.contains('21st Century', na=False), 'company_grouped'] = '21st Century'

# update company_grouped = 'BBC' where company contains '21st Century'
df_movies.loc[df_movies['company'].str.contains('BBC', na=False), 'company_grouped'] = 'BBC'

# update company_grouped = 'Embassy' where company contains 'Embassy'
df_movies.loc[df_movies['company'].str.contains('Embassy', na=False), 'company_grouped'] = 'Embassy'

# update company_grouped = 'Guber-Peters Company' where company contains 'Guber-Peters Company'
df_movies.loc[df_movies['company'].str.contains('Guber-Peters Company', na=False), 'company_grouped'] = 'Guber-Peters Company'

# update company_grouped = 'Pathé' where company contains 'Pathé'
df_movies.loc[df_movies['company'].str.contains('Pathé', na=False), 'company_grouped'] = 'Pathé'

# update company_grouped = 'Mirage' where company contains 'Mirage'
df_movies.loc[df_movies['company'].str.contains('Mirage', na=False), 'company_grouped'] = 'Mirage'

# update company_grouped = 'Pressman' where company contains 'Pressman'
df_movies.loc[df_movies['company'].str.contains('Pressman', na=False), 'company_grouped'] = 'Pressman'

# update company_grouped = 'Carolco' where company contains 'Carolco'
df_movies.loc[df_movies['company'].str.contains('Carolco', na=False), 'company_grouped'] = 'Carolco'

# update company_grouped = 'Annapurna' where company contains 'Annapurna'
df_movies.loc[df_movies['company'].str.contains('Annapurna', na=False), 'company_grouped'] = 'Annapurna'

# update company_grouped = 'Constantin' where company contains 'Constantin'
df_movies.loc[df_movies['company'].str.contains('Constantin', na=False), 'company_grouped'] = 'Constantin'

# update company_grouped = 'New World' where company contains 'New World'
df_movies.loc[df_movies['company'].str.contains('New World', na=False), 'company_grouped'] = 'New World'

# update company_grouped = 'Fidélité' where company contains 'Fidélité'
df_movies.loc[df_movies['company'].str.contains('Fidélité', na=False), 'company_grouped'] = 'Fidélité'

# update company_grouped = 'Mandalay' where company contains 'Mandalay'
df_movies.loc[df_movies['company'].str.contains('Mandalay', na=False), 'company_grouped'] = 'Mandalay'

# update company_grouped = 'Twentieth Century Fox' where company contains 'Twentieth Century Fox'
df_movies.loc[df_movies['company'].str.contains('Twentieth Century Fox', na=False), 'company_grouped'] = 'Twentieth Century Fox'

# update company_grouped = 'New Century' where company contains 'New Century'
df_movies.loc[df_movies['company'].str.contains('New Century', na=False), 'company_grouped'] = 'New Century'

# update company_grouped = 'In-Gear' where company contains 'In-Gear'
df_movies.loc[df_movies['company'].str.contains('In-Gear', na=False), 'company_grouped'] = 'In-Gear'

# update company_grouped = 'STX' where company contains 'STX'
df_movies.loc[df_movies['company'].str.contains('STX', na=False), 'company_grouped'] = 'STX'

# update company_grouped = 'Channel Four' where company contains 'Channel Four'
df_movies.loc[df_movies['company'].str.contains('Channel Four', na=False), 'company_grouped'] = 'Channel Four'

# update company_grouped = 'Columbia' where company contains 'Columbia'
df_movies.loc[df_movies['company'].str.contains('Columbia Films', na=False), 'company_grouped'] = 'Columbia Pictures'

# update company_grouped = 'Gaumont' where company contains 'Gaumont'
df_movies.loc[df_movies['company'].str.contains('Gaumont', na=False), 'company_grouped'] = 'Gaumont'

# update company_grouped = 'NBC' where company contains 'NBC'
df_movies.loc[df_movies['company'].str.contains('NBC Film', na=False), 'company_grouped'] = 'NBC Productions'

# update company_grouped = 'Panorama' where company contains 'Panorama'
df_movies.loc[df_movies['company'].str.contains('Panorama', na=False), 'company_grouped'] = 'Panorama'

# update company_grouped = 'BET' where company contains 'BET'
df_movies.loc[df_movies['company'].str.contains('BET Pictures', na=False), 'company_grouped'] = 'BET Films'

# update company_grouped = 'Imagine Entertainment' where company contains 'Imagine Films'
df_movies.loc[df_movies['company'].str.contains('Imagine Films', na=False), 'company_grouped'] = 'Imagine Entertainment'

# update company_grouped = 'New Visions' where company contains 'New Visions'
df_movies.loc[df_movies['company'].str.contains('New Visions', na=False), 'company_grouped'] = 'New Visions'

# update company_grouped = 'ERBP' where company contains 'erbp'
df_movies.loc[df_movies['company'].str.contains('erbp', na=False), 'company_grouped'] = 'ERBP'

similarity_check('company_grouped')


Checking field: company_grouped
Similar: 'C.A.T. Films' and 'R&R Films' with similarity score: 1.00
Similar: 'C.A.T. Films' and 'A&M Films' with similarity score: 1.00
Similar: 'C.A.T. Films' and 'L.A. Films' with similarity score: 1.00
Similar: 'C.A.T. Films' and 'H.I.T. Films' with similarity score: 1.00
Similar: 'C.A.T. Films' and 'C.L.G. Films' with similarity score: 1.00
Similar: 'C.A.T. Films' and '2.4.7. Films' with similarity score: 1.00
Similar: 'Rastar Pictures' and 'Rastar Films' with similarity score: 0.85
Similar: 'Rastar Pictures' and 'Rastar Productions' with similarity score: 0.86
Similar: 'Rastar Films' and 'Rastar Productions' with similarity score: 0.88
Similar: 'Broadway Productions' and 'Broadway Pictures' with similarity score: 0.86
Similar: 'Renaissance Pictures' and 'Renaissance Films' with similarity score: 0.86
Similar: 'R&R Films' and 'A&M Films' with similarity score: 1.00
Similar: 'R&R Films' and 'L.A. Films' with similarity score: 1.00
Similar: 'R&R Films

In [None]:
# to validate the production company changes, get the simliarity scores between company_grouped entries on each record






In [31]:
df_movies.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7658,7659,7660,7661,7662,7663,7664,7665,7666,7667
name,The Shining,The Blue Lagoon,Star Wars: Episode V - The Empire Strikes Back,Airplane!,Caddyshack,Friday the 13th,The Blues Brothers,Raging Bull,Superman II,The Long Riders,...,Black Wall Street Burning,I Am Fear,Aloha Surf Hotel,Love by Drowning,The Robinsons,More to Life,Dream Round,Saving Mbango,It's Just Us,Tee em el
rating,R,R,PG,PG,R,R,R,R,PG,R,...,R,Not Rated,,R,,,,,,
genre,Drama,Adventure,Action,Comedy,Comedy,Horror,Action,Biography,Action,Biography,...,Drama,Horror,Comedy,Drama,Action,Drama,Comedy,Drama,Drama,Horror
year,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,...,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020
released,"June 13, 1980 (United States)","July 2, 1980 (United States)","June 20, 1980 (United States)","July 2, 1980 (United States)","July 25, 1980 (United States)","May 9, 1980 (United States)","June 20, 1980 (United States)","December 19, 1980 (United States)","June 19, 1981 (United States)","May 16, 1980 (United States)",...,"February 7, 2020 (United States)","March 3, 2020 (United States)","November 5, 2020 (United States)","November 6, 2020 (United States)","November 10, 2020 (United States)","October 23, 2020 (United States)","February 7, 2020 (United States)","April 27, 2020 (Cameroon)","October 1, 2020 (United States)","August 19, 2020 (United States)"
score,8.4,5.8,8.7,7.7,7.3,6.4,7.9,8.2,6.8,7.0,...,6.6,3.4,7.1,,,3.1,4.7,5.7,,5.7
votes,927000.0,65000.0,1200000.0,221000.0,108000.0,123000.0,188000.0,330000.0,101000.0,10000.0,...,35.0,447.0,14.0,,,18.0,36.0,29.0,,7.0
director,Stanley Kubrick,Randal Kleiser,Irvin Kershner,Jim Abrahams,Harold Ramis,Sean S. Cunningham,John Landis,Martin Scorsese,Richard Lester,Walter Hill,...,Marcus Brown,Kevin Shulman,Stefan C. Schaefer,Justin Kreinbrink,Directors,Joseph Ebanks,Dusty Dukatz,Nkanya Nkwai,James Randall,Pereko Mosia
writer,Stephen King,Henry De Vere Stacpoole,Leigh Brackett,Jim Abrahams,Brian Doyle-Murray,Victor Miller,Dan Aykroyd,Jake LaMotta,Jerry Siegel,Bill Bryden,...,Dekoven Riggins,Kevin Shulman,Stefan C. Schaefer,C.E. Poverman,Aleks Alifirenko Jr.,Joseph Ebanks,Lisa Huston,Lynno Lovert,James Randall,Pereko Mosia
star,Jack Nicholson,Brooke Shields,Mark Hamill,Robert Hays,Chevy Chase,Betsy Palmer,John Belushi,Robert De Niro,Gene Hackman,David Carradine,...,Dan Belcher,Kristina Klebe,Augie Tulba,Nicky Whelan,Billy Hartmann,Shannon Bond,Michael Saquella,Onyama Laura,Christina Roz,Siyabonga Mabaso


In [45]:
import gender_guesser.detector as gender

gd = gender.Detector()

# create new field in df_movies called star_gender and populate it with the results of the gender guesser based on the star field in df_movies
df_movies['star_gender'] = df_movies['star'].apply(lambda x: gd.get_gender(x.split(' ')[0]) if pd.notnull(x) else 'unknown')








In [46]:
df_movies[df_movies['star_gender'] == 'unknown'][['star', 'country']].drop_duplicates()


Unnamed: 0,star,country
4,Chevy Chase,United States
11,N!xau,United States
44,Charlton Heston,United States
45,Cheech Marin,United States
65,Dom DeLuise,United States
...,...,...
7652,Zhi-zhong Huang,United States
7655,Guangtao Jiang,United States
7660,Augie Tulba,United States
7665,Onyama Laura,Cameroon


In [None]:
df_movies['star_gender'].value_counts()



star_gender
male             4647
female           1701
mostly_male       526
unknown           477
mostly_female     207
andy              110
Name: count, dtype: int64