In [1]:
# Import dependancies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read in raw csv
df = pd.read_csv('Data/unprocessed_data.csv')

# Check to make sure it imported
df.head(1)

Unnamed: 0,id,Title,Movie Link,Year,Duration,MPA,Rating,Votes,budget,grossWorldWide,...,writers,stars,genres,countries_origin,filming_locations,production_companies,Languages,wins,nominations,oscars
0,tt0073195,Jaws,https://www.imdb.com/title/tt0073195,1975,2h 4m,PG,8.1,683K,7000000.0,477220580.0,...,"['Peter Benchley', 'Carl Gottlieb']","['Roy Scheider', 'Robert Shaw', 'Richard Dreyf...","['Monster Horror', 'Sea Adventure', 'Survival'...",['United States'],"[""Water Street, Edgartown, Martha's Vineyard, ...","['Zanuck/Brown Productions', 'Universal Pictur...",['English'],0,20,0


In [3]:
# Check Data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33600 entries, 0 to 33599
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     33600 non-null  object 
 1   Title                  33600 non-null  object 
 2   Movie Link             33600 non-null  object 
 3   Year                   33600 non-null  int64  
 4   Duration               33379 non-null  object 
 5   MPA                    25624 non-null  object 
 6   Rating                 33462 non-null  float64
 7   Votes                  33462 non-null  object 
 8   budget                 11815 non-null  float64
 9   grossWorldWide         18222 non-null  float64
 10  gross_US_Canada        17571 non-null  float64
 11  opening_weekend_Gross  15523 non-null  float64
 12  directors              33241 non-null  object 
 13  writers                32024 non-null  object 
 14  stars                  33127 non-null  object 
 15  ge

In [4]:
# Remove unnecessary columns from df
df_step_1 = df[[
    'id',
    'Title',
    'Year',
    'Rating',
    'Votes',
    'budget',
    'grossWorldWide',
    'gross_US_Canada',
    'opening_weekend_Gross',
    'genres',
    'production_companies',
    'nominations',
    'oscars'
]]

df_step_1.head()

Unnamed: 0,id,Title,Year,Rating,Votes,budget,grossWorldWide,gross_US_Canada,opening_weekend_Gross,genres,production_companies,nominations,oscars
0,tt0073195,Jaws,1975,8.1,683K,7000000.0,477220580.0,266567580.0,7061513.0,"['Monster Horror', 'Sea Adventure', 'Survival'...","['Zanuck/Brown Productions', 'Universal Pictur...",20,0
1,tt0073629,The Rocky Horror Picture Show,1975,7.4,173K,1200000.0,115798478.0,112892319.0,,"['Dark Comedy', 'Raunchy Comedy', 'Rock Musica...","['Twentieth Century Fox', 'Michael White Produ...",4,0
2,tt0073486,One Flew Over the Cuckoo's Nest,1975,8.7,1.1M,3000000.0,109115366.0,108981275.0,,"['Medical Drama', 'Psychological Drama', 'Drama']","['Fantasy Films', 'N.V. Zvaluw']",15,0
3,tt0072890,Dog Day Afternoon,1975,8.0,279K,1800000.0,50002721.0,50000000.0,,"['Heist', 'True Crime', 'Biography', 'Crime', ...","['Warner Bros.', 'Artists Entertainment Complex']",20,0
4,tt0073692,Shampoo,1975,6.4,15K,4000000.0,49407734.0,49407734.0,,"['Satire', 'Comedy', 'Drama']","['Persky-Bright / Vista', 'Columbia Pictures',...",11,0


In [5]:
# Update names for clarity
df_step_2 = df_step_1.rename(columns= {
    'Title': 'title',
    'Year': 'year',
    'Rating': 'rating',
    'Votes': 'votes',
    'grossWorldWide': 'gross_world_wide',
    'gross_US_Canada': 'gross_us_canada',
    'opening_weekend_Gross': 'opening_weekend_gross'
})

In [6]:
# Filter dataset for all movies after the year 2005
df_step_3 = df_step_2[df_step_2['year'] >= 2005].reset_index()
df_step_3.head()

Unnamed: 0,index,id,title,year,rating,votes,budget,gross_world_wide,gross_us_canada,opening_weekend_gross,genres,production_companies,nominations,oscars
0,2000,tt1502397,Bad Boys for Life,2020,6.5,187K,90000000.0,426505244.0,206305244.0,62504105.0,"['Buddy Cop', 'Cop Drama', 'Action', 'Comedy',...","['Columbia Pictures', '2.0 Entertainment', 'Do...",15,0
1,2001,tt8332922,A Quiet Place Part II,2020,7.2,296K,61000000.0,297372261.0,160072261.0,47547231.0,"['Alien Invasion', 'Monster Horror', 'Supernat...","['Paramount Pictures', 'Platinum Dunes', 'Sund...",44,1
2,2002,tt3794354,Sonic the Hedgehog,2020,6.5,171K,85000000.0,319715683.0,148974665.0,58018348.0,"['Animal Adventure', 'Buddy Comedy', 'Road Tri...","['Paramount Pictures', 'Sega Sammy Group', 'Or...",12,0
3,2003,tt7713068,Birds of Prey and the Fantabulous Emancipation...,2020,6.1,271K,84500000.0,205537933.0,84172791.0,33010017.0,"['Dark Comedy', 'Superhero', 'Action', 'Comedy...","['Clubhouse Pictures (II)', 'DC Entertainment'...",83,0
4,2004,tt6673612,Dolittle,2020,5.6,73K,175000000.0,251410631.0,77047065.0,21844045.0,"['Animal Adventure', 'Quest', 'Adventure', 'Co...","['Universal Pictures', 'Perfect World Pictures...",9,0


In [7]:
# Drop all rows with missing vlaues
cleaned_df = df_step_3.dropna()
cleaned_df.head()

Unnamed: 0,index,id,title,year,rating,votes,budget,gross_world_wide,gross_us_canada,opening_weekend_gross,genres,production_companies,nominations,oscars
0,2000,tt1502397,Bad Boys for Life,2020,6.5,187K,90000000.0,426505244.0,206305244.0,62504105.0,"['Buddy Cop', 'Cop Drama', 'Action', 'Comedy',...","['Columbia Pictures', '2.0 Entertainment', 'Do...",15,0
1,2001,tt8332922,A Quiet Place Part II,2020,7.2,296K,61000000.0,297372261.0,160072261.0,47547231.0,"['Alien Invasion', 'Monster Horror', 'Supernat...","['Paramount Pictures', 'Platinum Dunes', 'Sund...",44,1
2,2002,tt3794354,Sonic the Hedgehog,2020,6.5,171K,85000000.0,319715683.0,148974665.0,58018348.0,"['Animal Adventure', 'Buddy Comedy', 'Road Tri...","['Paramount Pictures', 'Sega Sammy Group', 'Or...",12,0
3,2003,tt7713068,Birds of Prey and the Fantabulous Emancipation...,2020,6.1,271K,84500000.0,205537933.0,84172791.0,33010017.0,"['Dark Comedy', 'Superhero', 'Action', 'Comedy...","['Clubhouse Pictures (II)', 'DC Entertainment'...",83,0
4,2004,tt6673612,Dolittle,2020,5.6,73K,175000000.0,251410631.0,77047065.0,21844045.0,"['Animal Adventure', 'Quest', 'Adventure', 'Co...","['Universal Pictures', 'Perfect World Pictures...",9,0


In [8]:
# Check the data
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4378 entries, 0 to 10687
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  4378 non-null   int64  
 1   id                     4378 non-null   object 
 2   title                  4378 non-null   object 
 3   year                   4378 non-null   int64  
 4   rating                 4378 non-null   float64
 5   votes                  4378 non-null   object 
 6   budget                 4378 non-null   float64
 7   gross_world_wide       4378 non-null   float64
 8   gross_us_canada        4378 non-null   float64
 9   opening_weekend_gross  4378 non-null   float64
 10  genres                 4378 non-null   object 
 11  production_companies   4378 non-null   object 
 12  nominations            4378 non-null   int64  
 13  oscars                 4378 non-null   int64  
dtypes: float64(5), int64(4), object(5)
memory usage: 513.0+ KB


In [11]:
cleaned_df.to_csv('Data/processed_data.csv', index=False)