# Initial Movie Analysis by Wesley Giles

## Load modules and environment variables

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
import os
from dotenv import load_dotenv
# Load environment variables (per the section heading) before any data is read.
load_dotenv()

True

## Load the data

In [2]:
# Read the title-basics table straight from its URL (tab-separated file).
BASICS_URL = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(BASICS_URL, sep="\t", low_memory=False)

# Cells holding the literal string "\N" are treated as missing: swap them for NaN.
basics.replace(to_replace="\\N", value=np.nan, inplace=True)

# Summarise columns, dtypes and memory footprint.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9163395 entries, 0 to 9163394
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 629.2+ MB


In [3]:
# Read the title-ratings table straight from its URL (tab-separated file).
RATINGS_URL = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(RATINGS_URL, sep="\t", low_memory=False)

# Summarise columns, dtypes and memory footprint.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1256195 entries, 0 to 1256194
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1256195 non-null  object 
 1   averageRating  1256195 non-null  float64
 2   numVotes       1256195 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [4]:
# Read the alternate-titles (akas) table straight from its URL (tab-separated file).
AKAS_URL = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(AKAS_URL, sep="\t", low_memory=False)

# Cells holding the literal string "\N" are treated as missing: swap them for NaN.
akas.replace(to_replace="\\N", value=np.nan, inplace=True)

# Summarise columns, dtypes and memory footprint.
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32903294 entries, 0 to 32903293
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


## Now to start the data cleaning according to the instructions

### We will start with the title basics data

#### Remove all the null runtime movies

In [5]:
# Keep only titles with a known runtime. dropna(subset=...) removes the rows
# in place without first building a filtered copy of the frame just to read
# its index.
basics.dropna(subset=["runtimeMinutes"], inplace=True)

# Sanity check: number of missing runtimes still present (should be 0).
basics["runtimeMinutes"].isna().sum()

0

In [6]:
# Re-inspect the frame to see how many rows survived the runtime filter.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2462756 entries, 0 to 9163394
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 187.9+ MB


#### That filtered out almost 75% of our data. Now let's get only the movies

In [7]:
# List the distinct title types to find the label that identifies movies.
basics["titleType"].unique()

array(['short', 'movie', 'tvEpisode', 'tvSeries', 'tvShort', 'tvMovie',
       'tvMiniSeries', 'video', 'tvSpecial', 'videoGame'], dtype=object)

In [8]:
# Flag every row whose title type is anything other than "movie" and drop it.
not_movie = basics["titleType"].ne("movie")
basics.drop(index=basics.index[not_movie], inplace=True)

# Confirm which title types remain.
basics["titleType"].unique()


array(['movie'], dtype=object)

In [9]:
# Check row count and per-column non-null counts after keeping only movies.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 390693 entries, 8 to 9163345
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          390693 non-null  object
 1   titleType       390693 non-null  object
 2   primaryTitle    390693 non-null  object
 3   originalTitle   390693 non-null  object
 4   isAdult         390693 non-null  object
 5   startYear       384735 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  390693 non-null  object
 8   genres          366695 non-null  object
dtypes: object(9)
memory usage: 29.8+ MB


In [10]:
# Peek at the distinct genre values; multi-genre titles show up as a single
# comma-separated string, and missing genres as NaN.
basics["genres"].unique()

array(['Romance', nan, 'Action,Adventure,Biography', ...,
       'Music,Musical,Reality-TV', 'Action,Crime,Short',
       'Crime,Fantasy,Sci-Fi'], dtype=object)

In [11]:
# Number of distinct genre values (NaN counts as one of them).
basics["genres"].unique().size

1381

In [12]:
# Remove titles with no genre information. dropna(subset=...) is the built-in
# for "drop rows where this column is missing" and avoids slicing the frame
# just to recover the index labels to drop.
basics.dropna(subset=["genres"], inplace=True)

# Check the remaining row count and non-null totals.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 366695 entries, 8 to 9163345
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          366695 non-null  object
 1   titleType       366695 non-null  object
 2   primaryTitle    366695 non-null  object
 3   originalTitle   366695 non-null  object
 4   isAdult         366695 non-null  object
 5   startYear       360866 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  366695 non-null  object
 8   genres          366695 non-null  object
dtypes: object(9)
memory usage: 28.0+ MB


In [13]:
# Flag titles whose genre string mentions "Documentary" and drop them.
is_documentary = basics["genres"].str.contains("Documentary")
basics.drop(index=basics.index[is_documentary], inplace=True)

# Check the remaining row count and non-null totals.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 279298 entries, 8 to 9163244
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          279298 non-null  object
 1   titleType       279298 non-null  object
 2   primaryTitle    279298 non-null  object
 3   originalTitle   279298 non-null  object
 4   isAdult         279298 non-null  object
 5   startYear       275002 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  279298 non-null  object
 8   genres          279298 non-null  object
dtypes: object(9)
memory usage: 21.3+ MB


In [14]:
# Number of distinct genre values left once documentaries are removed.
basics["genres"].unique().size

1153

#### Only a couple more filters until we have our final dataset. Now to get only modern movies (made between 2000 and 2021, inclusive)

In [15]:
# Inspect the distinct startYear values (held as strings, with NaN for
# missing years) before filtering by year.
basics["startYear"].unique()

array(['1894', '1906', '1907', '1908', '1910', '1909', '1911', '1913',
       '1912', '1919', '1914', '1915', '1936', '1916', '1917', '1925',
       '1918', '1920', '1921', '1924', '1923', '1922', '1927', '1929',
       '1926', '1993', '1935', '1928', '1942', '1930', '1932', '1931',
       '1937', '1933', '1950', '1938', '1951', '1939', '1934', '1946',
       '1940', '1944', '1947', '1941', '1952', '1970', '1957', '1943',
       '1948', '2001', '1945', '1953', '1954', '1983', '1949', '1973',
       '1961', '1955', '1962', '1958', '1956', '1977', '1964', '1960',
       '1959', '1967', '1965', '1968', '1963', '1971', '1969', '1966',
       '1976', '1990', '1972', '1979', '1981', '2020', '1975', '1978',
       '1989', '1974', '1986', '1987', '1980', '1985', '2018', '1984',
       '1982', '1991', nan, '1988', '2005', '1994', '2004', '2016',
       '1995', '1992', '1998', '2002', '1996', '2017', '1997', '2000',
       '1999', '2006', '2008', '2009', '2007', '2003', '2022', '2012',
       '2

In [16]:
# Remove titles with no start year so the column can be cast to int below.
# dropna(subset=...) does this directly instead of slicing the frame to
# obtain the index labels to drop.
basics.dropna(subset=["startYear"], inplace=True)

# Summary statistics of the year column, cast to int for display only
# (the stored column itself is left unchanged).
basics[["startYear"]].astype(int).describe()

Unnamed: 0,startYear
count,275002.0
mean,1990.868292
std,28.217736
min,1894.0
25%,1972.0
50%,2001.0
75%,2014.0
max,2027.0


In [17]:
# Keep only titles whose start year lies in the 2000-2021 window (both ends included).
FIRST_YEAR, LAST_YEAR = 2000, 2021
start_year = basics["startYear"].astype(int)
outside_window = ~start_year.between(FIRST_YEAR, LAST_YEAR)
basics.drop(index=basics.index[outside_window], inplace=True)

# Check the remaining row count and non-null totals.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136695 entries, 34790 to 9163244
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          136695 non-null  object
 1   titleType       136695 non-null  object
 2   primaryTitle    136695 non-null  object
 3   originalTitle   136695 non-null  object
 4   isAdult         136695 non-null  object
 5   startYear       136695 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  136695 non-null  object
 8   genres          136695 non-null  object
dtypes: object(9)
memory usage: 10.4+ MB


In [18]:
# Check the range of the remaining start years after the year filter
# (cast to int for display only).
basics[["startYear"]].astype(int).describe()

Unnamed: 0,startYear
count,136695.0
mean,2012.74506
std,5.732133
min,2000.0
25%,2009.0
50%,2014.0
75%,2018.0
max,2021.0


#### Finally let's limit the movies to those released in the United States

In [19]:
# List the distinct region codes to see how the United States is encoded.
akas["region"].unique()

array(['UA', 'DE', 'HU', 'GR', 'RU', 'US', nan, 'JP', 'FR', 'RO', 'GB',
       'CA', 'PT', 'ES', 'FI', 'PL', 'AR', 'RS', 'UY', 'IT', 'BR', 'DK',
       'TR', 'XWW', 'XEU', 'SK', 'CZ', 'SE', 'NZ', 'MX', 'NO', 'XYU',
       'AT', 'VE', 'CSHH', 'SI', 'IN', 'AU', 'TW', 'LT', 'NL', 'CO', 'IR',
       'BG', 'SG', 'BE', 'SUHH', 'HR', 'DZ', 'CH', 'BF', 'PH', 'XWG',
       'VN', 'CN', 'XSA', 'EE', 'IS', 'PR', 'DDDE', 'HK', 'XKO', 'CL',
       'IE', 'JM', 'PE', 'EG', 'GE', 'BY', 'BA', 'AE', 'PA', 'TJ', 'XSI',
       'TH', 'YUCS', 'ZA', 'MY', 'IL', 'LV', 'PK', 'KR', 'BD', 'ID', 'CU',
       'AL', 'BO', 'XAS', 'CR', 'PY', 'DO', 'GT', 'EC', 'SV', 'UZ',
       'BUMM', 'XPI', 'BJ', 'AZ', 'NG', 'CM', 'MA', 'GL', 'MN', 'LI',
       'LU', 'MZ', 'BM', 'KZ', 'MD', 'LB', 'IQ', 'TM', 'MK', 'TN', 'HT',
       'AM', 'LK', 'ME', 'CG', 'CI', 'SY', 'NP', 'QA', 'TO', 'SN', 'GH',
       'JO', 'KG', 'NE', 'GN', 'VDVN', 'TD', 'SO', 'SD', 'MC', 'TT', 'GA',
       'BS', 'LY', 'AO', 'KH', 'MR', 'AF', 'MG', 'ML', 'GY', 

In [20]:
# Title IDs of every akas row whose region is "US"
# (a title appears once per US row, so IDs can repeat).
is_us_release = akas["region"] == "US"
us_title_ids = akas.loc[is_us_release, "titleId"]
us_title_ids

5           tt0000001
14          tt0000002
33          tt0000005
36          tt0000005
41          tt0000005
              ...    
32902966    tt9916702
32903003    tt9916720
32903005    tt9916720
32903022    tt9916756
32903038    tt9916764
Name: titleId, Length: 1343583, dtype: object

In [21]:
# Drop every title that has no US entry in the akas table.
released_in_us = basics["tconst"].isin(us_title_ids)
basics.drop(index=basics.index[~released_in_us], inplace=True)

# Check the remaining row count and non-null totals.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79584 entries, 34790 to 9163160
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          79584 non-null  object
 1   titleType       79584 non-null  object
 2   primaryTitle    79584 non-null  object
 3   originalTitle   79584 non-null  object
 4   isAdult         79584 non-null  object
 5   startYear       79584 non-null  object
 6   endYear         0 non-null      object
 7   runtimeMinutes  79584 non-null  object
 8   genres          79584 non-null  object
dtypes: object(9)
memory usage: 6.1+ MB


### Looks like all the filters have been applied and there are 79584 titles that fit our criteria. Let's filter down the ratings and akas in order to get rid of unnecessary data

In [22]:
# Keep only akas rows that belong to a title still present in `basics`.
# The row labels are selected from akas.index with the boolean mask rather
# than by slicing the frame, so no copy of the to-be-dropped rows (most of
# this table) is built just to read their index.
in_basics = akas["titleId"].isin(basics["tconst"])
akas.drop(index=akas.index[~in_basics], inplace=True)

# Check the remaining row count and non-null totals.
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 703835 entries, 194989 to 32902553
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   titleId          703835 non-null  object
 1   ordering         703835 non-null  int64 
 2   title            703835 non-null  object
 3   region           617839 non-null  object
 4   language         154608 non-null  object
 5   types            644368 non-null  object
 6   attributes       23837 non-null   object
 7   isOriginalTitle  703835 non-null  object
dtypes: int64(1), object(7)
memory usage: 48.3+ MB


In [23]:
# Keep only ratings for titles still present in `basics`.
has_title = ratings["tconst"].isin(basics["tconst"])
ratings.drop(index=ratings.index[~has_title], inplace=True)

# Check the remaining row count and non-null totals.
ratings.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 65949 entries, 17858 to 1256173
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         65949 non-null  object 
 1   averageRating  65949 non-null  float64
 2   numVotes       65949 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.0+ MB


### Now to just save the data to compressed CSVs

In [24]:
# Write the three filtered tables as gzip-compressed CSVs under data/,
# creating the directory if it does not exist yet. index=False keeps the
# leftover row labels out of the files.
os.makedirs("data/", exist_ok=True)
basics.to_csv("data/title_basics.csv.gz", compression="gzip", index=False)
akas.to_csv("data/title_akas.csv.gz", compression="gzip", index=False)
ratings.to_csv("data/title_ratings.csv.gz", compression="gzip", index=False)

# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run and platform.
sorted(os.listdir("data/"))

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']