# Project 2

## Imports

In [1]:
import os

import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
# Show up to 50 columns when a DataFrame is rendered
pd.options.display.max_columns = 50

In [2]:
# Ensure the local Data/ folder exists; exist_ok=True makes this a no-op
# (instead of an error) when the folder is already there.
os.makedirs('Data/', exist_ok=True)

In [3]:
# List the contents of Data/ to confirm the folder exists and that the
# expected input files are present before loading them.
os.listdir("Data/")

['.ipynb_checkpoints',
 'IMDB Movie Dataset Info.docx',
 'title-akas-us-only.csv',
 'title.basics-cleaned.tsv.gz',
 'title.basics.tsv.gz',
 'title.ratings-cleaned.tsv.gz',
 'title.ratings.tsv.gz']

## Loading files

## Title AKAs (US Only)

In [4]:
# Load the US-only alternate-titles (akas) table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-dtype warnings on read and
# matches how the basics and ratings files are loaded in this notebook.
akas = pd.read_csv('Data/title-akas-us-only.csv', low_memory=False)

  akas = pd.read_csv('Data/title-akas-us-only.csv')


In [5]:
# Preview the first five rows of the akas table
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


## Title Basics

In [6]:
# Load the raw title.basics table (tab-separated, gzip-compressed)
basics = pd.read_csv(
    'Data/title.basics.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [7]:
# Preview the first five rows of the raw basics table
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [8]:
# Summarise the raw basics table: row count, column names, dtypes and memory use
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10017011 entries, 0 to 10017010
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 687.8+ MB


In [9]:
# Show the dtype of every basics column (all are read in as object at this point)
basics.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [10]:
# Load the raw title.ratings table (tab-separated, gzip-compressed)
ratings = pd.read_csv(
    'Data/title.ratings.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [11]:
# Show the dtype of every ratings column
ratings.dtypes

tconst            object
averageRating    float64
numVotes           int64
dtype: object

### Preprocessing

#### Removing All Non-US Movies

In [12]:
# Keep only the basics rows whose tconst appears as a titleId in the
# US-only akas table.
us_title_ids = akas['titleId']
filter_us_titles = basics['tconst'].isin(us_title_ids)
basics = basics.loc[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


#### Handling Null Values

In [13]:
# The raw files use the literal string \N as a missing-value placeholder;
# convert every occurrence to a real NaN so pandas' null handling applies.
basics = basics.replace(to_replace={'\\N': np.nan})

In [14]:
# Drop any row that is missing runtimeMinutes or genres; nulls in other
# columns are left in place.
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(axis='index', how='any', subset=required_columns)

In [15]:
# Sanity check: the number of missing genres should now be 0
genres_missing = basics['genres'].isna()
genres_missing.sum()

0

In [16]:
# Sanity check: the number of missing runtimeMinutes should now be 0
runtime_missing = basics['runtimeMinutes'].isna()
runtime_missing.sum()

0

#### Removing Everything That Isn't a Movie

In [17]:
# Keep only rows whose titleType is exactly 'movie' (drops shorts, TV, etc.)
filter_title_type = basics['titleType'].eq('movie')
basics = basics.loc[filter_title_type]
# Sanity check: the count of non-movie rows should be 0
basics['titleType'].ne('movie').sum()

0

#### Changing Data Types

In [18]:
# Convert startYear from string to float (float, not int, because the column
# may still contain NaN). `assign` returns a new frame rather than writing
# into one that was produced by boolean filtering, which avoids pandas'
# SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(float))
# Confirm the data type of the column
basics['startYear'].dtype

dtype('float64')

#### Keeping only movies made between 2000 - 2022

In [19]:
# Boolean mask: True for movies whose startYear is in 2000-2022 (both ends inclusive)
filter_years = basics['startYear'].between(2000, 2022)
# Apply the mask
basics = basics.loc[filter_years]
# Sanity check: the count of rows outside the range should be 0
(basics['startYear'].lt(2000) | basics['startYear'].gt(2022)).sum()

0

#### Eliminate movies that include "Documentary" in genre:

In [20]:
# Boolean mask: True where the genres string contains 'Documentary'.
# regex=False treats the pattern as a plain substring, and na=False maps a
# missing genres value to False so the mask is always boolean and can be
# safely negated with ~ even if this cell runs before nulls are dropped.
filter_documentaries = basics['genres'].str.contains('Documentary', regex=False, na=False)
# Remove all documentaries
basics = basics[~filter_documentaries]
# Sanity check: the count of remaining documentaries should be 0
basics['genres'].str.contains('Documentary', regex=False, na=False).sum()

0

In [21]:
# Preview the first five rows of the cleaned basics table
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


#### Saving the final DF

In [22]:
# Write the cleaned basics table to the local Data/ folder
# (gzip compression is inferred from the ".gz" suffix).
# NOTE(review): despite ".tsv" in the file name, the data is written
# comma-separated and the index is saved as an unnamed first column —
# confirm that downstream readers expect this before changing it.
fpath_out = "Data/title.basics-cleaned.tsv.gz"
basics.to_csv(path_or_buf=fpath_out, sep=',', index=True)

## Title Ratings

In [23]:
# Read the raw ratings file so this section starts from the unfiltered table
ratings = pd.read_csv(
    'Data/title.ratings.tsv.gz',
    sep='\t',
    low_memory=False,
)

In [24]:
# Preview the first five rows of the ratings table
ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [27]:
# Keep only the ratings whose tconst survives in the cleaned basics table
kept_title_ids = basics['tconst']
filter_basics = ratings['tconst'].isin(kept_title_ids)
ratings = ratings.loc[filter_basics]
ratings

Unnamed: 0,tconst,averageRating,numVotes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846
...,...,...,...
1331411,tt9914942,6.6,178
1331437,tt9915872,6.4,9
1331450,tt9916170,7.0,7
1331451,tt9916190,3.7,243


#### Saving the DF 

In [29]:
# Write the filtered ratings table to the local Data/ folder
# (gzip compression is inferred from the ".gz" suffix).
# NOTE(review): despite ".tsv" in the file name, the data is written
# comma-separated and the index is saved as an unnamed first column —
# confirm that downstream readers expect this before changing it.
fpath_out = "Data/title.ratings-cleaned.tsv.gz"
ratings.to_csv(path_or_buf=fpath_out, sep=',', index=True)