# Part 1

In [59]:

# Your stakeholder only wants you to include information for movies based on the following specifications:

# Exclude any movie with missing values for genre or runtime
# Include only full-length movies (titleType = "movie").
# Include only fictional movies (not from documentary genre)
# Include only movies that were released 2000 - 2021 (include 2000 and 2021)
# Include only movies that were released in the United States

## Imports and Data

In [1]:
#imports

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [43]:
# getting urls

basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz' 

In [44]:
#storing urls

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
akas = pd.read_csv(titles_url, sep ='\t', low_memory = False)
ratings = pd.read_csv(ratings_url, sep = '\t', low_memory = False)


## Basics Data and Cleaning

In [4]:
# Exploring df

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
#Checking types

basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9223790 entries, 0 to 9223789
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 633.3+ MB


In [6]:
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [7]:
# data dictionary indicates nans replaced with /N, replacing with np.nan
basics.replace({'\\N':np.nan}, inplace = True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [10]:
#Dropping na columns
basics.dropna(inplace = True) 
basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [60]:
#checking for duplicate rows
basics.duplicated().sum()

0

In [65]:
is_movie = basics['titleType'].str.contains('movie', case = False)
basics = basics[is_movie]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
...,...,...,...,...,...,...,...,...,...
9223707,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
9223712,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,\N,66,Drama
9223719,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
9223730,tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,\N


In [27]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
25060,tt0025509,tvSeries,Les Misérables,Les misérables,0,1934,1934,279,Drama
37595,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0,1946,1955,15,Talk-Show
38429,tt0039120,tvSeries,Americana,Americana,0,1947,1949,30,"Family,Game-Show"
38430,tt0039121,tvSeries,Birthday Party,Birthday Party,0,1947,1949,30,Family
38432,tt0039123,tvSeries,Kraft Theatre,Kraft Television Theatre,0,1947,1958,60,Drama
...,...,...,...,...,...,...,...,...,...
9222963,tt9915022,tvSeries,Yarali Kuslar,Yarali Kuslar,0,2019,2019,90,Drama
9222996,tt9915114,tvSeries,Touche pas à mon sport,Touche pas à mon sport,0,2015,2016,65,"Sport,Talk-Show"
9223097,tt9915338,tvSeries,Aunty Donna: Camp Bush Camp!,Aunty Donna: Camp Bush Camp!,0,2018,2018,5,Comedy
9223303,tt9915822,tvSeries,Ichhapyaari Naagin,Ichhapyaari Naagin,0,2016,2017,20,Fantasy


In [40]:
# selecting movies between the years 2000 and 2021
#modern = basics['startYear','endYear'] > 2000
#modern

0

In [18]:
#checking info is consistent
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45329 entries, 25060 to 9223483
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          45329 non-null  object
 1   titleType       45329 non-null  object
 2   primaryTitle    45329 non-null  object
 3   originalTitle   45329 non-null  object
 4   isAdult         45329 non-null  object
 5   startYear       45329 non-null  object
 6   endYear         45329 non-null  object
 7   runtimeMinutes  45329 non-null  object
 8   genres          45329 non-null  object
dtypes: object(9)
memory usage: 3.5+ MB


## AKAS Data and Cleaning

In [30]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [31]:
# data dictionary indicates nans replaced with /N, replacing with np.nan
akas.replace({'\\N':np.nan}, inplace = True)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [32]:
akas.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1862372
language            6252768
types              27901944
attributes         32936783
isOriginalTitle        2187
dtype: int64

In [37]:
akas.dropna(inplace = True)
akas.isna().sum()

titleId            0
ordering           0
title              0
region             0
language           0
types              0
attributes         0
isOriginalTitle    0
dtype: int64

In [34]:
akas.duplicated().sum()

0

In [38]:
akas.nunique()

titleId            435
ordering            38
title              451
region               5
language             4
types                3
attributes          38
isOriginalTitle      1
dtype: int64

In [39]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 453 entries, 100637 to 32705032
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   titleId          453 non-null    object
 1   ordering         453 non-null    int64 
 2   title            453 non-null    object
 3   region           453 non-null    object
 4   language         453 non-null    object
 5   types            453 non-null    object
 6   attributes       453 non-null    object
 7   isOriginalTitle  453 non-null    object
dtypes: int64(1), object(7)
memory usage: 31.9+ KB


In [42]:
#showing only movies made in the use
is_US_Movie = akas['region'].str.contains('US',case=False)
akas = akas[is_US_Movie]
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
100637,tt0022542,1,Di shtime fun Yisroel,US,yi,alternative,YIVO translation,0
112531,tt0024265,4,Geleb un gelakht,US,yi,alternative,modern translation,0
115906,tt0024751,9,Avram Ovenu,US,yi,alternative,YIVO translation,0
124502,tt0026010,3,Der yidishe Kenigen Lir,US,yi,alternative,YIVO translation,0
137925,tt0027911,1,Libe un Laydnshaft,US,yi,alternative,modern translation,0
145511,tt0028902,4,Freylekhe kabtsonim,US,yi,alternative,YIVO translation,0
145984,tt0028957,4,Grine Felder,US,yi,alternative,YIVO translation,0
146438,tt0029013,3,Di heylige Shvue,US,yi,alternative,YIVO translation,0
146632,tt0029042,3,Ikh vil zayn a Mame,US,yi,alternative,YIVO translation,0
152212,tt0029765,3,Vu iz mayn Kind?,US,yi,alternative,YIVO translation,0


In [47]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

0          True
1          True
2          True
3          True
4          True
           ... 
9223784    True
9223785    True
9223786    True
9223787    True
9223788    True
Name: tconst, Length: 6530389, dtype: bool

## Ratings Data and Cleaning

In [48]:
# Getting overview of the data
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1713
3,tt0000004,5.6,169
4,tt0000005,6.2,2527


In [54]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260427 entries, 0 to 1260426
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1260427 non-null  object 
 1   averageRating  1260427 non-null  float64
 2   numVotes       1260427 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [50]:
# data dictionary indicates nans replaced with /N, replacing with np.nan
ratings.replace({'\\N':np.nan}, inplace = True)
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [53]:
# checking duplicates
ratings.duplicated().sum()

0

In [56]:
ratings.describe()

Unnamed: 0,averageRating,numVotes
count,1260427.0,1260427.0
mean,6.906897,1015.711
std,1.392228,17019.09
min,1.0,5.0
25%,6.2,11.0
50%,7.1,25.0
75%,7.8,96.0
max,10.0,2638622.0


In [55]:
# Filter the basics table down to only include the US by using the filter ratings dataframe
keepers2 =basics['tconst'].isin(ratings['tconst'])
keepers2



0           True
1           True
2           True
3           True
4           True
           ...  
9223784    False
9223785    False
9223786    False
9223787    False
9223788    False
Name: tconst, Length: 6530389, dtype: bool

## Saving Data as csv


In [58]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv('Data/title.akas.csv.gz',compression = 'gzip', index=False)
ratings.to_csv('Data/title.ratings.csv.gz', compression = 'gzip', index=False)