# Part 1


* Your stakeholder only wants you to include information for movies based on the following specifications:

* Exclude any movie with missing values for genre or runtime

* Include only full-length movies (titleType = "movie").

* Include only fictional movies (not from documentary genre)

* Include only movies that were released 2000 - 2021 (include 2000 and 2021)

* Include only movies that were released in the United States

## Imports and Data

In [1]:
#imports

import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# getting urls

basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz' 

In [3]:
#storing urls

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep ='\t', low_memory = False)
ratings = pd.read_csv(ratings_url, sep = '\t', low_memory = False)


## Basics Data and Cleaning

In [4]:
# Exploring df

basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
#Checking types

basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9253132 entries, 0 to 9253131
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 635.4+ MB


In [6]:
# data dictionary indicates nans replaced with /N, replacing with np.nan
basics.replace({'\\N':np.nan}, inplace = True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [7]:
# checking missing values
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1234419
endYear           9156588
runtimeMinutes    6775044
genres             427815
dtype: int64

In [8]:
# almost all of endYear is missing so i;m going to drop it
basics = basics.drop(columns = 'endYear')
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...
9253127,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,"Action,Drama,Family"
9253128,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,"Action,Drama,Family"
9253129,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,"Action,Drama,Family"
9253130,tt9916856,short,The Wind,The Wind,0,2015,27,Short


In [9]:
#Dropping na columns
basics.dropna(inplace = True) 
basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
dtype: int64

In [10]:
#checking for duplicate rows
basics.duplicated().sum()

0

In [11]:
# looking for movies
is_movie = basics['titleType'] == 'movie'
is_movie

0          False
1          False
2          False
3          False
4          False
           ...  
9253082     True
9253088    False
9253123    False
9253130    False
9253131    False
Name: titleType, Length: 2374166, dtype: bool

In [12]:
#Changing start year dtype from object to int 
basics['startYear'] = basics['startYear'].astype(int)
#filtering out movies earlier than 2000
basics = basics.loc[(basics['startYear'] >=2000) & (basics['startYear'] <=2022) & is_movie]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
13079,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,133,Documentary
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
66309,tt0067683,movie,Workers '71: Nothing About Us Without Us,Robotnicy 1971 - Nic o nas bez nas,0,2006,47,Documentary
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
...,...,...,...,...,...,...,...,...
9252897,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"
9252981,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,123,Drama
9253022,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary
9253049,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary


In [13]:
# get data type of column 'startYear'
dataType = basics.dtypes['startYear']
print('Data type of each column startYear in the Dataframe :')
print(dataType)

Data type of each column startYear in the Dataframe :
int32


In [14]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
77934,tt0079644,movie,November 1828,November 1828,0,2001,140,"Drama,War"
86771,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...
9252804,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9252813,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"
9252852,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller
9252897,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"


In [15]:
#checking info is consistent
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143291 entries, 34792 to 9252981
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          143291 non-null  object
 1   titleType       143291 non-null  object
 2   primaryTitle    143291 non-null  object
 3   originalTitle   143291 non-null  object
 4   isAdult         143291 non-null  object
 5   startYear       143291 non-null  int32 
 6   runtimeMinutes  143291 non-null  object
 7   genres          143291 non-null  object
dtypes: int32(1), object(7)
memory usage: 9.3+ MB


## AKAS Data and Cleaning

In [16]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [17]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33301661 entries, 0 to 33301660
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


In [18]:
# data dictionary indicates nans replaced with /N, replacing with np.nan
akas.replace({'\\N':np.nan}, inplace = True)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [19]:
akas.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1863293
language            6271740
types              28005451
attributes         33054295
isOriginalTitle        2187
dtype: int64

In [20]:
# dropping remainder of nans
akas.dropna(inplace = True)
akas.isna().sum()

titleId            0
ordering           0
title              0
region             0
language           0
types              0
attributes         0
isOriginalTitle    0
dtype: int64

In [21]:
#checking for duplicates
akas.duplicated().sum()

0

In [22]:
# checking unique values
akas.nunique()

titleId            435
ordering            38
title              451
region               5
language             4
types                3
attributes          38
isOriginalTitle      1
dtype: int64

In [23]:
#showing only movies made in the usa
is_US_Movie = akas['region'].str.contains('US',case=False)
akas = akas[is_US_Movie]
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
100776,tt0022542,1,Di shtime fun Yisroel,US,yi,alternative,YIVO translation,0
112683,tt0024265,4,Geleb un gelakht,US,yi,alternative,modern translation,0
116064,tt0024751,9,Avram Ovenu,US,yi,alternative,YIVO translation,0
124665,tt0026010,3,Der yidishe Kenigen Lir,US,yi,alternative,YIVO translation,0
138101,tt0027911,1,Libe un Laydnshaft,US,yi,alternative,modern translation,0
145702,tt0028902,4,Freylekhe kabtsonim,US,yi,alternative,YIVO translation,0
146176,tt0028957,4,Grine Felder,US,yi,alternative,YIVO translation,0
146631,tt0029013,3,Di heylige Shvue,US,yi,alternative,YIVO translation,0
146825,tt0029042,3,Ikh vil zayn a Mame,US,yi,alternative,YIVO translation,0
152411,tt0029765,3,Vu iz mayn Kind?,US,yi,alternative,YIVO translation,0


In [24]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
basics



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
77934,tt0079644,movie,November 1828,November 1828,0,2001,140,"Drama,War"
86771,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...
9252804,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9252813,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"
9252852,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller
9252897,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"


In [25]:
#checking data is consistent
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22 entries, 100776 to 1027479
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   titleId          22 non-null     object
 1   ordering         22 non-null     int64 
 2   title            22 non-null     object
 3   region           22 non-null     object
 4   language         22 non-null     object
 5   types            22 non-null     object
 6   attributes       22 non-null     object
 7   isOriginalTitle  22 non-null     object
dtypes: int64(1), object(7)
memory usage: 1.5+ KB


## Ratings Data and Cleaning

In [26]:
# Getting overview of the data
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1913
1,tt0000002,5.8,258
2,tt0000003,6.5,1717
3,tt0000004,5.6,170
4,tt0000005,6.2,2533


In [27]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261565 entries, 0 to 1261564
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1261565 non-null  object 
 1   averageRating  1261565 non-null  float64
 2   numVotes       1261565 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.9+ MB


In [28]:
# data dictionary indicates nans replaced with /N, replacing with np.nan
ratings.replace({'\\N':np.nan}, inplace = True)
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [29]:
# checking duplicates
ratings.duplicated().sum()

0

In [30]:
# Filter the basics table down to only include the US by using the filter ratings dataframe
keepers2 =basics['tconst'].isin(ratings['tconst'])
basics





Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
77934,tt0079644,movie,November 1828,November 1828,0,2001,140,"Drama,War"
86771,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...
9252804,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9252813,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"
9252852,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller
9252897,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"


## Saving Data as csv


In [31]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
akas.to_csv('Data/title.akas.csv.gz',compression = 'gzip', index=False)
ratings.to_csv('Data/title.ratings.csv.gz', compression = 'gzip', index=False)