In [1]:
import pandas as pd
import numpy as np

#### Dataset: Amazon Prime TV shows and titles.

- The goal of this project is exploratory data analysis to potentially derive interesting and valuable insights about Amazon TV shows and titles.
- Two (2) CSV files are used in this project; both come from Kaggle (link provided below).
- Source of data: https://www.kaggle.com/datasets/victorsoeiro/amazon-prime-tv-shows-and-movies?resource=download&select=titles.csv

In [2]:
# Read the credits CSV into a DataFrame.
# NOTE: this is an absolute, machine-specific Windows path; adjust it when running elsewhere.
tv_credits = pd.read_csv(filepath_or_buffer=r'C:\Users\LuisV\Desktop\UDACITY\Portfolio\Amazon Prime TV shows and titles\amazon_project\credits.csv')

In [3]:
# Preview the top of the credits table (head() returns 5 rows by default)
tv_credits.head()

Unnamed: 0,person_id,id,name,character,role
0,59401,ts20945,Joe Besser,Joe,ACTOR
1,31460,ts20945,Moe Howard,Moe,ACTOR
2,31461,ts20945,Larry Fine,Larry,ACTOR
3,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR
4,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR


In [4]:
# Display the dataframe dimensions as a (rows, columns) tuple
tv_credits.shape

(124235, 5)

In [5]:
# Work on an independent (deep) copy so the loaded tv_credits frame stays untouched
tv = tv_credits.copy(deep=True)

In [6]:
# Confirm the copy has the same (rows, columns) dimensions as tv_credits
tv.shape

(124235, 5)

In [7]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
tv.duplicated(keep='first').sum()

56

- There are 56 duplicate rows in the dataframe.

In [8]:
# Remove rows that are identical across every column, keeping the first occurrence of each
tv = tv.drop_duplicates(keep='first')

In [9]:
# Check the (rows, columns) dimensions after removing the duplicates;
# the row count should have dropped by the number of duplicates found above
tv.shape

(124179, 5)

- There are 124,179 rows left after removal of duplicate rows across all columns

In [10]:
# Verify that no fully duplicated rows remain in the credits data
tv.duplicated(keep='first').sum()

0

In [11]:
# Count missing values in each column of the credits data
tv.isna().sum()

person_id        0
id               0
name             0
character    16277
role             0
dtype: int64

- There are 16,277 missing values in the `character` column, while the other columns have no missing values.

In [12]:
# Fill missing values in the character column with an explicit 'Unknown' placeholder.
# A forward fill would copy the previous row's character onto a different person's
# credit, inventing data; fillna(method=...) is also deprecated in recent pandas.
tv = tv.fillna({'character': 'Unknown'})

In [13]:
# Re-count missing values per column to confirm the fill step left none behind
tv.isna().sum()

person_id    0
id           0
name         0
character    0
role         0
dtype: int64

- There are no null values left in the dataframe

In [14]:
# Read the titles CSV into a DataFrame.
# NOTE: this is an absolute, machine-specific Windows path; adjust it when running elsewhere.
titles = pd.read_csv(filepath_or_buffer=r'C:\Users\LuisV\Desktop\UDACITY\Portfolio\Amazon Prime TV shows and titles\amazon_project\titles.csv')

In [15]:
# Display the titles dataframe dimensions as a (rows, columns) tuple
titles.shape

(9871, 15)

In [16]:
# Preview the top of the titles table (head() returns 5 rows by default)
titles.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],,tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],,tt0032599,7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],,tt0042593,7.9,30924.0,8.273,7.6


In [17]:
# Print a summary of the dataframe: column names, non-null counts and dtypes.
# The non-null counts show at a glance which columns contain missing values.
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9871 entries, 0 to 9870
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    9871 non-null   object 
 1   title                 9871 non-null   object 
 2   type                  9871 non-null   object 
 3   description           9752 non-null   object 
 4   release_year          9871 non-null   int64  
 5   age_certification     3384 non-null   object 
 6   runtime               9871 non-null   int64  
 7   genres                9871 non-null   object 
 8   production_countries  9871 non-null   object 
 9   seasons               1357 non-null   float64
 10  imdb_id               9204 non-null   object 
 11  imdb_score            8850 non-null   float64
 12  imdb_votes            8840 non-null   float64
 13  tmdb_popularity       9324 non-null   float64
 14  tmdb_score            7789 non-null   float64
dtypes: float64(5), int64(

In [18]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
titles.duplicated(keep='first').sum()

3

- There are 3 duplicate rows in the dataframe

In [19]:
# Remove fully duplicated title rows, keeping the first occurrence of each
titles = titles.drop_duplicates(keep='first')

In [20]:
# Verify that no fully duplicated rows remain in the titles data
titles.duplicated(keep='first').sum()

0

- Duplicate rows have been removed

In [21]:
# Confirm the row count fell from 9,871 to 9,868 now that the 3 duplicate rows are gone
titles.shape

(9868, 15)

In [22]:
# Count missing values in each column of the titles data
titles.isna().sum()

id                         0
title                      0
type                       0
description              119
release_year               0
age_certification       6484
runtime                    0
genres                     0
production_countries       0
seasons                 8511
imdb_id                  667
imdb_score              1021
imdb_votes              1031
tmdb_popularity          547
tmdb_score              2080
dtype: int64

- For the seasons column, over 86% of the data is missing and therefore the column will be dropped.

In [54]:
# Work on an independent (deep) copy so the de-duplicated titles frame stays untouched
title = titles.copy(deep=True)

In [55]:
# Preview the top of the copied titles table (head() returns 5 rows by default)
title.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],,tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],,tt0032599,7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],,tt0042593,7.9,30924.0,8.273,7.6


In [43]:
# Confirm the copy has the same (rows, columns) dimensions as the titles dataframe
title.shape

(9868, 15)

In [44]:
# Build a new dataframe without the mostly-empty 'seasons' column; `title` itself is left unchanged
no_seasons = title.drop(columns='seasons')

In [45]:
# Preview the bottom of the table to confirm 'seasons' is gone (tail() returns 5 rows by default)
no_seasons.tail()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
9866,tm510327,Lily Is Here,MOVIE,Dallas and heroin have one thing in common: Du...,2021,,93,['drama'],['US'],tt7672388,5.3,20.0,1.406,
9867,tm1079144,Jay Nog: Something from Nothing,MOVIE,Something From Nothing takes you on a stand-up...,2021,,55,['comedy'],['US'],tt15041600,,,0.6,
9868,tm847725,Chasing,MOVIE,A cop from Chennai sets out to nab a dreaded d...,2021,,116,['crime'],['IN'],,,,1.96,
9869,tm1054116,Baikunth,MOVIE,"This story is about prevalent caste problem, e...",2021,,72,"['family', 'drama']",[],tt14331982,8.4,49.0,0.645,
9870,ts275838,Waking Up Eighty,SHOW,"Kara Stewart, 16, is fed up with just about ev...",2021,,10,['drama'],[],tt13542552,,,,


- The `age_certification` column contains 6,484 null values. That is ~66% of the rows, so the column will be dropped.

In [50]:
# Remove 'age_certification' (mostly missing) and 'imdb_id'. The dataframe already has
# an 'id' column identifying each title, so 'imdb_id' adds little for exploratory analysis.
no_age = no_seasons.drop(columns=['age_certification', 'imdb_id'])

In [51]:
# Preview the top of the reduced table (head() returns 5 rows by default)
no_age.head()

Unnamed: 0,id,title,type,description,release_year,runtime,genres,production_countries,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,171,"['romance', 'war', 'drama']",['US'],8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,92,"['comedy', 'drama', 'romance']",['US'],7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,94,"['thriller', 'drama', 'romance']",['US'],7.9,30924.0,8.273,7.6


In [52]:
# Count the missing values that remain in each column after dropping the sparse columns
no_age.isna().sum()

id                         0
title                      0
type                       0
description              119
release_year               0
runtime                    0
genres                     0
production_countries       0
imdb_score              1021
imdb_votes              1031
tmdb_popularity          547
tmdb_score              2080
dtype: int64

In [53]:
# Preview the bottom of the reduced table (tail() returns 5 rows by default)
no_age.tail()

Unnamed: 0,id,title,type,description,release_year,runtime,genres,production_countries,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
9866,tm510327,Lily Is Here,MOVIE,Dallas and heroin have one thing in common: Du...,2021,93,['drama'],['US'],5.3,20.0,1.406,
9867,tm1079144,Jay Nog: Something from Nothing,MOVIE,Something From Nothing takes you on a stand-up...,2021,55,['comedy'],['US'],,,0.6,
9868,tm847725,Chasing,MOVIE,A cop from Chennai sets out to nab a dreaded d...,2021,116,['crime'],['IN'],,,1.96,
9869,tm1054116,Baikunth,MOVIE,"This story is about prevalent caste problem, e...",2021,72,"['family', 'drama']",[],8.4,49.0,0.645,
9870,ts275838,Waking Up Eighty,SHOW,"Kara Stewart, 16, is fed up with just about ev...",2021,10,['drama'],[],,,,


In [56]:
# Display the (rows, columns) dimensions after dropping 'seasons', 'age_certification' and 'imdb_id'
no_age.shape

(9868, 12)

In [63]:
# Replace null values in 'tmdb_score', 'imdb_score' and 'imdb_votes' with each column's median.
# A forward fill would copy the previous row's values, i.e. an unrelated title's rating and
# vote count, and would leave a NaN behind if the first row were missing.
score_cols = ['tmdb_score', 'imdb_score', 'imdb_votes']
no_age[score_cols] = no_age[score_cols].fillna(no_age[score_cols].median())

In [65]:
# Re-count missing values per column after filling the score and vote columns
no_age.isna().sum()

id                        0
title                     0
type                      0
description             119
release_year              0
runtime                   0
genres                    0
production_countries      0
imdb_score                0
imdb_votes                0
tmdb_popularity         547
tmdb_score                0
dtype: int64

In [67]:
# Treat a missing 'tmdb_popularity' as zero popularity: overwrite the nulls with 0
no_age['tmdb_popularity'] = no_age['tmdb_popularity'].fillna(value=0)

In [69]:
# Final per-column count of missing values
no_age.isna().sum()

id                        0
title                     0
type                      0
description             119
release_year              0
runtime                   0
genres                    0
production_countries      0
imdb_score                0
imdb_votes                0
tmdb_popularity           0
tmdb_score                0
dtype: int64