In [2]:
import pandas as pd
import numpy as np

#### Dataset: Amazon Prime TV shows and titles.

- The goal of this project is exploratory data analysis to potentially derive interesting and valuable insights about Amazon TV shows and titles.
- Two (2) CSV files are available for use in this project; both are taken from Kaggle (link provided below).
- Source of data: https://www.kaggle.com/datasets/victorsoeiro/amazon-prime-tv-shows-and-movies?resource=download&select=titles.csv

In [3]:
# Load the credits CSV (one row per person credited on a title) into a dataframe
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere;
# consider a path relative to the notebook or a single configurable data directory.
tv_credits = pd.read_csv(r'C:\Users\LuisV\Desktop\UDACITY\Portfolio\Amazon Prime TV shows and titles\amazon_project\credits.csv')

In [4]:
# Display the first 5 rows of the credits dataframe as a quick check of the load
tv_credits.head(5)

Unnamed: 0,person_id,id,name,character,role
0,59401,ts20945,Joe Besser,Joe,ACTOR
1,31460,ts20945,Moe Howard,Moe,ACTOR
2,31461,ts20945,Larry Fine,Larry,ACTOR
3,21174,tm19248,Buster Keaton,Johnny Gray,ACTOR
4,28713,tm19248,Marion Mack,Annabelle Lee,ACTOR


In [5]:
# Display the (rows, columns) shape of the raw credits dataframe
tv_credits.shape

(124235, 5)

In [6]:
# Work on an independent deep copy so the raw credits dataframe stays untouched
tv = tv_credits.copy(deep=True)

In [7]:
# Confirm the working copy has the same (rows, columns) shape as the original
tv.shape

(124235, 5)

In [8]:
# Count fully duplicated rows (rows identical to an earlier row in every column)
tv.duplicated().sum()

56

- There are 56 duplicate rows in the dataframe

In [9]:
# Remove rows that repeat an earlier row in every column, keeping the first occurrence
tv = tv.drop_duplicates(keep='first')

In [10]:
# Check the (rows, columns) shape after removing the duplicates
tv.shape

(124179, 5)

- There are 124,179 rows left after removal of duplicate rows across all columns

In [11]:
# Verify that no fully duplicated rows remain after the drop
tv.duplicated().sum()

0

In [12]:
# Count the missing values in each column of the credits dataframe
tv.isna().sum()

person_id        0
id               0
name             0
character    16277
role             0
dtype: int64

- There are 16,277 missing values in the character column, while the other columns have no missing values.

In [13]:
# Fill missing values in the character column with an explicit placeholder.
# A forward fill would copy the previous row's character onto an unrelated person,
# inventing data that was never recorded; `fillna(method=...)` is also deprecated
# in pandas >= 2.1. Only the character column is touched.
tv = tv.fillna({'character': 'Unknown'})

In [14]:
# Re-count the missing values per column to confirm none are left
tv.isna().sum()

person_id    0
id           0
name         0
character    0
role         0
dtype: int64

- There are no null values left in the dataframe

In [15]:
# Load the titles CSV (one row per show or movie) into a dataframe
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere;
# consider a path relative to the notebook or a single configurable data directory.
titles = pd.read_csv(r'C:\Users\LuisV\Desktop\UDACITY\Portfolio\Amazon Prime TV shows and titles\amazon_project\titles.csv')

In [17]:
# Display the (rows, columns) shape of the raw titles dataframe
titles.shape

(9871, 15)

In [16]:
# Display the first 5 rows of the titles dataframe
titles.head(5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],,tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],,tt0032599,7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],,tt0042593,7.9,30924.0,8.273,7.6


In [22]:
# Print a summary of the titles dataframe: column names, non-null counts and dtypes
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9871 entries, 0 to 9870
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    9871 non-null   object 
 1   title                 9871 non-null   object 
 2   type                  9871 non-null   object 
 3   description           9752 non-null   object 
 4   release_year          9871 non-null   int64  
 5   age_certification     3384 non-null   object 
 6   runtime               9871 non-null   int64  
 7   genres                9871 non-null   object 
 8   production_countries  9871 non-null   object 
 9   seasons               1357 non-null   float64
 10  imdb_id               9204 non-null   object 
 11  imdb_score            8850 non-null   float64
 12  imdb_votes            8840 non-null   float64
 13  tmdb_popularity       9324 non-null   float64
 14  tmdb_score            7789 non-null   float64
dtypes: float64(5), int64(2), object(8)

In [24]:
# Count fully duplicated rows (rows identical to an earlier row in every column)
titles.duplicated().sum()

3

- There are 3 duplicate rows in the dataframe

In [25]:
# Remove repeated title rows, keeping the first occurrence of each
titles = titles.drop_duplicates(keep='first')

In [26]:
# Verify that no fully duplicated rows remain after the drop
titles.duplicated().sum()

0

- Duplicate rows have been removed

In [27]:
# Confirm the row count fell from 9,871 to 9,868 now that the 3 duplicates are removed
titles.shape

(9868, 15)

In [23]:
# Display the total number of null values for each column in the dataframe
# NOTE(review): the saved output below was produced at execution count 23, i.e. before the
# duplicates were dropped (count 25); re-run this cell so the counts reflect the 9,868-row frame.
titles.isnull().sum()

id                         0
title                      0
type                       0
description              119
release_year               0
age_certification       6487
runtime                    0
genres                     0
production_countries       0
seasons                 8514
imdb_id                  667
imdb_score              1021
imdb_votes              1031
tmdb_popularity          547
tmdb_score              2082
dtype: int64

- For the seasons column, over 86% of the data is missing and therefore the column will be dropped.

In [28]:
# Take an independent deep copy of the de-duplicated titles to clean further
title = titles.copy(deep=True)

In [32]:
# Display the first 5 rows of the working copy of the titles dataframe
title.head(5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],,tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],,tt0032599,7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],,tt0042593,7.9,30924.0,8.273,7.6


In [29]:
# Confirm the working copy has the same (rows, columns) shape as the de-duplicated titles
title.shape

(9868, 15)

In [31]:
# Drop the seasons column (over 86% of its values are missing) and keep the result;
# previously the drop was only displayed, so `title` still contained the column.
# errors='ignore' makes the cell safe to re-run once the column is already gone.
title = title.drop(columns=['seasons'], errors='ignore')

# Show the first few rows to confirm the column has been removed
title.head(5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,"During America’s Civil War, Union spies steal ...",1926,,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],tt0017925,8.2,89766.0,8.647,8.0
2,tm82253,The Best Years of Our Lives,MOVIE,It's the hope that sustains the spirit of ever...,1946,,171,"['romance', 'war', 'drama']",['US'],tt0036868,8.1,63026.0,8.435,7.8
3,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['comedy', 'drama', 'romance']",['US'],tt0032599,7.8,57835.0,11.27,7.4
4,tm56584,In a Lonely Place,MOVIE,An aspiring actress begins to suspect that her...,1950,,94,"['thriller', 'drama', 'romance']",['US'],tt0042593,7.9,30924.0,8.273,7.6
