In [1]:
# Import dependencies
import requests
import pandas as pd

from config import api_key

In [2]:
# Load the dataset into a DataFrame
file_path = "Kaggle/imdb_movies_shows.csv"
netflix_data = pd.read_csv(file_path)
netflix_data.head()

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
0,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
1,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0
2,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0
3,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0
4,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0


In [3]:
# Filter to keep only entries where Type is 'SHOW'
netflix_shows = netflix_data.loc[netflix_data["type"] == "SHOW", :]

# Display the first few rows of the netflix_shows DataFrame
netflix_shows.head()

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
0,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
5,Monty Python's Flying Circus,SHOW,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,8.8,72895.0
29,Monty Python's Fliegender Zirkus,SHOW,1972,TV-MA,43,['comedy'],[],1.0,tt0202477,8.1,2144.0
47,Seinfeld,SHOW,1989,TV-PG,24,['comedy'],['US'],9.0,tt0098904,8.9,302700.0
55,Knight Rider,SHOW,1982,TV-PG,51,"['action', 'scifi', 'crime', 'drama']",['US'],4.0,tt0083437,6.9,33760.0


In [4]:
#Analyse the shape of the dataset
netflix_shows.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2047 entries, 0 to 5805
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 2047 non-null   object 
 1   type                  2047 non-null   object 
 2   release_year          2047 non-null   int64  
 3   age_certification     1790 non-null   object 
 4   runtime               2047 non-null   int64  
 5   genres                2047 non-null   object 
 6   production_countries  2047 non-null   object 
 7   seasons               2047 non-null   float64
 8   imdb_id               1911 non-null   object 
 9   imdb_score            1876 non-null   float64
 10  imdb_votes            1876 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 191.9+ KB


In [5]:
#Removing N/A entries from the DataFrame 
netflix_shows = netflix_shows.dropna()
netflix_shows.info()

#Reset the index 
netflix_shows = netflix_shows.reset_index(drop = "True")

<class 'pandas.core.frame.DataFrame'>
Index: 1670 entries, 5 to 5796
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 1670 non-null   object 
 1   type                  1670 non-null   object 
 2   release_year          1670 non-null   int64  
 3   age_certification     1670 non-null   object 
 4   runtime               1670 non-null   int64  
 5   genres                1670 non-null   object 
 6   production_countries  1670 non-null   object 
 7   seasons               1670 non-null   float64
 8   imdb_id               1670 non-null   object 
 9   imdb_score            1670 non-null   float64
 10  imdb_votes            1670 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 156.6+ KB


In [6]:
#Checking that all IMDB IDs are unique 
netflix_shows["imdb_id"].nunique()

1670

In [8]:
netflix_shows.columns

Index(['title', 'type', 'release_year', 'age_certification', 'runtime',
       'genres', 'production_countries', 'seasons', 'imdb_id', 'imdb_score',
       'imdb_votes'],
      dtype='object')

In [9]:
#Using OMDB to add a column for languages using the imdb_id - OMDB had a limit of a 1000 calls a day.

netflix_shows["Language"] = ""

# Iterate through the netflix_shows DataFrame
for index, row in netflix_shows.iterrows():
    # Get imdb_id from the DataFrame
    imdb_id = row["imdb_id"]

    # Set base URL for OMDB API
    base_url = "http://www.omdbapi.com/"

    # Make an API request using the imdb_id
    response = requests.get(base_url, params={"i": imdb_id, "apikey": api_key})

    # Convert the API response to JSON format
    show_data = response.json()

    # Grab the language from the results and store it in the netflix_shows DataFrame as a list
    try:
        netflix_shows.loc[index, "Language"] = show_data["Language"]
    except:
        # If title is not found, print the message and set as N/A 
        print(f"{imdb_id} - not found")
        netflix_shows.loc[index, "Language"] = "N/A"
        
        

        

tt10020984 - not found
tt13976072 - not found
tt4531748 - not found
tt10800582 - not found
tt15438396 - not found
tt6818948 - not found
tt9165300 - not found
tt9889712 - not found
tt7607544 - not found
tt10751504 - not found
tt18224644 - not found
tt18274456 - not found


In [11]:
#Show the updated DataFrame 
#Removing N/A entries from the DataFrame 
netflix_shows = netflix_shows.dropna()
netflix_shows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670 entries, 0 to 1669
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 1670 non-null   object 
 1   type                  1670 non-null   object 
 2   release_year          1670 non-null   int64  
 3   age_certification     1670 non-null   object 
 4   runtime               1670 non-null   int64  
 5   genres                1670 non-null   object 
 6   production_countries  1670 non-null   object 
 7   seasons               1670 non-null   float64
 8   imdb_id               1670 non-null   object 
 9   imdb_score            1670 non-null   float64
 10  imdb_votes            1670 non-null   float64
 11  Language              1670 non-null   object 
dtypes: float64(3), int64(2), object(7)
memory usage: 156.7+ KB


In [12]:
#Reset the index 
netflix_shows = netflix_shows.reset_index(drop = "True")
netflix_shows

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,Language
0,Monty Python's Flying Circus,SHOW,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,8.8,72895.0,"English, Arabic, French, German, Italian, Mand..."
1,Monty Python's Fliegender Zirkus,SHOW,1972,TV-MA,43,['comedy'],[],1.0,tt0202477,8.1,2144.0,"English, German"
2,Seinfeld,SHOW,1989,TV-PG,24,['comedy'],['US'],9.0,tt0098904,8.9,302700.0,English
3,Knight Rider,SHOW,1982,TV-PG,51,"['action', 'scifi', 'crime', 'drama']",['US'],4.0,tt0083437,6.9,33760.0,English
4,Thomas & Friends,SHOW,1984,TV-Y,10,"['family', 'comedy', 'music', 'action', 'anima...",['GB'],24.0,tt0086815,6.5,4948.0,"Spanish, French, Chinese, Japanese, English"
...,...,...,...,...,...,...,...,...,...,...,...,...
1665,Abla Fahita: Drama Queen,SHOW,2021,TV-14,25,"['drama', 'comedy', 'crime']",['XX'],1.0,tt11570224,6.6,433.0,"Arabic, Turkish, English"
1666,Christmas Flow,SHOW,2021,TV-MA,50,"['music', 'romance', 'comedy']",['FR'],1.0,tt15340790,5.8,702.0,French
1667,Korean Cold Noodle Rhapsody,SHOW,2021,TV-PG,49,['documentation'],['KR'],1.0,tt15772846,7.3,15.0,Korean
1668,Pitta Kathalu,SHOW,2021,TV-MA,37,"['drama', 'romance']",['IN'],1.0,tt13879000,5.1,727.0,


In [18]:
# Make new data frame and drop columns
shows_df = netflix_shows[['title', 'release_year', 'age_certification', 'runtime', 'Language',
       'genres', 'production_countries', 'seasons', 'imdb_score',
       'imdb_votes']]
shows_df

Unnamed: 0,title,release_year,age_certification,runtime,Language,genres,production_countries,seasons,imdb_score,imdb_votes
0,Monty Python's Flying Circus,1969,TV-14,30,"English, Arabic, French, German, Italian, Mand...","['comedy', 'european']",['GB'],4.0,8.8,72895.0
1,Monty Python's Fliegender Zirkus,1972,TV-MA,43,"English, German",['comedy'],[],1.0,8.1,2144.0
2,Seinfeld,1989,TV-PG,24,English,['comedy'],['US'],9.0,8.9,302700.0
3,Knight Rider,1982,TV-PG,51,English,"['action', 'scifi', 'crime', 'drama']",['US'],4.0,6.9,33760.0
4,Thomas & Friends,1984,TV-Y,10,"Spanish, French, Chinese, Japanese, English","['family', 'comedy', 'music', 'action', 'anima...",['GB'],24.0,6.5,4948.0
...,...,...,...,...,...,...,...,...,...,...
1665,Abla Fahita: Drama Queen,2021,TV-14,25,"Arabic, Turkish, English","['drama', 'comedy', 'crime']",['XX'],1.0,6.6,433.0
1666,Christmas Flow,2021,TV-MA,50,French,"['music', 'romance', 'comedy']",['FR'],1.0,5.8,702.0
1667,Korean Cold Noodle Rhapsody,2021,TV-PG,49,Korean,['documentation'],['KR'],1.0,7.3,15.0
1668,Pitta Kathalu,2021,TV-MA,37,,"['drama', 'romance']",['IN'],1.0,5.1,727.0


In [20]:
# Define the file path to save the CSV
csv_file_path = 'Output/netflix_shows.csv'

# Save the DataFrame to a CSV file
netflix_shows.to_csv(csv_file_path, index=False)
