In [1]:
# Import dependencies
import requests
import pandas as pd

from config import api_key

In [2]:
# Read the CSV at file_path into a DataFrame
file_path = "Kaggle/imdb_movies_shows.csv"
netflix_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
netflix_data.head(n=5)

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
0,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
1,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0
2,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0
3,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0
4,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0


In [3]:
# Keep only the entries whose type is 'SHOW'; only TV shows are analysed.
is_show = netflix_data["type"] == "SHOW"

# Renumber the surviving rows from 0. drop must be the boolean True (not the
# string "True") so the old index is discarded rather than kept as a column.
netflix_shows = netflix_data.loc[is_show, :].reset_index(drop=True)

# Display the first few rows of the netflix_shows DataFrame
netflix_shows.head()

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
0,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
1,Monty Python's Flying Circus,SHOW,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,8.8,72895.0
2,Monty Python's Fliegender Zirkus,SHOW,1972,TV-MA,43,['comedy'],[],1.0,tt0202477,8.1,2144.0
3,Seinfeld,SHOW,1989,TV-PG,24,['comedy'],['US'],9.0,tt0098904,8.9,302700.0
4,Knight Rider,SHOW,1982,TV-PG,51,"['action', 'scifi', 'crime', 'drama']",['US'],4.0,tt0083437,6.9,33760.0


In [4]:
# Summarise the dataset: row count, each column's dtype and its non-null
# count, which shows which columns contain missing values.
netflix_shows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047 entries, 0 to 2046
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 2047 non-null   object 
 1   type                  2047 non-null   object 
 2   release_year          2047 non-null   int64  
 3   age_certification     1790 non-null   object 
 4   runtime               2047 non-null   int64  
 5   genres                2047 non-null   object 
 6   production_countries  2047 non-null   object 
 7   seasons               2047 non-null   float64
 8   imdb_id               1911 non-null   object 
 9   imdb_score            1876 non-null   float64
 10  imdb_votes            1876 non-null   float64
dtypes: float64(3), int64(2), object(6)
memory usage: 176.0+ KB


In [5]:
# Discard every row that has a missing value in any column
netflix_shows = netflix_shows.dropna(axis="index", how="any")

# Preview the last five remaining rows
netflix_shows.tail(n=5)

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
2036,Abla Fahita: Drama Queen,SHOW,2021,TV-14,25,"['drama', 'comedy', 'crime']",['XX'],1.0,tt11570224,6.6,433.0
2039,Christmas Flow,SHOW,2021,TV-MA,50,"['music', 'romance', 'comedy']",['FR'],1.0,tt15340790,5.8,702.0
2040,Korean Cold Noodle Rhapsody,SHOW,2021,TV-PG,49,['documentation'],['KR'],1.0,tt15772846,7.3,15.0
2041,Pitta Kathalu,SHOW,2021,TV-MA,37,"['drama', 'romance']",['IN'],1.0,tt13879000,5.1,727.0
2044,The Big Day,SHOW,2021,TV-MA,45,"['reality', 'romance']",['US'],2.0,tt13887518,4.6,327.0


In [6]:
# Count the distinct IMDb IDs; the IDs are all unique only if this number
# equals the number of rows in netflix_shows.
netflix_shows.loc[:, "imdb_id"].nunique()

1670

In [7]:
# Preview the first five rows of the cleaned netflix_shows DataFrame
netflix_shows.head(n=5)

Unnamed: 0,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
1,Monty Python's Flying Circus,SHOW,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,8.8,72895.0
2,Monty Python's Fliegender Zirkus,SHOW,1972,TV-MA,43,['comedy'],[],1.0,tt0202477,8.1,2144.0
3,Seinfeld,SHOW,1989,TV-PG,24,['comedy'],['US'],9.0,tt0098904,8.9,302700.0
4,Knight Rider,SHOW,1982,TV-PG,51,"['action', 'scifi', 'crime', 'drama']",['US'],4.0,tt0083437,6.9,33760.0
5,Thomas & Friends,SHOW,1984,TV-Y,10,"['family', 'comedy', 'music', 'action', 'anima...",['GB'],24.0,tt0086815,6.5,4948.0


In [8]:
# Use the OMDb API to add a "Language" column, looked up by imdb_id.
# OMDb had a limit of 1,000 calls a day, so one run may not cover every row.

# Base URL for the OMDb API, defined once because it never changes in the loop.
# HTTPS keeps the API key out of plain-text traffic.
base_url = "https://www.omdbapi.com/"

# Seconds to wait for OMDb on a single request, so that a stalled connection
# cannot hang the cell indefinitely.
request_timeout = 10

netflix_shows["Language"] = ""

# A single session reuses the underlying connection across all requests.
with requests.Session() as session:
    # Iterate over a snapshot of the IDs, because the DataFrame is written to
    # inside the loop.
    for index, imdb_id in list(netflix_shows["imdb_id"].items()):
        try:
            # Make an API request using the imdb_id and decode the JSON body
            response = session.get(
                base_url,
                params={"i": imdb_id, "apikey": api_key},
                timeout=request_timeout,
            )
            show_data = response.json()
        except (requests.RequestException, ValueError) as error:
            # Network failure, timeout, or a body that is not valid JSON.
            # Only the exception type is printed: the full message can contain
            # the request URL, which includes the API key.
            print(f"{imdb_id} - request failed ({type(error).__name__})")
            netflix_shows.loc[index, "Language"] = "N/A"
            continue

        # Store the value OMDb reports under "Language" for this title
        try:
            netflix_shows.loc[index, "Language"] = show_data["Language"]
        except (KeyError, TypeError):
            # If title is not found, print the message and set as N/A
            print(f"{imdb_id} - not found")
            netflix_shows.loc[index, "Language"] = "N/A"
        
        

        

tt10020984 - not found
tt13976072 - not found
tt4531748 - not found
tt10800582 - not found
tt15438396 - not found
tt6818948 - not found
tt9165300 - not found
tt9889712 - not found
tt7607544 - not found
tt10751504 - not found


KeyboardInterrupt: 

In [16]:
# Show the updated DataFrame (a bare last expression is rendered as the
# cell's output).
# NOTE(review): the saved output for this cell is a SyntaxError, which this
# code cannot raise - presumably stale; re-run the cell to refresh it.
netflix_shows

SyntaxError: invalid syntax (895527370.py, line 2)