In [1]:
import os 
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from fuzzywuzzy import fuzz
from multiprocessing import Pool
from time import time
from scipy import interpolate

ModuleNotFoundError: No module named 'fuzzywuzzy'

Data sources:
- IMDb datasets: https://datasets.imdbws.com/
- Netflix titles (Kaggle): https://www.kaggle.com/datasets/shivamb/netflix-shows

### Data Cleaning and Processing

In [None]:
# Load the raw Netflix and IMDb tables, keeping only the columns used downstream.
df_netflix = pd.read_csv("NetflixData/Content/netflix_titles.csv")[['title', 'date_added']]
df_titles = pd.read_csv("NetflixData/Content/titles.csv")[['tconst', 'primaryTitle']]
df_ratings = pd.read_csv("NetflixData/Content/ratings.csv")

# Attach ratings to the IMDb titles through the shared tconst key; the inner
# join keeps only titles that have a matching ratings row.
df_imdb = df_titles.merge(df_ratings, how="inner", on="tconst")

# Plain Python list holding every IMDb primary title.
imdb_titles = list(df_imdb['primaryTitle'].values)

In [None]:
# Preview the first two rows of the merged IMDb titles/ratings table.
df_imdb.head(2)

In [None]:
# Fetch ratings from the IMDb data and build a report of matched titles.
#
# For each Netflix title, the commented-out code below scores the lower-cased
# title against every lower-cased IMDb title with fuzz.ratio, keeps the first
# IMDb title with the highest score, and records that title's rating, vote
# count and match ratio.
#
# The code is commented out because of its long runtime (about 8 hrs). The
# results were saved and are loaded from a csv file for future use.

# matched_report = {"Netflix":[],"Imdb":[],"Ratio":[],"Votes":[],"Rating":[]}
# for x in df_netflix.iterrows():
#     title = x[1][0]
#     with Pool() as pool:
#         args = zip([title.lower()]*len(imdb_titles),[x.lower() for x in imdb_titles])
#         ratios = pool.starmap(fuzz.ratio,args)
#     maxx = max(ratios)
#     inx = ratios.index(maxx)
#     matched_title = imdb_titles[inx]
#     rating = float(df_imdb[df_imdb['primaryTitle']==matched_title]['averageRating'].values[0])
#     votes = float(df_imdb[df_imdb['primaryTitle']==matched_title]['numVotes'].values[0])
#     print(title,matched_title,rating,votes)

#     matched_report["Netflix"].append(title)
#     matched_report["Imdb"].append(matched_title)
#     matched_report['Rating'].append(rating)
#     matched_report["Votes"].append(votes)
#     matched_report['Ratio'].append(maxx)

# matched_report = pd.DataFrame(matched_report)
# Load the previously saved match report. Later cells read its "Netflix",
# "Rating", "Votes" and "Ratio" columns.
matched_report = pd.read_csv('matched_report.csv')



In [None]:
# Preview the first two rows of the loaded title-match report.
matched_report.head(2)

In [None]:
# Preview the first two rows of the Netflix frame (title and date_added columns).
df_netflix.head(2)

In [None]:
# Data-loss curve: how many titles stay matched as the similarity threshold rises.
# We assume that the availability (or name similarity) of a title in the IMDB
# dataset is independent of its rating, so a sufficiently large sample still
# gives a reasonable estimate of the mean.
# Because enough samples remain even at a high threshold, the similarity limit
# is set to 95 % to avoid any possible mismatching.
match_thresholds = range(0, 100, 1)
titles_retained = [
    len(matched_report[matched_report['Ratio'] > cutoff])
    for cutoff in match_thresholds
]

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(match_thresholds, titles_retained)
# Dotted guides at the chosen 95 cut-off and at a title count of 2597
# (presumably the number of titles kept at that cut-off; TODO confirm).
ax.axvline(95, linestyle=":", color="black")
ax.axhline(2597, linestyle=":", color="black")
ax.set_xlabel('Threshold Ratio')
ax.set_ylabel('Number of Titles Matched')
ax.set_title('Fig 1(Appendix) : Number of Titles Matched vs Threshold of fuzzy matching', fontsize=12)
fig.savefig("content_fig_1(appendix).png")
plt.show()

In [None]:
# Keep only confident fuzzy matches (ratio above 95) and attach their IMDb
# rating and vote count to the Netflix titles.
confident_matches = matched_report.loc[
    matched_report['Ratio'] > 95, ["Netflix", "Rating", "Votes"]
]
df_final = df_netflix.merge(
    confident_matches, how='inner', left_on="title", right_on="Netflix"
)
df_final = df_final.drop("Netflix", axis=1)

# Convert date_added to datetime so year and quarter can be derived from it.
df_final["date_added"] = pd.to_datetime(df_final["date_added"])

def quarter(x):
    """Return the calendar quarter (1-4) in which a title was added.

    Parameters
    ----------
    x : mapping or row
        Must expose ``x['date_added']`` with an integer ``month`` attribute.

    Returns
    -------
    int
        1 for months 1-3, 2 for months 4-6, 3 for months 7-9, 4 for months 10-12.
    """
    month = x['date_added'].month
    # Integer arithmetic keeps the result an int for every month; true division
    # would yield a float for months 3, 6, 9 and 12.
    return (month - 1) // 3 + 1
    
def year(x):
    """Return the ``year`` attribute of the row's ``date_added`` value."""
    added_on = x['date_added']
    return added_on.year

# Discard rows with any missing value, then derive the quarter and the year
# in which each title was added (row by row, via the helpers above).
df_final = df_final.dropna()
df_final["Quarter"] = df_final.apply(quarter, axis=1)
df_final["Year"] = df_final.apply(year, axis=1)

# Rating threshold for the top ten percent of IMDb titles: sort best-first and
# read the rating found one tenth of the way down the list.
imdb_sorted = df_imdb.sort_values(by="averageRating", ascending=False)
decile_position = len(imdb_sorted) // 10
top_ten_rating = imdb_sorted.iloc[decile_position]['averageRating']
def top_ten(x):
    """Return 1 when the row's "Rating" is at or above ``top_ten_rating``, else 0.

    ``top_ten_rating`` is read from the notebook's global namespace.
    """
    return 1 if x["Rating"] >= top_ten_rating else 0

# Flag top-decile titles and add a constant counter column, so that a grouped
# sum of "Show Added" yields the number of titles per group.
df_final["Top Ranked"] = df_final.apply(top_ten, axis=1)
df_final["Show Added"] = 1

# Aggregate per (Year, Quarter). numeric_only=True keeps the text and datetime
# columns (title, date_added) out of the mean; recent pandas versions raise a
# TypeError on them instead of silently dropping them.
quarterly_groups = df_final.groupby(["Year", "Quarter"], as_index=False)
df_Netflix = quarterly_groups.mean(numeric_only=True)

# Counts are sums, not means: overwrite the averaged flag with the number of
# top-ranked titles and add the number of titles added in each quarter.
quarterly_totals = quarterly_groups[["Top Ranked", "Show Added"]].sum()
df_Netflix["Top Ranked"] = quarterly_totals["Top Ranked"]
df_Netflix["Number of Titles Added"] = quarterly_totals["Show Added"]

# Restrict to 2018 onwards. The explicit copy makes the column assignment
# below write to an independent frame rather than a slice of the full table.
df_Netflix = df_Netflix[df_Netflix["Year"] >= 2018].copy()
df_Netflix['Date'] = df_Netflix['Year'].astype('str') + "-" + df_Netflix['Quarter'].astype('str')

# Calculating Rank/Percentile of Content 

def percentile(x):
    """Return the percentage of IMDb titles rated strictly below the row's "Rating".

    Reads the global ``df_imdb`` frame and compares its "averageRating" column
    against ``x['Rating']``; the result lies between 0 and 100.
    """
    rated_below = df_imdb["averageRating"] < x['Rating']
    return int(rated_below.sum()) / len(df_imdb) * 100
# Percentile of each quarter's average rating within the full IMDb rating set.
df_Netflix['Percentile'] = df_Netflix.apply(percentile, axis=1)



In [None]:
# Display the per-quarter summary table (rows from 2018 onwards).
df_Netflix

In [None]:
# Per-quarter totals of the numeric columns. numeric_only=True leaves out the
# title and date_added columns; summing a datetime column raises a TypeError
# in recent pandas versions.
df_final.groupby(["Year","Quarter"],as_index=False).sum(numeric_only=True)

In [None]:
# Display the per-title frame with the derived Quarter, Year, "Top Ranked" and "Show Added" columns.
df_final

In [None]:
# Inspect the type of the scalar used as the top-ten-percent rating threshold
# (the same expression that defines top_ten_rating).
type(imdb_sorted.iloc[int(len(imdb_sorted)/10)]['averageRating'])

In [None]:
# Preview the first two rows of the per-quarter summary table.
df_Netflix.head(2)

### DATA VISUALISATION

In [None]:
# Figure 1: average IMDB rating of the matched Netflix titles, one point per quarter.
quarter_index = range(len(df_Netflix))
fig, ax = plt.subplots(figsize=(15, 7.5))
fig.suptitle("Fig 1: Netflix Content Average Reviews", fontsize=24)
ax.plot(quarter_index, df_Netflix['Rating'])
# Dotted vertical marker at x index 7 (2019 Q4 if index 0 is 2018 Q1 and no
# quarter is missing; TODO confirm which event it marks).
ax.axvline(7, linestyle=":", color="black")
ax.set_xlabel('Quarter')
ax.set_ylabel('Average IMDB Rating')
fig.savefig("content_fig_1.png")
print("Note : Each index of x axis cooresponds to a quarter. With index zero for 2018 Q1 and index 17 for 2022 Q2")

In [None]:
# Figure 2: percentile of each quarter's average rating within the IMDb rating set.
quarter_index = range(len(df_Netflix))
fig, ax = plt.subplots(figsize=(15, 7.5))
fig.suptitle("Fig 2: Netflix Content Percentile of Average Reviews", fontsize=24)
ax.plot(quarter_index, df_Netflix['Percentile'])
# Dotted vertical marker at x index 7 (2019 Q4 if index 0 is 2018 Q1 and no
# quarter is missing; TODO confirm which event it marks).
ax.axvline(7, linestyle=":", color="black")
ax.set_xlabel('Quarter')
fig.savefig("content_fig_2.png")
print("Note : Each index of x axis cooresponds to a quarter. With index zero for 2018 Q1 and index 17 for 2022 Q2")

In [None]:
# Figure 3: number of titles per quarter rated at or above the top-ten-percent threshold.
quarter_index = range(len(df_Netflix))
fig, ax = plt.subplots(figsize=(15, 7.5))
fig.suptitle("Fig 3: Netflix Content - Top 10 Percentile Content", fontsize=24)
ax.plot(quarter_index, df_Netflix['Top Ranked'])
# Dotted vertical marker at x index 7 (2019 Q4 if index 0 is 2018 Q1 and no
# quarter is missing; TODO confirm which event it marks).
ax.axvline(7, linestyle=":", color="black")
ax.set_xlabel('Quarter')
fig.savefig("content_fig_3.png")
print("Note : Each index of x axis cooresponds to a quarter. With index zero for 2018 Q1 and index 17 for 2022 Q2")

In [None]:
# Figure 4: number of matched titles added to Netflix in each quarter.
fig = plt.figure(figsize=(15, 7.5))
fig.suptitle("Fig 4: Netflix Content - Number of Titles Added in the quarter", fontsize=24)
plt.plot(range(len(df_Netflix)), df_Netflix['Number of Titles Added'])
# Dotted vertical marker at x index 7 (2019 Q4 if index 0 is 2018 Q1 and no
# quarter is missing; TODO confirm which event it marks).
plt.axvline(7, linestyle=":", color="black")
plt.xlabel('Quarter')
# Save under this figure's own name; writing to "content_fig_3.png" would
# overwrite the file saved for Fig 3.
plt.savefig("content_fig_4.png")
print("Note : Each index of x axis cooresponds to a quarter. With index zero for 2018 Q1 and index 17 for 2022 Q2")