In [89]:
import pandas as pd
import numpy as np 

import statistics
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 25)

In [5]:
countries_data = pd.read_csv("Data/all-weeks-countries.csv", encoding = 'cp1252')
countries_data.head()

Unnamed: 0,country_name,country_iso2,week,category,weekly_rank,show_title,season_title,cumulative_weeks_in_top_10
0,Argentina,AR,2023-03-05,Films,1,We Have a Ghost,,2
1,Argentina,AR,2023-03-05,Films,2,Bad Boys for Life,,2
2,Argentina,AR,2023-03-05,Films,3,The Condemned,,1
3,Argentina,AR,2023-03-05,Films,4,Love at First Kiss,,1
4,Argentina,AR,2023-03-05,Films,5,Tonight You're Sleeping with Me,,1


In [247]:
# countries_data.country_name.nunique()
# countries_data.week.max()
countries_data.category.value_counts()

Films    82190
TV       82190
Name: category, dtype: int64

In [7]:
global_data = pd.read_csv("Data/all-weeks-global.csv", encoding = 'cp1252')
global_data.head()

Unnamed: 0,week,category,weekly_rank,show_title,season_title,weekly_hours_viewed,cumulative_weeks_in_top_10
0,2023-03-05,Films (English),1,We Have a Ghost,,42620000,2
1,2023-03-05,Films (English),2,The Strays,,14570000,2
2,2023-03-05,Films (English),3,The Condemned,,10230000,1
3,2023-03-05,Films (English),4,R.I.P.D.,,8330000,1
4,2023-03-05,Films (English),5,Bad Boys for Life,,8330000,2


In [249]:
global_data.week.min()

'2021-07-04'

In [11]:
popular_data = pd.read_csv("Data/most-popular.csv", encoding = 'cp1252')
popular_data.head()

Unnamed: 0,category,rank,show_title,season_title,hours_viewed_first_28_days
0,Films (English),1,Red Notice,,364020000
1,Films (English),2,Don't Look Up,,359790000
2,Films (English),3,Bird Box,,282020000
3,Films (English),4,Glass Onion: A Knives Out Mystery,,279740000
4,Films (English),5,The Gray Man,,253870000


In [251]:
popular_data.category.value_counts()

Films (English)        10
Films (Non-English)    10
TV (English)           10
TV (Non-English)       10
Name: category, dtype: int64

In [78]:
shows_data = pd.read_csv("Data/netflix_shows.csv", encoding = 'cp1252')
shows_data.head()

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,Breaking Bad,2008.0,18+,9.5,96%,1,0,0,0,1.0
1,Stranger Things,2016.0,16+,8.8,93%,1,0,0,0,1.0
2,Money Heist,2017.0,18+,8.4,91%,1,0,0,0,1.0
3,Sherlock,2010.0,16+,9.1,78%,1,0,0,0,1.0
4,Better Call Saul,2015.0,18+,8.7,97%,1,0,0,0,1.0


In [21]:
shows_data.columns = shows_data.columns.str.replace(" ","_") 

In [22]:
shows_data.sample(2)

Unnamed: 0,Title,Year,Age,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime_Video,Disney+,type
4000,Le Mans: Racing is Everything,2017,7+,7.4,,0,0,1,0,1.0
1575,Back Street Girls: Goku Dolls,2018,,6.6,,1,0,0,0,1.0


## Statistical Analysis

### 1. Is there a significant difference in IMDb ratings between shows available on Netflix, Hulu, Prime Video, and Disney+?

In [16]:
shows_data.Netflix.value_counts()

0    3680
1    1931
Name: Netflix, dtype: int64

In [17]:
shows_data.Hulu.value_counts()

0    3856
1    1755
Name: Hulu, dtype: int64

In [23]:
shows_data['Prime_Video'].value_counts()

0    3469
1    2142
Name: Prime_Video, dtype: int64

In [25]:
shows_data['Disney+'].value_counts()

0    5427
1     184
Name: Disney+, dtype: int64

In [65]:
## assume significance level = 0.05

alpha = 0.05

## extract the data, make sure no null value is selected and sampling it for better random results

netflix = shows_data[shows_data.Netflix == 1]['IMDb'].dropna()
hulu = shows_data[shows_data.Hulu == 1]['IMDb'].dropna()
prime_video = shows_data[shows_data.Prime_Video == 1]['IMDb'].dropna()
disney = shows_data[shows_data['Disney+'] == 1]['IMDb'].dropna()

In [132]:
## applying anova test

f_statistics, p_value = stats.f_oneway(netflix, hulu, prime_video, disney)

## results

print(f"P-value for the ANOVA test is {p_value} and F-Statistics value is {f_statistics}")


## conclusion
if p_value > alpha:
    print(f"As P-value, {p_value} is greater than significance level, {alpha}, so we fail in Rejecting NULL Hypothesis, conclude that there is no significant difference.")
else:
    print(f"As P-value,{p_value} is smaller than significance level, {alpha}, so we will be Rejecting NULL Hypothesis conclude that there is a significant difference in IMDb ratings among the streaming platforms.")


P-value for the ANOVA test is 0.0014769661774716425 and F-Statistics value is 5.154318030100773
As P-value,0.0014769661774716425 is smaller than significance level, 0.05, so we will be Rejecting NULL Hypothesis conclude that there is a significant difference in IMDb ratings among the streaming platforms.


In [67]:
## manually

netflix.mean(), hulu.mean(), prime_video.mean(), disney.mean()

(7.165524861878453, 7.061015831134565, 7.179326186830016, 6.923668639053254)

### 2. Does the age rating (e.g., 16+, 18+) of TV shows have an impact on their IMDb ratings?


In [79]:
shows_data.columns

Index(['Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix', 'Hulu',
       'Prime Video', 'Disney+', 'type'],
      dtype='object')

In [80]:
shows_data.Age.value_counts()

16+    1018
7+      848
18+     750
all     545
13+       4
Name: Age, dtype: int64

In [83]:
# shows_data.query("Age == ['3.1', '7.2', '7.8']")  #data cleaning

In [123]:
## filtering population and taking only 50% of population

age_7 = shows_data[shows_data['Age'] == '7+']['IMDb'].dropna()
age_13 = shows_data[shows_data['Age'] == '13+']['IMDb'].dropna()
age_16 = shows_data[shows_data['Age'] == '16+']['IMDb'].dropna()
age_18 = shows_data[shows_data['Age'] == '18+']['IMDb'].dropna()
age_all = shows_data[shows_data['Age'] == 'all']['IMDb'].dropna()

In [124]:
age_7.shape, age_13.shape, age_16.shape, age_18.shape, age_all.shape

((838,), (4,), (1013,), (749,), (532,))

In [125]:
age_7.mean(), age_13.mean(), age_16.mean(), age_18.mean(), age_all.mean()

(7.062171837708831,
 6.0,
 7.2755182625863775,
 7.398130841121495,
 6.874624060150376)

In [126]:
age_13

1116    6.6
1123    6.4
1514    5.7
4024    5.3
Name: IMDb, dtype: float64

In [127]:
datapoints = [age_7, age_13, age_16, age_18, age_all]

In [128]:
## plotting distplot to visulaize the dats

def displot_func(data):
    plt.figure(figsize = (8,5))
    sns.displot(data)
    plt.axvline(x = data.mean(), color = 'red', linestyle = 'dotted', linewidth = 2)
    # plt.title("Distplot of the", data)
    plt.show()

**we choose a t-test when comparing the means of two groups, while ANOVA is used when comparing the means of more than two groups.**

In [135]:
## using ANOVA test due to different categorical values in a column

## using 50% samples from each category

## significance level = 0.50
alpha = 0.50

f_statistics, p_value = stats.f_oneway(age_7, age_16, age_18, age_all)

print(f"The p-value for the test is {p_value} and f_statistics value is {f_statistics}")

print("\nConclusion : \n")

if p_value > alpha:
    print(f"P-value for the following test is {p_value} and that's greater than significance value, so we cannot REJECT the NULL Hypothesis, means there is no age effects in IMDb ratings")
else:
    print("We REJECT the NULL Hypothesis and conclude that there is a signifance effects of age ratings on IMDb ratings")

The p-value for the test is 4.056435723903755e-19 and f_statistics value is 30.00355682223527

Conclusion : 

We REJECT the NULL Hypothesis and conclude that there is a signifance effects of age ratings on IMDb ratings


### 3. Are there significant differences in IMDb ratings among TV shows in different categories (e.g., Films, Series)?


In [136]:
shows_data.columns

Index(['Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix', 'Hulu',
       'Prime Video', 'Disney+', 'type'],
      dtype='object')

In [138]:
shows_data.type.value_counts()

1.0    5610
Name: type, dtype: int64

In [139]:
popular_data.category.value_counts()

Films (English)        10
Films (Non-English)    10
TV (English)           10
TV (Non-English)       10
Name: category, dtype: int64

In [142]:
countries_data.category.value_counts()

Films    82190
TV       82190
Name: category, dtype: int64

In [143]:
shows_data.shape

(5611, 10)

In [149]:
global_data.columns, global_data.shape

(Index(['week', 'category', 'weekly_rank', 'show_title', 'season_title',
        'weekly_hours_viewed', 'cumulative_weeks_in_top_10'],
       dtype='object'),
 (3520, 7))

In [148]:
global_data.category.value_counts()

Films (English)        880
Films (Non-English)    880
TV (English)           880
TV (Non-English)       880
Name: category, dtype: int64

#### Merging `Global Data` with 'Shows Data` on title column and it's an inner join

In [226]:
glob_show_data = pd.merge(global_data, shows_data, left_on = 'show_title' , right_on = 'Title', how = 'inner')

In [217]:
glob_show_data.Title.nunique()

97

In [218]:
glob_show_data.shape

(588, 17)

In [219]:
glob_show_data.head()

Unnamed: 0,week,category,weekly_rank,show_title,season_title,weekly_hours_viewed,cumulative_weeks_in_top_10,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,2023-03-05,TV (English),1,Outer Banks,Outer Banks: Season 3,99000000,2,Outer Banks,2020.0,18+,7.6,72%,1,0,0,0,1.0
1,2023-03-05,TV (English),4,Outer Banks,Outer Banks: Season 1,34050000,7,Outer Banks,2020.0,18+,7.6,72%,1,0,0,0,1.0
2,2023-03-05,TV (English),6,Outer Banks,Outer Banks: Season 2,27750000,8,Outer Banks,2020.0,18+,7.6,72%,1,0,0,0,1.0
3,2023-02-26,TV (English),1,Outer Banks,Outer Banks: Season 3,154970000,1,Outer Banks,2020.0,18+,7.6,72%,1,0,0,0,1.0
4,2023-02-26,TV (English),4,Outer Banks,Outer Banks: Season 1,35100000,6,Outer Banks,2020.0,18+,7.6,72%,1,0,0,0,1.0


In [220]:
glob_show_data.category.value_counts()

TV (English)           410
TV (Non-English)       145
Films (English)         26
Films (Non-English)      7
Name: category, dtype: int64

In [227]:
## filtering data and considering not null values only

tv_eng = glob_show_data[glob_show_data.category == 'TV (English)']['IMDb'].dropna()
tv_noneng = glob_show_data[glob_show_data.category == 'TV (Non-English)']['IMDb'].dropna()
films_eng = glob_show_data[glob_show_data.category == 'Films (English)']['IMDb'].dropna()
films_noneng = glob_show_data[glob_show_data.category == 'Films (Non-English)']['IMDb'].dropna()

In [235]:
## performing t-test

t_statistics, p_value = stats.ttest_ind(tv_eng, tv_noneng)

In [236]:
## setting significance level = 0.05

alpha = 0.05

print(f"T-statistics value is {t_statistics} and P-Value is {p_value}")

T-statistics value is 0.4994830838794304 and P-Value is 0.6176377607433152


In [224]:
# tv.mean(), films.mean()

In [233]:
## performing ANOVA test

f_statistics, p_value_anova = stats.f_oneway(tv_eng, tv_noneng, films_eng, films_noneng)

In [239]:
## setting significance level = 0.05

alpha = 0.05

print(f"F-statistics value is {f_statistics} and P-Value is {p_value_anova}")

print("\nConclusion : \n")

if p_value_anova > alpha:
    print(f"P-Value, {p_value_anova} is greater than alpha, {alpha}, We fail to REJECT the NULL Hypothesis and conclude that there is significant difference in IMDb ratings in different categories")
else:
    print("We will REJECT the NULL Hypothesis, concludes that there is no significance differences on ratings on IMDb in different category")

F-statistics value is 8.888317567654266 and P-Value is 9.121234076879603e-06

Conclusion : 

We will REJECT the NULL Hypothesis, concludes that there is no significance differences on ratings on IMDb in different category
