In [27]:
import pandas as pd
import numpy as np
from scipy import stats

# Location of the Netflix titles CSV (a relative path, so it is resolved
# against the kernel's current working directory).
file_path = "netflix_titles.csv"

# Read the whole file into a DataFrame using pandas' default parsing options.
netflix_df = pd.read_csv(filepath_or_buffer=file_path)


In [28]:
# Keep only the rows whose `type` column is exactly 'TV Show'.
# `.copy()` gives an independent frame, so later edits to `tv_shows_df`
# do not touch `netflix_df`.
tv_shows_df = netflix_df.loc[netflix_df['type'].eq('TV Show')].copy()


In [29]:
# Number of TV shows per `release_year`, as a two-column frame:
# `release_year` (the group key) and `count` (rows in that group).
tv_shows_per_year = (
    tv_shows_df
    .groupby('release_year')
    .size()
    .reset_index(name='count')
)

In [30]:
# Define our two periods as inclusive (first_year, last_year) bounds.
PERIOD_1_YEARS = (2012, 2016)
PERIOD_2_YEARS = (2017, 2021)


def yearly_counts_for_period(counts_per_year, first_year, last_year):
    """Return the per-year TV show counts for ``first_year..last_year``.

    Parameters
    ----------
    counts_per_year : pandas.DataFrame
        Frame with a ``release_year`` column (one row per year) and a
        ``count`` column.
    first_year, last_year : int
        Inclusive bounds of the period.

    Returns
    -------
    pandas.Series
        One ``count`` value per calendar year in the period, indexed by year.
        A year that has no row in ``counts_per_year`` is reported as 0
        instead of being dropped, so the period always contributes
        ``last_year - first_year + 1`` observations to the mean and to the
        t-test.
    """
    counts_by_year = counts_per_year.set_index('release_year')['count']
    period_years = range(first_year, last_year + 1)
    return counts_by_year.reindex(period_years, fill_value=0)


period_1_counts = yearly_counts_for_period(tv_shows_per_year, *PERIOD_1_YEARS)
period_2_counts = yearly_counts_for_period(tv_shows_per_year, *PERIOD_2_YEARS)


In [31]:
# Calculate the average number of TV shows per year for each period
avg_tv_shows_period_1 = period_1_counts.mean()
avg_tv_shows_period_2 = period_2_counts.mean()

# The long messages are split with implicit string concatenation inside the
# call parentheses. A backslash continuation *inside* the literal would make
# the next source line's indentation part of the printed text.
print(
    "Average TV Shows per year (2012–2016): "
    f"{avg_tv_shows_period_1:.2f}"
)
print(
    "Average TV Shows per year (2017–2021): "
    f"{avg_tv_shows_period_2:.2f}"
)

Average TV Shows per year (2012–2016):      124.20
Average TV Shows per year (2017–2021):      358.60


In [None]:
# Two-sample t-test on the yearly counts of the two periods.
#   equal_var=False       -> Welch's variant (no equal-variance assumption)
#   alternative='greater' -> one-sided: mean of `a` greater than mean of `b`
t_statistic, p_value = stats.ttest_ind(
    a=period_2_counts,
    b=period_1_counts,
    equal_var=False,
    alternative='greater',
)

print(f"\nT-statistic: {t_statistic:.2f}")
print(f"P-value: {p_value:.4f}")


T-statistic: 5.05
P-value: 0.0005


In [None]:
# Significance level: the p-value threshold used for the test decision.
alpha: float = 0.05

In [None]:
# Make a decision based on the p-value.
#
# Each message is assembled with implicit string concatenation inside the
# call parentheses. A backslash continuation *inside* a string literal would
# make the next source line's indentation part of the printed sentence.
#
# The counts under test are grouped by `release_year`, so the conclusion is
# worded in terms of shows *released* per year.
print("\n--- Hypothesis Test Decision ---")
if p_value < alpha:
    print(
        f"Since p-value ({p_value:.4f}) < alpha ({alpha}), "
        "we REJECT the Null Hypothesis."
    )
    print(
        "Conclusion: There is statistically significant evidence "
        "that the average number of TV shows released per year "
        "in 2017-2021 is greater than in 2012-2016."
    )
else:
    print(
        f"Since p-value ({p_value:.4f}) >= alpha ({alpha}), "
        "we FAIL TO REJECT the Null Hypothesis."
    )
    print(
        "Conclusion: There is not enough statistically "
        "significant evidence to claim that the average number "
        "of TV shows released per year in 2017-2021 is greater "
        "than in 2012-2016."
    )


--- Hypothesis Test Decision ---
Since p-value (0.0005) < alpha (0.05), we REJECT the Null Hypothesis.
Conclusion: There is statistically significant evidence that the average number of TV shows added per year in 2017-2021 is greater than in 2012-2016. 🎉
