# Analysis - Movie Subset of Netflix Data

Enhanced by Rotten Tomatoes Dataset

#### Import Libraries

In [13]:
import pandas as pd
import plotly.express as px

Create DataFrame from Netflix CSV, filtered to just movies

Create DataFrame from Rotten Tomatoes Movies CSV

In [14]:
# Load the Netflix catalogue and keep only the rows whose type is 'Movie'.
# The trailing .copy() makes the filtered result an independent frame: a later
# cell adds a "title_and_release_year" column to it, and assigning into a
# boolean-filtered selection triggers pandas' SettingWithCopyWarning.
netflix_movies_df = (
    pd.read_csv('./data/netflix-titles.csv')
    .loc[lambda titles: titles["type"] == 'Movie']
    .copy()
)

# Load the Rotten Tomatoes movies table unfiltered.
rotten_movies_df = pd.read_csv('./data/rotten_tomatoes_movies.csv')

Extract release_year from Rotten Tomatoes original_release_date

In [15]:
# Take the text before the first '-' of original_release_date as the release year.
# NOTE(review): presumably the dates are formatted YYYY-MM-DD — confirm against the CSV.
rotten_movies_df["release_year"] = (
    rotten_movies_df["original_release_date"]
    .str.split('-')
    .str.get(0)
)

Add an identifier column (lowercased title plus release year) to both DataFrames for merging

In [16]:
# Build the same merge key on both frames: lowercased title, a space, then the
# release year cast to a string. Only the title column name differs per frame.
for frame, title_column in (
    (netflix_movies_df, "title"),
    (rotten_movies_df, "movie_title"),
):
    frame["title_and_release_year"] = (
        frame[title_column].str.lower() + " " + frame["release_year"].astype(str)
    )

Merge the two datasets on the shared identifier column, keeping only the movies that appear in both

In [17]:
# Inner-join the Netflix movies with the Rotten Tomatoes movies on the shared
# title/year key. Column names present in both frames get the suffixes below
# (Netflix -> "_x", Rotten Tomatoes -> "_y"), which is where the
# "release_year_x" column used by the plots comes from.
netflix_plus_rt_df = netflix_movies_df.merge(
    rotten_movies_df,
    on="title_and_release_year",
    how="inner",
    suffixes=("_x", "_y"),
)

# Summary statistics of the numeric columns of the merged frame.
netflix_plus_rt_df.describe()

Unnamed: 0,release_year_x,runtime,tomatometer_rating,tomatometer_count,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
count,1252.0,1232.0,1251.0,1251.0,1248.0,1248.0,1252.0,1252.0,1252.0
mean,2008.528754,107.604708,59.055156,102.459632,61.138622,466419.4,24.381789,64.58147,37.81869
std,11.077374,20.597857,28.775451,85.067835,20.222044,3310599.0,17.706812,71.982482,39.504686
min,1954.0,40.0,0.0,5.0,8.0,6.0,0.0,0.0,0.0
25%,2004.0,93.0,35.0,30.5,46.0,1461.5,8.0,13.0,7.0
50%,2012.0,103.0,63.0,80.0,63.0,26804.5,21.0,39.0,22.0
75%,2017.0,118.0,85.5,158.0,77.25,168564.8,39.0,89.0,59.0
max,2020.0,213.0,100.0,512.0,99.0,34679770.0,65.0,495.0,196.0


# Histogram - Netflix Catalogue of Movies by Release Year

Stacked by Rotten Tomatoes Tomatometer status
- ranges from Rotten to Certified Fresh

In [63]:
# Drop rows without a Tomatometer status, then order the rows descending by
# status and, within each status, descending by Netflix release year.
netflix_plus_rt_df = (
    netflix_plus_rt_df
    .dropna(subset="tomatometer_status")
    .sort_values(by=["tomatometer_status", "release_year_x"], ascending=False)
)

# Histogram of movies per release year, coloured by Tomatometer status.
px.histogram(
    data_frame=netflix_plus_rt_df,
    x="release_year_x",
    color="tomatometer_status",
    color_discrete_map={
        "Rotten": "hsl(6, 78%, 66%)",
        "Fresh": "hsl(48, 89%, 60%)",
        "Certified-Fresh": "hsl(145, 61%, 59%)",
    },
    nbins=70,
    labels={"release_year_x": "Release Year"},
    title="Movies Per Year with Rotten Tomatoes Rating",
)

In [43]:
# Keep only the rows that have a Tomatometer status.
netflix_plus_rt_df = netflix_plus_rt_df.dropna(subset="tomatometer_status")

# Count titles per (release year, Tomatometer status) pair.
# Grouping by release year alone and calling .count() on the whole frame
# replaces every column, including tomatometer_status, with a non-null count,
# so the colour column would be numeric and the discrete colour map below
# could never match. Grouping by both keys keeps the status as a label and
# yields one row, and therefore one stacked bar segment, per year and status.
df_grouped = (
    netflix_plus_rt_df
    .groupby(["release_year_x", "tomatometer_status"])["title"]
    .count()
    .reset_index()
)

# Stacked bar chart: x = release year, y = number of titles, one colour per status.
# NOTE(review): 'Test' looks like a placeholder title — confirm the intended wording.
px.bar(
    df_grouped,
    x='release_year_x',
    y='title',
    title='Test',
    color='tomatometer_status',
    barmode="stack",
    color_discrete_map={
        "Rotten": "red",
        "Fresh": "yellow",
        "Certified-Fresh": "green",
    },
    labels={
        "release_year_x": "Release Year",
        "title": "Count",
    },
)