# What has Netflix already produced and how has it been received?

#### Import Libraries

In [128]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import slugify as slugify

Create DataFrame from Netflix CSV, filtered to just movies

Create DataFrame from Rotten Tomatoes Movies CSV

In [103]:
# Netflix catalogue restricted to movies. The explicit .copy() makes the filtered
# frame independent of the full catalogue, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
netflix_movies_df = (
  pd.read_csv('./data/netflix-titles.csv', parse_dates=['date_added'])
  .loc[lambda titles: titles["type"] == 'Movie']
  .copy()
)

# Rotten Tomatoes movie data, loaded unfiltered.
rotten_movies_df = pd.read_csv('./data/rotten_tomatoes_movies.csv')

Add Unique identifier column (Title and first listed Director slugified) to both DataFrames for merging

In [104]:
def build_merge_key(titles, directors):
  """Return a slug of "<title> <first listed director>" for every row.

  titles: Series of movie titles.
  directors: Series of comma-separated director names; only the text before the
    first comma is used. It is cast with astype(str) before concatenation, so a
    missing value contributes whatever text that cast produces (presumably "nan").
  """
  first_director = directors.str.split(',').str[0].astype(str)
  return (titles + " " + first_director).apply(slugify.slugify)


# The same key is built on both frames so they can be merged on it.
netflix_movies_df["title_and_first_director"] = build_merge_key(
  netflix_movies_df["title"], netflix_movies_df["director"]
)
rotten_movies_df["title_and_first_director"] = build_merge_key(
  rotten_movies_df["movie_title"], rotten_movies_df["directors"]
)

Merge the above datasets with a left join: every Netflix movie is kept, and Rotten Tomatoes rows that don't match a Netflix movie are dropped

In [105]:
# Left join: keep every Netflix movie and attach Rotten Tomatoes fields where the
# slug key matches. Passing a string as `indicator` names the merge-source column
# directly, so no rename step is needed.
netflix_plus_rt_df = pd.merge(
  netflix_movies_df,
  rotten_movies_df,
  how="left",
  on="title_and_first_director",
  indicator="matched_to_rt",
)

# Turn the merge indicator ("left_only" / "both") into a plain boolean flag:
# True when a Rotten Tomatoes row was found for the Netflix movie. Comparing against
# "both" yields a real bool column instead of remapping categorical labels.
netflix_plus_rt_df["matched_to_rt"] = netflix_plus_rt_df["matched_to_rt"] == "both"

Look at the production companies in the merged data (the column comes from Rotten Tomatoes) to find out how in-house Netflix production is labelled, then filter to those titles

In [106]:
# Production-company labels used in the data for Netflix's in-house productions.
netflix_production_companies = ["Netflix", "Netflix Originals"]

# Keep only rows whose production company is one of those labels. Rows with a
# missing production company never match, exactly as with the explicit == tests.
netflix_oc_df = netflix_plus_rt_df[
  netflix_plus_rt_df["production_company"].isin(netflix_production_companies)
]
print(f"Netflix Originals: {len(netflix_oc_df.index)}")
netflix_oc_df.describe()

Netflix Originals: 274


Unnamed: 0,release_year,runtime,tomatometer_rating,tomatometer_count,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
count,274.0,203.0,271.0,271.0,251.0,251.0,274.0,274.0,274.0
mean,2017.985401,101.315271,71.077491,41.863469,60.227092,1068.976096,9.587591,31.222628,10.259124
std,1.286648,20.706079,27.606586,57.603946,21.009337,3119.838769,10.335117,51.8018,15.146005
min,2013.0,26.0,0.0,5.0,5.0,8.0,0.0,0.0,0.0
25%,2017.0,90.0,50.0,8.5,46.0,114.0,2.0,6.0,1.0
50%,2018.0,100.0,80.0,23.0,61.0,317.0,6.0,14.0,5.0
75%,2019.0,111.0,95.0,50.0,77.0,907.0,14.0,35.0,13.0
max,2020.0,209.0,100.0,439.0,100.0,40887.0,56.0,421.0,99.0


### Next Steps

Production Company data comes from RT - improving the merge may increase the size of this dataset.

For now, analysis can begin on these 274 movies

In [107]:
# Display the column labels of the Netflix Originals frame (Netflix and Rotten Tomatoes fields).
netflix_oc_df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'title_and_first_director', 'rotten_tomatoes_link', 'movie_title',
       'movie_info', 'critics_consensus', 'content_rating', 'genres',
       'directors', 'authors', 'actors', 'original_release_date',
       'streaming_release_date', 'runtime', 'production_company',
       'tomatometer_status', 'tomatometer_rating', 'tomatometer_count',
       'audience_status', 'audience_rating', 'audience_count',
       'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count',
       'tomatometer_rotten_critics_count', 'matched_to_rt'],
      dtype='object')

| Column                             | Useful? | Note                                                                                                      | Insight from Graph                 |
| ---------------------------------- | ------- | --------------------------------------------------------------------------------------------------------- | ---------------------------------- |
| 'production_company'               | No      | all currently are Netflix/Netflix Originals                                                               |                                    |
| 'show_id'                          | No      |                                                                                                           |                                    |
| 'type'                             | No      | All are 'Movie'                                                                                           |                                    |
| 'title'                            |         |                                                                                                           |                                    |
| 'movie_title'                      | No      | same as above                                                                                             |                                    |
| 'director'                         |         | directors frequently used/not-used? check against RT                                                      |                                    |
| 'cast'                             |         | strings of actors (any that NF lean on/underuse?)                                                         |                                    |
| 'country'                          |         | of production                                                                                             |                                    |
| 'date_added'                       |         |                                                                                                           |                                    |
| 'release_year_x'                   |         |                                                                                                           |                                    |
| 'release_year_y'                   | no      | same as above (as these are matched on slug)                                                              |                                    |
| 'rating'                           |         | group by guidance rating to see where they already skew                                                   | Most are TV-MA                     |
| 'content_rating'                   |         |                                                                                                           | Most are not rated                 |
| 'duration'                         |         | Has a string at the end so not helpful                                                                    |                                    |
| 'runtime'                          |         |                                                                                                           |                                    |
| 'listed_in'                        |         | NF categories                                                                                             |                                    |
| 'description'                      | no      |                                                                                                           |                                    |
| 'movie_info'                       | no      | as description                                                                                            |                                    |
| 'title_and_first_director'         | no      | slug                                                                                                      |                                    |
| 'rotten_tomatoes_link'             | no      |                                                                                                           |                                    |
| 'critics_consensus'                | no      | general text                                                                                              |                                    |
| 'genres'                           |         | Need to split genres into individual genres                                                               |                                    |
| 'directors'                        |         | Need to split directors into individual directors                                                         |                                    |
| 'authors'                          |         | Need to split authors into individual authors                                                             |                                    |
| 'actors'                           |         | Need to split actors into individual actors                                                               |                                    |
| 'original_release_date'            |         |                                                                                                           |                                    |
| 'streaming_release_date'           |         |                                                                                                           |                                    |
| 'tomatometer_status'               | Maybe   | String, might be useful for broad stroke when tomatometer rating not used                                 |                                    |
| 'tomatometer_rating'               |         |                                                                                                           |                                    |
| 'tomatometer_count'                |         |                                                                                                           | Most Netflix OC is not rated on RT |
| 'audience_status'                  | Maybe   | String, might be useful for broad stroke when audience rating not used                                    |                                    |
| 'audience_rating'                  |         |                                                                                                           |                                    |
| 'audience_count'                   |         | Is this a count of people who have voted on RT rather than a viewing figure? Still need to have viewed it | Most Netflix OC is not rated on RT |
| 'tomatometer_top_critics_count'    |         |                                                                                                           | Most Netflix OC is not rated on RT |
| 'tomatometer_fresh_critics_count'  |         |                                                                                                           | Most Netflix OC is not rated on RT |
| 'tomatometer_rotten_critics_count' |         |                                                                                                           | Most Netflix OC is not rated on RT |
| 'matched_to_rt'                    | No      | All are already matched                                                                                   |                                    |

In [108]:
# Columns selected for a first look. This is a plain list: the original had a
# trailing comma after the closing bracket, which wrapped the list in a
# one-element tuple and forced every use to index with [0].
useful_columns = [
  'date_added',
  'release_year',
  'rating',
  'duration',
  'content_rating',
  'genres',
  'directors',
  'authors',
  'actors',
  'original_release_date',
  'streaming_release_date',
  'runtime',
  'tomatometer_status',
  'tomatometer_rating',
  'tomatometer_count',
  'audience_status',
  'audience_rating',
  'audience_count',
  'tomatometer_top_critics_count',
  'tomatometer_fresh_critics_count',
  'tomatometer_rotten_critics_count',
]

# Quick check of the first column that will be plotted.
print(useful_columns[0])

# One histogram per selected column, drawn from the Netflix Originals frame.
for column_name in useful_columns:
  px.histogram(
      netflix_oc_df,
      column_name,
      title=f"Count of {column_name}",
      width=1000,
    ).show()
  

date_added


Some of the Netflix Originals have an audience_count of 19000+, which are they?

Turns out there are two: [Bright (2017)](https://www.imdb.com/title/tt5519340/?ref_=nv_sr_srsg_0) with 19,445 and Dave Chappelle: Sticks & Stones (2019) with 40,887

In [109]:
# Rows of the Netflix Originals frame whose audience_count exceeds 19000.
netflix_oc_df.loc[netflix_oc_df["audience_count"].gt(19000)]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,matched_to_rt
2290,s3560,Movie,Dave Chappelle: Sticks & Stones,Stan Lathan,Dave Chappelle,United States,2019-08-26,2019,TV-MA,66 min,...,Rotten,35.0,17.0,Upright,99.0,40887.0,3.0,6.0,11.0,True
3289,s5114,Movie,Bright,David Ayer,"Will Smith, Joel Edgerton, Noomi Rapace, Lucy ...",United States,2017-12-22,2017,TV-MA,118 min,...,Rotten,28.0,109.0,Upright,83.0,19445.0,22.0,30.0,79.0,True


### Taking a look at the Actors that Appear in Netflix Originals

In [110]:
# Split the comma-separated "actors" string into one column per listed actor.
expand_actors = netflix_oc_df["actors"].str.split(",", expand=True)

# Reshape to long form: one row per (show_id, actor) pair.
# value_vars comes from the columns the split actually produced, not a hard-coded
# range(62): a fixed width raises KeyError when fewer columns exist and silently
# drops actors when more exist.
actor_count = (
  pd.concat([netflix_oc_df[["show_id"]], expand_actors], axis=1)
  .melt(id_vars=["show_id"], value_vars=list(expand_actors.columns), value_name="Actor")
  .dropna(subset=["Actor"])        # films with fewer actors leave empty slots
  .drop(columns="variable")        # the slot position is not needed
  .assign(Actor=lambda pairs: pairs["Actor"].str.strip())
)

# Number of rows (film appearances) for each actor, repeated on every one of their rows.
actor_count["Appearances"] = actor_count.groupby("Actor")["Actor"].transform("count")

In [111]:
# Appearances per actor, most frequent first.
# The title is a fixed string: the original interpolated `column_name`, a leftover
# loop variable from the column-histogram cell, which labelled this chart with an
# unrelated column.
px.histogram(
    actor_count,
    x="Actor",
    title="Count of Appearances per Actor",
    width=1000,
).update_xaxes(
  categoryorder="total descending"
).show()
  

### Investigating the Genres of current Netflix OC

Expand Genres into columns then melt back together

In [112]:
# Re-display the column labels of netflix_oc_df for reference in this section.
netflix_oc_df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'title_and_first_director', 'rotten_tomatoes_link', 'movie_title',
       'movie_info', 'critics_consensus', 'content_rating', 'genres',
       'directors', 'authors', 'actors', 'original_release_date',
       'streaming_release_date', 'runtime', 'production_company',
       'tomatometer_status', 'tomatometer_rating', 'tomatometer_count',
       'audience_status', 'audience_rating', 'audience_count',
       'tomatometer_top_critics_count', 'tomatometer_fresh_critics_count',
       'tomatometer_rotten_critics_count', 'matched_to_rt'],
      dtype='object')

In [113]:
# Split the comma-separated "genres" string into one column per listed genre.
expand_genres = netflix_oc_df["genres"].str.split(",", expand=True)

# Reshape to long form: one row per (show_id, genre) pair.
# value_vars follows the columns the split actually produced, so a film with more
# genres than expected is not silently truncated (the original used range(4)).
nf_genre_count = (
  pd.concat([netflix_oc_df[["show_id"]], expand_genres], axis=1)
  .melt(id_vars=["show_id"], value_vars=list(expand_genres.columns), value_name="genre")
  .dropna(subset=["genre"])        # films with fewer genres leave empty slots
  .drop(columns="variable")        # the slot position is not needed
  .assign(genre=lambda pairs: pairs["genre"].str.strip())
)

# Number of rows (films) carrying each genre, repeated on every row of that genre.
nf_genre_count["films_of_genre"] = nf_genre_count.groupby("genre")["genre"].transform("count")

Get list of unique genres (useful for consistent sorting later)

In [114]:
# Unique genres in order of first appearance in nf_genre_count.
categories = nf_genre_count["genre"].unique().tolist()

# One fixed colour per genre, so a genre can keep the same colour across charts.
# The original map held placeholder keys ('Fri', 'Sat', 'Sun') and the value 'barry',
# which is not a CSS colour name; it is now built from the genres themselves.
# The palette is reused cyclically when there are more genres than colours.
genre_palette = px.colors.qualitative.Plotly
category_colours = {
  genre: genre_palette[position % len(genre_palette)]
  for position, genre in enumerate(categories)
}

In [121]:
# Films per genre among Netflix Originals, most common genre first.
# A title is set so this chart can be told apart from the Rotten Tomatoes
# histogram, which is otherwise built identically.
px.histogram(
  nf_genre_count,
  x="genre",
  title="Count of Netflix OC by Genre",
  width=800,
  labels={
    "genre": "Genre",
  }
).update_xaxes(
  categoryorder="total descending"
)

# TODO: add trace?

Many of the 'Comedy' genre appear to be stand-up specials (from browsing the data - not visualised)
- cheap to produce?
- would be good to separate out exactly how many are specials vs. feature films
  - are they worth making? There's clearly already a lot of them

In [116]:
# Share of genre tags among Netflix Originals, labelled inside each slice.
# NOTE(review): color_discrete_map is documented as keyed on the `color` column and
# none is passed here, so the map may have no effect on this chart; confirm.
pie_fig = px.pie(
  data_frame=nf_genre_count,
  names="genre",
  title="Breakdown of Netflix OC by Genre",
  width=600,
  color_discrete_map=category_colours,
).update_traces(textposition='inside', textinfo='percent+label')

pie_fig.show()

Compare to Rotten Tomatoes Dataset Overall (including NF)

In [132]:
# Split the comma-separated "genres" string into one column per listed genre.
expand_genres = rotten_movies_df["genres"].str.split(",", expand=True)

# Reshape to long form: one row per (rotten_tomatoes_link, genre) pair.
# value_vars follows the columns the split actually produced; the original
# range(4) silently discarded any genre listed after a film's fourth.
rt_genre_count = (
  pd.concat([rotten_movies_df[["rotten_tomatoes_link"]], expand_genres], axis=1)
  .melt(
    id_vars=["rotten_tomatoes_link"],
    value_vars=list(expand_genres.columns),
    value_name="genre",
  )
  .dropna(subset=["genre"])        # films with fewer genres leave empty slots
  .drop(columns="variable")        # the slot position is not needed
  .assign(genre=lambda pairs: pairs["genre"].str.strip())
)

# Number of rows (films) carrying each genre, repeated on every row of that genre.
rt_genre_count["films_of_genre"] = rt_genre_count.groupby("genre")["genre"].transform("count")

# Share of genre tags across the whole Rotten Tomatoes set, labelled inside each slice.
# NOTE(review): color_discrete_map is documented as keyed on the `color` column and
# none is passed here, so the map may have no effect on this chart; confirm.
pie_fig = px.pie(
  rt_genre_count,
  names="genre",
  title="Breakdown of Rotten Tomatoes Movies by Genre",
  width=600,
  color_discrete_map=category_colours
)
pie_fig.update_traces(
  textposition='inside',
  textinfo='percent+label'
)

pie_fig.show()

Would this be better as a grouped bar chart?

Descending totals for RT content and then Netflix aligned by genre

In [None]:
# Films per genre across the Rotten Tomatoes set, most common genre first.
# A title is set so this chart can be told apart from the Netflix histogram,
# which is otherwise built identically.
px.histogram(
  rt_genre_count,
  x="genre",
  title="Count of Rotten Tomatoes Movies by Genre",
  width=800,
  labels={
    "genre": "Genre",
  }
).update_xaxes(
  categoryorder="total descending"
)

# TODO: add trace?

In [153]:
# Collapse each long-form frame to a single row per genre (the first one seen) and
# drop the per-film identifier, leaving the genre and its films_of_genre count.
rt_genre_with_count = (
  rt_genre_count
  .drop_duplicates(subset="genre", keep='first')
  .drop(columns="rotten_tomatoes_link")
)
nf_genre_with_count = (
  nf_genre_count
  .drop_duplicates(subset="genre", keep='first')
  .drop(columns="show_id")
)

# Empty grouped-bar figure; the two sources are added as separate traces below.
fig = px.bar(
   barmode='group',
   title="Movies By Genre"
)
fig.update_xaxes(categoryorder="total descending")

# Raw film counts per genre, one trace per data source.
for source_name, genre_totals in (
  ("Rotten Tomatoes", rt_genre_with_count),
  ("Netflix", nf_genre_with_count),
):
  fig.add_trace(
    go.Bar(
      x=genre_totals["genre"],
      y=genre_totals["films_of_genre"],
      name=source_name,
    )
  )

fig.show()

#### This needs to be normalised

Need some sort of normalisation between these

Find the percentage of films within each set?


In [169]:
# Denominators for the normalisation.
# NOTE(review): the *_genre_count frames hold one row per (film, genre) pair, so these
# totals count genre tags rather than distinct films; confirm that is intended.
rt_total_films = rt_genre_count.shape[0]
print(rt_total_films)
rt_genre_with_count = (
  rt_genre_count
  .drop_duplicates(subset="genre", keep='first')
  .drop(columns="rotten_tomatoes_link")
  .assign(percentage_of_total=lambda totals: totals["films_of_genre"] / rt_total_films * 100)
)

nf_total_films = nf_genre_count.shape[0]
print(nf_total_films)
nf_genre_with_count = (
  nf_genre_count
  .drop_duplicates(subset="genre", keep='first')
  .drop(columns="show_id")
  .assign(percentage_of_total=lambda totals: totals["films_of_genre"] / nf_total_films * 100)
)

# Horizontal grouped bars: each genre's percentage within its own data source.
fig = px.bar(
  orientation="h",
  barmode='group',
  title="Movies By Genre"
)
fig.update_yaxes(categoryorder="total descending")

for source_name, genre_shares in (
  ("Rotten Tomatoes", rt_genre_with_count),
  ("Netflix", nf_genre_with_count),
):
  fig.add_trace(
    go.Bar(
      y=genre_shares["genre"],
      x=genre_shares["percentage_of_total"],
      name=source_name,
      orientation="h",
    )
  )

fig.show()

38902
408


In [196]:

# Order the Rotten Tomatoes genres by their share. The sorted frame is reassigned
# rather than sorted in place, so re-running this cell never mutates an object
# that an earlier cell produced and displayed.
rt_genre_with_count = rt_genre_with_count.sort_values("percentage_of_total", ascending=True)

# Mirrored ("butterfly") bar chart: Rotten Tomatoes to the left of zero, Netflix to the right.
fig = go.Figure()

# Rotten Tomatoes shares are negated so their bars extend leftwards. The true
# positive value is passed as customdata and shown in the hover text; otherwise
# the tooltip would report a negative percentage.
fig.add_trace(
  go.Bar(
    y=rt_genre_with_count["genre"],
    x=rt_genre_with_count["percentage_of_total"] * -1,
    customdata=rt_genre_with_count["percentage_of_total"],
    hovertemplate="%{y}: %{customdata:.1f}%<extra>Rotten Tomatoes</extra>",
    name="Rotten Tomatoes",
    orientation='h'
  )
)

# Netflix shares are plotted as-is, extending rightwards.
fig.add_trace(
  go.Bar(
    y=nf_genre_with_count["genre"],
    x=nf_genre_with_count["percentage_of_total"],
    hovertemplate="%{y}: %{x:.1f}%<extra>Netflix</extra>",
    name="Netflix",
    orientation='h'
  )
)

# 'relative' bar mode stacks negative and positive values on opposite sides of zero.
# Tick labels are written without a sign so both halves read as positive percentages.
fig.update_layout(
  title='Genre Distribution of Movies',
  height=700,
  title_font_size=22,
  barmode='relative',
  bargap=0.0,
  bargroupgap=0,
  xaxis=dict(
    tickvals=[-30, -20, -10, 0, 10, 20, 30],
    ticktext=['30%', '20%', '10%', '0%', '10%', '20%', '30%'],
    title='Share of Catalogue (%)',
    title_font_size=20
  ),
  yaxis=dict(
    title_font_size=18
  )
)

fig.show()