## Netflix Titles (Movie and TV Show) Dataset

In [None]:
# necessary imports
import git
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Load the dataset. The CSV is addressed relative to the git working-tree
# root (found by searching upwards from the current directory), so the
# notebook does not depend on which subdirectory it is run from.
repo = git.Repo(".", search_parent_directories=True)
repo_path = repo.working_tree_dir
data_path = f"{repo_path}/data/netflix_titles.csv"

df = pd.read_csv(data_path)
df.head()

### How does the number of titles change over the years?

In [None]:
# Histogram of titles per release year, with one bar group per title type.
fig = px.histogram(
    data_frame=df,
    x="release_year",
    color="type",
    barmode="group",
)
fig.show()

### Busiest release months (by `date_added`)?

In [None]:
# find the busiest release months
# format="mixed" lets pandas infer the format of each value individually;
# errors="coerce" turns anything it cannot parse into NaT instead of raising.
df["date_added"] = pd.to_datetime(df["date_added"], format="mixed", errors="coerce")
df["month_added"] = df["date_added"].dt.month
df["month_name_added"] = df["date_added"].dt.month_name()

# Month names are plain categories to plotly, so by default the bars follow
# the order in which the names appear in the data. Pin the calendar order
# explicitly. The names match the English output of dt.month_name() with its
# default locale.
month_order = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# plot with plotly
fig = px.histogram(
    df,
    x="month_name_added",
    color="type",
    barmode="group",
    category_orders={"month_name_added": month_order},
)
fig.show()

### Genres with the most titles?

In [None]:
# Each "listed_in" value is split on ", " into one column per label; stacking
# those columns yields a single Series with one genre label per entry.
genre_labels = df["listed_in"].str.split(", ", expand=True).stack()

# Tally the labels and shape the result as a two-column frame for plotting.
genre_counts = genre_labels.value_counts()
df_genres = genre_counts.rename_axis("genre").reset_index(name="count")

# Bar chart of the number of titles per genre label.
fig = px.bar(data_frame=df_genres, x="genre", y="count")
fig.show()

In [None]:
# Disabled alternative: per-genre histogram split by title type.
# NOTE(review): if this is re-enabled as written, `df = df.explode('genres')`
# rebinds df to one row per (title, genre) pair, so every later cell that
# counts rows of df (e.g. the rating histogram) would count a title once per
# genre. Assign the exploded frame to a separate variable instead.
# df['genres'] = df['listed_in'].str.split(', ')
# # now we need to explode the genres column
# df = df.explode('genres')

# # plot with plotly, sort by value counts
# fig = px.histogram(df, x='genres', color='type', barmode='group')
# fig.show()

In [None]:
# Quick look at the first two rows of the frame.
df.head(n=2)

### Rating Analysis

In [None]:
# Histogram of how many titles fall under each value of the "rating" column.
fig = px.histogram(
    data_frame=df,
    x="rating",
    barmode="group",
)
fig.show()