In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to every figure drawn in this notebook.
# `sns.set` is an alias for `set_theme`, which is the preferred spelling.
sns.set_theme(style="whitegrid")


In [None]:
# Read the zipped CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='netflix_titles_nov_2019.csv.zip')

# Preview the first five rows as a quick sanity check on the load.
df.head(n=5)

In [None]:
# Column dtypes and non-null counts; info() prints its report directly.
df.info()

# A notebook cell only renders its *last* expression, so a bare
# df.describe(...) in the middle of the cell is computed and thrown away.
# display() (provided by the IPython kernel) renders it explicitly.
display(df.describe(include="all"))

# Missing values per column; as the final expression this renders on its own.
df.isnull().sum()


In [None]:
# Force release_year to a numeric dtype; values that cannot be parsed become
# NaN (errors='coerce') instead of raising.
df['release_year'] = pd.to_numeric(
    df['release_year'],
    errors='coerce',
)


In [None]:
# Extract the first run of digits from the free-text `duration` column and
# store it as a float (float so rows without any digits can hold NaN).
#
# * The pattern is a raw string: '\d' inside a normal string literal is an
#   invalid escape sequence and triggers a warning on current Python versions.
# * expand=False makes str.extract return a Series for the single capture
#   group rather than a one-column DataFrame, so the column assignment is
#   unambiguous.
#
# NOTE(review): the extracted number carries whatever unit the duration text
# used; rows with different units end up on the same numeric scale - confirm
# before comparing them directly.
df['duration_int'] = (
    df['duration']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

In [None]:
# Pairwise Pearson correlation between the two numeric columns; `corr` is
# reused by the heatmap cells further down.
corr = df.loc[:, ['release_year', 'duration_int']].corr(method='pearson')

# Show the 2x2 correlation table.
corr


In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap (Netflix Dataset)")
plt.show()


In [None]:
# Bar per distinct value of the `type` column, height = number of rows.
fig, ax = plt.subplots()
sns.countplot(data=df, x='type', ax=ax)
ax.set_title("Movies vs TV Shows on Netflix")
plt.show()


In [None]:
# Ten most frequent values of the `country` column.
# NOTE(review): value_counts() counts each cell's text as-is, so a cell that
# lists several countries would be counted as its own category - confirm
# whether the column needs splitting first.
top_countries = df['country'].value_counts().head(10)

ax = top_countries.plot.bar()
ax.set_title("Top 10 Countries on Netflix")
plt.show()


In [None]:
# Print the column dtypes and non-null counts again; by this point the frame
# also carries the derived `duration_int` column added in an earlier cell.
df.info()

In [None]:
# Parse the textual date once, then store both the parsed timestamp and its
# calendar year back on the frame.
added_on = pd.to_datetime(df['date_added'])
df['date_added'] = added_on
df['year_added'] = added_on.dt.year

# Rows per year, ordered chronologically by the year index, drawn as a line.
titles_per_year = df['year_added'].value_counts().sort_index()
ax = titles_per_year.plot.line()
ax.set_title("Content Added per Year")
plt.show()


In [None]:
# The chart is titled "Movie Duration Distribution", but `duration_int` was
# extracted for every row of the frame, so plotting the whole column mixes in
# the non-movie titles. Restrict to movie rows so the data matches the title.
# NOTE(review): assumes movie rows are marked with type == 'Movie' - confirm
# against df['type'].unique().
movie_durations = df.loc[df['type'] == 'Movie', 'duration_int'].dropna()

fig, ax = plt.subplots()
sns.histplot(movie_durations, bins=30, ax=ax)
ax.set_title("Movie Duration Distribution")
plt.show()


In [None]:
# Same correlation heatmap as the earlier cell, redrawn from the existing
# `corr` matrix (it is not recomputed here).
plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(data=corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap (Netflix Dataset)")
plt.show()



In [None]:
import collections

# `listed_in` holds the genres of a title as one ', '-separated string.
# Split each string into a list, flatten all lists (skipping missing values)
# and count how often every genre appears.
genres = df['listed_in'].str.split(', ')
flattened = [g for sublist in genres.dropna() for g in sublist]
genre_counts = collections.Counter(flattened)

# A Counter keeps first-seen order, so taking .head(10) of it directly shows
# the first ten genres encountered, not the ten most frequent ones. Sort by
# count (descending) before truncating so the chart matches its title.
top_genres = pd.Series(genre_counts).sort_values(ascending=False).head(10)

top_genres.plot(kind='bar')
plt.title("Top 10 Genres on Netflix")
plt.show()


In [None]:
# The chart is titled "Release Year vs Movie Duration", but `duration_int`
# exists for every row of the frame, so plotting all of `df` also scatters
# the non-movie titles. Keep only movie rows so the points match the title.
# NOTE(review): assumes movie rows are marked with type == 'Movie' - confirm
# against df['type'].unique().
movies_only = df[df['type'] == 'Movie']

fig, ax = plt.subplots()
sns.scatterplot(x='release_year', y='duration_int', data=movies_only, ax=ax)
ax.set_title("Release Year vs Movie Duration")
plt.show()


In [None]:
# Horizontal bars, one per rating, ordered from most to least frequent
# (value_counts() returns its index sorted by descending count).
rating_order = df['rating'].value_counts().index

fig, ax = plt.subplots()
sns.countplot(data=df, y='rating', order=rating_order, ax=ax)
ax.set_title("Content Rating Distribution")
plt.show()



In [None]:
# One box per `type` value showing the spread of `duration_int`.
# NOTE(review): both groups share a single numeric axis; if their duration
# text uses different units the boxes are not directly comparable - confirm.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='type', y='duration_int', ax=ax)
ax.set_title("Movie vs TV Show Duration Comparison")
plt.show()


In [None]:
# Grid of pairwise scatter plots (with per-column distributions on the
# diagonal) for the two numeric columns.
sns.pairplot(df[['release_year', 'duration_int']])

# End with plt.show() like every other plotting cell in this notebook, so the
# figure is rendered without the PairGrid object's repr appearing as output.
plt.show()
