In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data Import

In [3]:
# Read the Netflix titles CSV from the working directory, decoding it as ISO-8859-1.
df = pd.read_csv(
    'netflix_titles.csv',
    encoding='ISO-8859-1',
)
# Preview the first rows as a quick check of the load.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/netflix-movies-and-tv-shows/netflix_titles.csv'

## Data Cleaning

In [None]:
# Keep every row but only the first ten columns (selected by position), and
# rebind `df` to the trimmed frame.
df = df.iloc[:, 0:10]
# Preview the trimmed frame.
df.head()

In [None]:
# (rows, columns) of `df` at this point, i.e. before duplicates are dropped.
df.shape

In [None]:
# Number of missing values in each column of `df`.
df.isna().sum()

In [None]:
# Remove fully duplicated rows.
# The result is rebound to `df` instead of using `inplace=True`: `df` is itself a
# positional selection of the loaded frame, and mutating such a selection in place
# can raise SettingWithCopyWarning. Rebinding yields the same rows without that.
df = df.drop_duplicates()

In [None]:
# (rows, columns) of `df` after `drop_duplicates`; compare with the earlier shape
# to see whether any rows were removed.
df.shape

The dataset does not have duplicates, but there are some missing values. We won't be using the columns with many missing values, so this won't be a problem.

## Exploratory Data Analysis

In [None]:
# Bar chart of how many rows fall under each value of the `type` column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='type', palette='colorblind', ax=ax)
ax.set_title('Number of Movies and TV Shows')
plt.show()

In [None]:
# Count rows per distinct `country` value and keep the five most frequent.
# NOTE(review): presumably some rows list several comma-separated countries in a
# single string; each such combination is counted as its own value here — confirm
# against the data before interpreting the ranking.
country_counts = df['country'].value_counts()
top_5_countries = country_counts.nlargest(5)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_5_countries.index, y=top_5_countries.values, palette='colorblind')
# Label both axes so the figure can be read without the surrounding code.
plt.xlabel('Country')
plt.ylabel('Number of Movies and TV Shows Produced')
plt.title('Top 5 countries by Number of shows (both Movies and TV Shows) produced')
plt.show()

In [None]:
# Plot for Movies
# Rows of `df` whose `type` is 'Movie'.
# `.copy()` makes `movies_df` an independent frame: a later cell writes to its
# `duration` column, and writing to a bare filtered selection of `df` triggers
# pandas' SettingWithCopyWarning.
movies_df = df[df['type'] == 'Movie'].copy()

# Five most frequent `country` values among movies.
top_5_movies = movies_df['country'].value_counts().nlargest(5)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_5_movies.index, y=top_5_movies.values, palette='colorblind')
plt.xlabel('Country')
plt.ylabel('Number of Movies Produced')
plt.title('Top 5 Countries by Number of Movies Produced')
plt.show()

In [None]:
# Plot for TV Shows
# Rows of `df` whose `type` is 'TV Show', selected through a boolean mask.
is_tv_show = df['type'] == 'TV Show'
tv_shows_df = df[is_tv_show]

# Five most frequent `country` values among TV shows.
tv_country_counts = tv_shows_df['country'].value_counts()
top_5_tv_shows = tv_country_counts.nlargest(5)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_5_tv_shows.index,
    y=top_5_tv_shows.values,
    palette='colorblind',
    ax=ax,
)
ax.set_xlabel('Country')
ax.set_ylabel('Number of TV Shows Produced')
ax.set_title('Top 5 Countries by Number of TV Shows Produced')
plt.show()

In [None]:
# Distribution of `release_year` over all rows of `df`.
# `df` is not filtered by `type` here, so the histogram covers Movies and TV Shows
# alike; the title and axis labels say so explicitly.
plt.figure(figsize=(10,6))
plt.hist(df['release_year'], bins=50, color='#0072B2', edgecolor='white')
plt.xlabel('Release Year')
plt.ylabel('Number of Titles')
plt.title('Release year of Movies and TV Shows')
plt.show()

In [None]:
# Column dtypes and non-null counts of the full frame.
df.info()

# Extract the first run of digits from each movie's `duration` value and store it
# as a numeric column; values with no digits become NaN (`errors='coerce'`).
# The column is replaced through `assign`, which returns a new frame that is then
# rebound to `movies_df`, instead of `movies_df[...] = ...` / `.loc[:, ...] = ...`:
# `movies_df` comes from filtering `df`, so item assignment can raise
# SettingWithCopyWarning, and a full-column `.loc` write sets values in place,
# which can leave the column with its original object dtype.
duration_digits = movies_df['duration'].astype(str).str.extract(r'(\d+)')[0]
movies_df = movies_df.assign(duration=pd.to_numeric(duration_digits, errors='coerce'))

movies_df.head()

In [None]:
# Bar chart of how many rows carry each value of the `rating` column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='rating', palette='colorblind', ax=ax)
ax.set_title('Distribution of Rating')
plt.show()

In [None]:
# Histogram of movie durations.
# The column is coerced to a numeric dtype and missing values are dropped before
# plotting, so the chart does not depend on how `duration` happens to be stored
# (e.g. numbers held in an object column) and is not affected by NaN entries.
movie_durations = pd.to_numeric(movies_df['duration'], errors='coerce').dropna()

plt.figure(figsize=(10,6))
plt.hist(movie_durations, bins=50, color='#E69F00', edgecolor='white')
# NOTE(review): the unit is presumably minutes — add it to the label once confirmed.
plt.xlabel('Duration')
plt.ylabel('Number of Movies')
plt.title('Duration distribution of movies')
plt.show()