In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the Netflix catalogue; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv(filepath_or_buffer='netflix.csv')

In [None]:
# Preview the first ten rows to check that the file was parsed as expected.
df.head(n=10)

In [None]:
# Print each column's dtype and non-null count to spot columns with missing values.
df.info()

In [None]:
# Summary statistics; with default arguments pandas reports on the numeric
# columns when the frame contains any.
df.describe()

Clean data: drop rows with missing values in the columns used by the analysis

In [None]:

# Keep only the rows that have a value in every column the analysis relies on.
df = df.dropna(
    how='any',
    subset=[
        'type',
        'release_year',
        'rating',
        'country',
        'duration',
    ],
)

# Number of titles per content type; consumed by the bar chart further down.
type_counts = df.loc[:, 'type'].value_counts()

In [None]:

# Titles per release year split by content type (rows: year, columns: type).
# fill_value=0 reports year/type combinations that have no titles as 0 instead
# of NaN, which also keeps the counts as integers. Ending the cell with the
# bare expression lets Jupyter render the table with its rich display instead
# of plain printed text.
df.groupby(['release_year', 'type']).size().unstack(fill_value=0)

Number of movies vs. TV shows

In [None]:

# Bar chart comparing how many titles fall under each content type.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(type_counts.index, type_counts.values, color=['pink', 'grey'])
ax.set_title('Number of Movies vs tv shows')
ax.set_xlabel('Type')
ax.set_ylabel('count')
fig.tight_layout()

plt.show()

Percentage of content by rating

In [None]:

# Number of titles per content rating, most frequent rating first.
rating_counts = df["rating"].value_counts()

# Wedge colours, applied in the same order as the ratings in rating_counts.
wedge_colors = [
    'pink', 'blue', 'coral', 'maroon', 'skyblue', 'green', 'teal',
    'red', 'grey', 'black', 'silver', 'orange', 'purple', 'brown',
]

fig, ax = plt.subplots(figsize=(6, 4))
ax.pie(
    rating_counts,
    labels=rating_counts.index,
    colors=wedge_colors,
    autopct='%1.1f%%',  # label every wedge with its share, one decimal place
    startangle=97,
)
ax.set_title('Precentage of content rating')
fig.tight_layout()

plt.show()

Distribution of movie duration

In [None]:

# Movies only, with the running time parsed into an integer column.
# The pattern accepts values of the form "<number> min"; anything else
# (missing or differently formatted durations) becomes NaN and is dropped
# instead of aborting the cell on the integer conversion.
movie_df = (
    df[df['type'] == 'Movie']
    .assign(
        duration_int=lambda movies: pd.to_numeric(
            movies['duration'].astype(str).str.extract(r'(\d+)\s*min', expand=False),
            errors='coerce',
        )
    )
    .dropna(subset=['duration_int'])
    .astype({'duration_int': int})
)

# 20-minute bins that span the observed durations. plt.hist ignores values
# that fall outside the bin edges, so a fixed 80-180 range would silently
# leave out shorter and longer movies. When every duration lies in [80, 180)
# the edges below are exactly 80, 100, ..., 180.
bin_width = 20
if movie_df.empty:
    # Nothing to measure; fall back to a fixed range so the empty chart still renders.
    bins = [80, 100, 120, 140, 160, 180]
else:
    shortest = int(movie_df['duration_int'].min())
    longest = int(movie_df['duration_int'].max())
    first_edge = (shortest // bin_width) * bin_width
    last_edge = (longest // bin_width + 1) * bin_width
    bins = list(range(first_edge, last_edge + 1, bin_width))

plt.figure(figsize=(8,8))
counts, _, _ = plt.hist(movie_df['duration_int'],bins=bins,color='purple',edgecolor='white')
plt.xticks(bins)
# Even ticks from 0 to 20 read well for small samples; for taller bars let
# matplotlib pick the ticks so the axis is labelled over its whole height.
if counts.max() <= 20:
    plt.yticks(list(range(0, 21, 2)))
plt.title('Distribution of movie duration')
plt.xlabel('duration')
plt.ylabel('number of movie')
plt.tight_layout()

plt.show()

Release year vs. number of shows

In [None]:

# Number of titles per release year, ordered chronologically for plotting.
release_counts = df["release_year"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(release_counts.index, release_counts.values, color='red')
ax.set_title('relese_year vs number of shows')
ax.set_xlabel('release_year')
ax.set_ylabel('number of shows')
fig.tight_layout()

plt.show()

Top ten countries by number of shows

In [None]:

# Ten countries with the most titles. A title whose country field lists
# several countries is counted once for each of them, rather than as a
# separate combined "A, B" category; single-country values are unaffected.
# NOTE(review): assumes multi-country values are comma-separated - confirm against the data.
country_counts = (
    df["country"]
    .str.split(",")
    .explode()
    .str.strip()
    .loc[lambda names: names != ""]  # discard empty pieces left by stray commas
    .value_counts()
    .head(10)
)
plt.figure(figsize=(8,6))
plt.barh(country_counts.index,country_counts.values,color = "teal")
plt.title('Ten countries by number of shows')
plt.xlabel('number of shows')
plt.ylabel('countries')
plt.tight_layout()

plt.show()

Comparison of TV shows and movies released over the years

In [None]:
# Titles per release year broken down by content type; year/type pairs with
# no titles are filled with 0 so both lines cover every year.
content_by_year = df.groupby(['release_year', 'type']).size().unstack().fillna(0)

# (column, line colour, panel title, y-axis label) for the left and right panels.
panels = [
    ("Movie", "grey", "Movies released per year", 'Number of movies'),
    ("TV Show", "orange", "TV show released per year", 'Number of TV show'),
]

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for axis, (column, line_color, panel_title, y_label) in zip(ax, panels):
    axis.plot(content_by_year.index, content_by_year[column], color=line_color)
    axis.set_title(panel_title)
    axis.set_xlabel('Year')
    axis.set_ylabel(y_label)

fig.suptitle("Comparision of TV show and movies released over Years")
plt.tight_layout()

plt.show()

# **Insights from Netflix Data Analysis**

## **Key Findings from the Netflix Dataset Analysis**

### **Data Overview**
The dataset contains information about Netflix shows and movies, including details like title, type (movie/TV show), director, cast, country, release year, rating, duration, genre, and description.

Initial data exploration shows **100 entries** with some missing values in columns like director, cast, and country.

### **Content Distribution**
The dataset shows a mix of movies and TV shows available on Netflix.

The bar plot visualization reveals the comparative count between movies and TV shows in the sample data.

### **Content Ratings**
The pie chart displays the percentage distribution of different content ratings (like TV-MA, PG-13, PG, etc.) in the dataset.

**TV-MA (Mature Audience)** content appears to be the most common rating in this sample.

### **Release Trends**
The scatter plot of titles per release year shows how Netflix's content library is distributed across different years.

More recent years (like 2021) have a higher concentration of content, reflecting Netflix's growing library.

### **Data Cleaning**
The analysis cleans the data by dropping rows that have missing values in any of the key columns: type, release year, rating, country, and duration.

### **Recommendations for Further Analysis**
- **Content Origin Analysis:** Explore which countries produce the most Netflix content.
- **Genre Popularity:** Analyze which genres are most common and how they trend over time.
- **Director/Cast Analysis:** Examine which directors and actors appear most frequently.
- **Duration Patterns:** Investigate differences in movie lengths and TV show seasons.
- **Temporal Trends:** More detailed analysis of how content production has changed year-over-year.

### **Data Limitations**
- The dataset is a sample (100 entries) and may not represent Netflix's complete library.
- Missing values in some columns limit certain analyses.
- The timeframe of the data isn't specified, which could affect trend analysis.

This analysis provides a foundation for understanding Netflix's content strategy and could be expanded with more comprehensive data and additional visualizations.
