In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import wordcloud

# Use seaborn's 'darkgrid' style for the plots drawn in this notebook.
sns.set_style('darkgrid')

In [None]:
# Clone the datasets repository with git.
# NOTE(review): the CSV is later read as 'netflix_titles.csv' from the working directory,
# not from the cloned folder — confirm where the file actually lives.
!git clone https://github.com/HarshvardhanSingh-13/Datasets.git

In [None]:
# Load the Netflix titles CSV from the current working directory into `df`.
df = pd.read_csv('netflix_titles.csv')


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()



In [None]:
# Number of titles per value of the 'type' column.
df['type'].value_counts()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics as produced by DataFrame.describe().
df.describe()

In [None]:
# Handling missing values for director and cast

In [None]:
# Count missing values in each column.
df.isna().sum()


In [None]:
# Missing 'director' / 'cast' entries get the placeholder string 'unknown'.
df[['director', 'cast']] = df[['director', 'cast']].fillna('unknown')


In [None]:
# Re-count missing values per column after filling 'director' and 'cast'.
df.isna().sum()

In [None]:
 # Number of titles per raw 'country' string.
 df['country'].value_counts()

In [None]:
# Fill missing 'country' values with the column's mode (first entry returned by mode()).
mode_country = df['country'].mode().iloc[0]
df['country'] = df['country'].fillna(value=mode_country)


In [None]:
# Re-count missing values per column after filling 'country'.
df.isna().sum()

In [None]:
# Drop the rows that lack 'date_added' or 'rating'; per the author's note these are
# few compared with the size of the dataset.
df.dropna(axis='index', subset=['date_added', 'rating'], inplace=True)

In [None]:
# Re-count missing values per column after dropping rows without 'date_added' / 'rating'.
df.isna().sum()

In [None]:
# Convert date_added to datetime objects

In [None]:
# Parse 'date_added' into datetimes. format='mixed' has pandas infer the format of each
# entry individually; dayfirst=False reads ambiguous dates month-first.
df['date_added'] = pd.to_datetime(df['date_added'],format='mixed',dayfirst=False)


In [None]:
# Preview the frame; 'date_added' should now display as parsed dates.
df.head()

In [None]:
# Re-check dtypes and non-null counts after the 'date_added' conversion.
df.info()

In [None]:
# Derive the calendar year and month each title was added from the parsed 'date_added'.
df['year_added'], df['month_added'] = (
    df['date_added'].dt.year,
    df['date_added'].dt.month,
)


In [None]:
# Preview the frame with the new 'year_added' and 'month_added' columns.
df.head()

In [None]:
# EDA and visualization
# Question: what is the distribution of content types?
# The index of value_counts() holds the distinct 'type' labels.
df['type'].value_counts().index

In [None]:
# Pie chart of the share of each content type.
type_count = df['type'].value_counts()

plt.figure(figsize=(8, 6))
plt.pie(
    x=type_count,
    labels=type_count.index,
    colors=['#e60023', '#221f1f'],
    autopct='%1.1f%%',
    startangle=140,
)
plt.gca().set(title='Proportion of Movies vs. TV Shows', ylabel='')
plt.show()

In [None]:
# How has content been added over time?

In [None]:
# Titles added per (year_added, type). unstack() moves each content type into its own
# column; year/type combinations with no titles come out as NaN and are set to 0.
content_over_time = df.groupby(['year_added', 'type']).size().unstack().fillna(0)

# Create the Axes explicitly and hand it to DataFrame.plot. Without `ax`, pandas opens a
# figure of its own, so a separate plt.figure() call would be left behind as an empty figure.
fig, ax = plt.subplots(figsize=(12, 6))
content_over_time.plot(kind='line', marker='o', ax=ax)
ax.set_title('Content Added to Netflix Over the Years (by Type)')
ax.set_xlabel('Year Added')
ax.set_ylabel('Number of Titles Added')
ax.legend(title='Content Type')
ax.grid(True)
plt.show()


In [None]:
# What are the most popular genres?

In [None]:
# Frequency of each raw 'listed_in' string (before it is split into individual genres).
df['listed_in'].value_counts()

In [None]:
# One row per (title, genre): split the ', '-separated 'listed_in' string into a list
# and explode that list into separate rows.
genres = (
    df
    .assign(genre=lambda frame: frame['listed_in'].str.split(', '))
    .explode('genre')
)

In [None]:
# Display the exploded frame (one row per title/genre pair).
genres


In [None]:
# Titles per genre as a two-column frame: 'genre' and 'count'.
top_genres_counts = (
    genres['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

In [None]:
# Display the per-genre counts table.
top_genres_counts 

In [None]:
# Horizontal bar chart of the first 15 rows of the per-genre counts table.
top_genres_counts_plot = top_genres_counts.iloc[:15]

plt.figure(figsize=(12, 6))
sns.barplot(
    data=top_genres_counts_plot,
    x='count',
    y='genre',
    hue='genre',
    legend=False,
    palette='mako',
).set(title='Top 15 Genres on Netflix', xlabel='Count', ylabel='Genre')
plt.show()


In [None]:
# What is the distribution of content duration?


In [None]:
# Independent copies of the movie rows and the TV-show rows.
movies_df = df.loc[df['type'].eq('Movie')].copy()
tv_shows_df = df.loc[df['type'].eq('TV Show')].copy()

In [None]:
# 'duration' holds text such as "<n> min" for movies and "<n> Season(s)" for TV shows.
# Extract the leading digits and convert with pd.to_numeric(errors='coerce'): a missing or
# malformed duration then becomes NaN instead of making .astype(int) raise and abort the cell.

# Movies: runtime in minutes
movies_df['duration_min'] = pd.to_numeric(
    movies_df['duration'].str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# TV shows: number of seasons
tv_shows_df['seasons'] = pd.to_numeric(
    tv_shows_df['duration'].str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

In [None]:
# Two panels side by side: movie runtimes and TV-show season counts.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# First panel: histogram of movie runtimes with a KDE overlay.
sns.histplot(
    data=movies_df,
    x='duration_min',
    bins=50,
    kde=True,
    color='skyblue',
    ax=axes[0],
)
axes[0].set_title('Movie Duration Distribution (minutes)')

# Second panel: number of shows per season count, bars in value_counts() order.
sns.countplot(
    data=tv_shows_df,
    x='seasons',
    hue='seasons',
    legend=False,
    palette='rocket',
    order=tv_shows_df['seasons'].value_counts().index,
    ax=axes[1],
)
axes[1].set_title('TV Show Season Distribution')

plt.show()

In [None]:
# geographical analysis 

In [None]:
# One row per (title, country). Multi-country entries are comma-separated, so split on ','
# and strip the whitespace around every piece; otherwise 'India' and ' India' would be
# counted as two different countries. Empty pieces (from a leading or trailing comma) are dropped.
countries = df.assign(country=df['country'].str.split(',')).explode('country')
countries['country'] = countries['country'].str.strip()
countries = countries[countries['country'] != '']

In [None]:
# Titles per country (all countries, not truncated) as a frame with columns 'country' and 'count'.
top_countries_counts = (
    countries['country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

In [None]:
# Display the per-country counts table.
top_countries_counts

In [None]:
# Count of titles per rating, bars in value_counts() order.
plt.figure(figsize=(12, 8))
sns.countplot(
    data=df,
    x='rating',
    hue='rating',
    legend=False,
    palette='crest',
    order=df['rating'].value_counts().index,
).set(
    title='Distribution of Content Ratings on Netflix',
    xlabel='Rating',
    ylabel='Count',
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Quick look at the first two rows.
df.head(2)

In [None]:
# Age of a title when it was added: year added minus release year.
df['age_on_netflix'] = df['year_added'].sub(df['release_year'])

# Keep only the rows whose computed age is not negative.
content_age = df.loc[df['age_on_netflix'].ge(0)]

plt.figure(figsize=(12, 6))
sns.histplot(
    data=content_age,
    x='age_on_netflix',
    bins=50,
    kde=True,
).set(
    title='Distribution of Content Age When Added to Netflix',
    xlabel='Content Age (Years)',
    ylabel='Number of Titles',
)
plt.show()

In [None]:
# Movie runtime spread for the five genres with the most movies.
# The ranking is computed on movie rows only: ranking over all content first and then
# filtering to movies lets TV-only genres occupy top-5 slots and silently vanish from the plot.
movie_genres = genres[genres['type'] == 'Movie']
top_genres = movie_genres['genre'].value_counts().index[:5]
genres_movies = movie_genres[movie_genres['genre'].isin(top_genres)].copy()

# Extract the leading digits of 'duration' ("<n> min"); errors='coerce' turns a missing or
# malformed value into NaN instead of raising the way .astype(int) would.
genres_movies['duration_min'] = pd.to_numeric(
    genres_movies['duration'].str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

plt.figure(figsize=(15, 8))
sns.boxplot(data=genres_movies, x='genre', y='duration_min', palette='pastel', hue='genre', legend=False)
plt.title('Movie Duration by Top Genres')
plt.xlabel('Genre')
plt.ylabel('Duration (minutes)')
plt.xticks(rotation=45)
plt.show()

In [None]:
from wordcloud import WordCloud

In [None]:
# Combine all descriptions into a single string. Missing descriptions are skipped and
# values are cast to str, because str.join raises TypeError on any non-string item.
text = ' '.join(df['description'].dropna().astype(str))

# Build the word cloud image from the combined text.
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(text)

# Show the image without axes.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Common Words in Netflix Content Descriptions', fontsize=20)
plt.show()

In [55]:
# Narrative takeaway for the word cloud, written as a bare string literal so the cell echoes it.
# NOTE(review): this reads as prose — consider moving it to a markdown cell.
"""insight: The word cloud highlights common themes and subjects. Words like "life," "family," "love," "young," "friends," and "world" are prominent, suggesting that much of the content revolves around human relationships and personal journeys. Action-oriented words like "find," "secret," and "new" also appear frequently"""

'insight: The word cloud highlights common themes and subjects. Words like "life," "family," "love," "young," "friends," and "world" are prominent, suggesting that much of the content revolves around human relationships and personal journeys. Action-oriented words like "find," "secret," and "new" also appear frequently'

In [56]:
cd C:\Users\HP\21 days project\Day 2


C:\Users\HP\21 days project\Day 2


In [65]:
# Initialise the repository, stage README.md, commit, rename the branch to 'main',
# register the GitHub remote and push.
# NOTE(review): the recorded output shows README.md did not match any file, so nothing was
# committed and the push failed with "src refspec main does not match any".
!git init
!git add README.md
!git commit -m "first commit"
!git branch -M main
!git remote add origin https://github.com/Raghavkumar099/Cracking-the-Code-An-Inside-Look-at-Netflix-s-Content-Strategy.git
!git push -u origin main

Reinitialized existing Git repository in C:/Users/HP/21 days project/Day 2/.git/


fatal: pathspec 'README.md' did not match any files


On branch main

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	 An Inside Look at Netflix's Content Strategy.ipynb
	.ipynb_checkpoints/
	git/
	netflix_titles.csv

nothing added to commit but untracked files present (use "git add" to track)


error: remote origin already exists.
error: src refspec main does not match any
error: failed to push some refs to 'https://github.com/Raghavkumar099/Cracking-the-Code-An-Inside-Look-at-Netflix-s-Content-Strategy'


In [66]:
# Second attempt at registering the remote and pushing 'main'.
# NOTE(review): per the recorded output the remote 'origin' already existed and 'main'
# still had no commit to push.
!git remote add origin https://github.com/Raghavkumar099/Cracking-the-Code-An-Inside-Look-at-Netflix-s-Content-Strategy.git
!git branch -M main
!git push -u origin main

error: remote origin already exists.
error: src refspec main does not match any
error: failed to push some refs to 'https://github.com/Raghavkumar099/Cracking-the-Code-An-Inside-Look-at-Netflix-s-Content-Strategy'


In [67]:
!git add "An Inside Look at Netflix's Content Strategy.ipynb" netflix_titles.csv

fatal: pathspec 'An Inside Look at Netflix's Content Strategy.ipynb' did not match any files
