In [43]:
import numpy as np
import pandas as pd
import plotly.express as px

In [50]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
from textblob import TextBlob

In [46]:
df = pd.read_csv("netflix_titles.csv")
df.shape

(8807, 12)

In [47]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [48]:
z = df.groupby(['rating']).size().reset_index(name='counts')

In [49]:
piee = px.pie(z, values = 'counts', names='rating', title='Distribution of Content Ratings',
             color_discrete_sequence=px.colors.qualitative.Set3)
piee.show()

The graph above shows that the majority of content on Netflix is categorized as "TV-MA", which means most of the content available is intented for viewing by mature and adult audiences.

In [13]:
#Top 5 Directors and Actors

In [14]:
df['director'] = df['director'].fillna('No Director Specified')
filter_dir = pd.DataFrame()
filter_dir = df['director'].str.split(',',expand=True).stack()
filter_dir = filter_dir.to_frame()
filter_dir.columns=['Director']

In [17]:
directors=filter_dir.groupby(["Director"]).size().reset_index(name='Total Content')
directors=directors[directors.Director != 'No Director Specified']
directors=directors.sort_values(by=['Total Content'],ascending=False)
directorsTop5=directors.head()
directorsTop5=directorsTop5.sort_values(by=['Total Content'])
fig = px.bar(directorsTop5,x='Total Content',y='Director',title ='Top 5 directors')
fig.show()

### From the above graph the Top 5 directors are:
    1.Rajiv Chilaka
    2.Jan Suter
    3.Raul Campos
    4.Marcus Raboy
    5.Suhas kadav

In [18]:
# Top 5 Actors

In [21]:
df['cast']=df['cast'].fillna('No Cast Specified')
filtered_cast=pd.DataFrame()
filtered_cast=df['cast'].str.split(',',expand=True).stack()
filtered_cast=filtered_cast.to_frame()
filtered_cast.columns=['Actor']
actors=filtered_cast.groupby(['Actor']).size().reset_index(name='Total Content')
actors=actors[actors.Actor !='No Cast Specified']
actors=actors.sort_values(by=['Total Content'],ascending=False)
actorsTop5=actors.head()
actorsTop5=actorsTop5.sort_values(by=['Total Content'])
fig2=px.bar(actorsTop5,x='Total Content',y='Actor', title='Top 5 Actors')
fig2.show()

### From above plot, the Top 5 Actors on Netfilx are:
    1. Anupam Kher
    2.Rupa Bhimani
    3.Takahiro Sakurai
    4.Julie Tejwani
    5.Om Puri

In [22]:
#Analysing Content On Netflix

In [40]:
# Renaming columns for clarity
df1 = df[['type', 'release_year']].rename(columns={'release_year': 'Release Year'})

# Grouping by 'Release Year' and 'type', counting total content
df2 = df1.groupby(['Release Year', 'type']).size().reset_index(name='Total Content')

# Filtering data for release years >= 2010
df2_filtered = df2[df2['Release Year'] >= 2010]

# Plotting using Plotly Express
fig = px.line(df2_filtered, x='Release Year', y='Total Content', color='type', 
              title='Trend of Content Produced Over the Years')

# Show the plot
fig.show()

The above line graph shows that there was a decline in the production of both movies and other tv shows since 2018

In [37]:
#Sentiment of content on Netflix

In [39]:
dfx = df[['release_year', 'description']].rename(columns={'release_year': 'Release Year'})

# Function to determine sentiment
def get_sentiment(text):
    testimonial = TextBlob(text)
    polarity = testimonial.sentiment.polarity
    if polarity == 0:
        return 'Neutral'
    elif polarity > 0:
        return 'Positive'
    else:
        return 'Negative'

# Apply sentiment analysis and create 'Sentiment' column
dfx['Sentiment'] = dfx['description'].apply(get_sentiment)

# Grouping by 'Release Year' and 'Sentiment', counting total content
dfx_grouped = dfx.groupby(['Release Year', 'Sentiment']).size().reset_index(name='Total Content')

# Filtering data for release years >= 2010
dfx_filtered = dfx_grouped[dfx_grouped['Release Year'] >= 2010]

# Plotting using Plotly Express
fig = px.bar(dfx_filtered, x='Release Year', y='Total Content', color='Sentiment', 
             title='Sentiment of Content by Release Year on Netflix')

# Show the plot
fig.show()

So the above graph shows that the overall positive content is always greater than the netural and negative content combined.