# **Import** the **Libraries**

In [206]:
import pandas as pd
import numpy as np
import plotly.express as px
from textblob import TextBlob


# **Data Collections**

In [207]:
df_netflix = pd.read_csv('netflix_titles.csv')
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Data information

In [208]:
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


# THE COLUMNS IN THE DATASET

In [229]:
df_netflix.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

# **The Ratings on TV shows**

In [230]:
X = df_netflix.groupby(['rating']).size().reset_index(name='counts')
X.columns = ['rating', 'counts'] # Rename the columns
X

Unnamed: 0,rating,counts
0,66 min,1
1,74 min,1
2,84 min,1
3,G,41
4,NC-17,3
5,NR,80
6,PG,287
7,PG-13,490
8,R,799
9,TV-14,2160


In [231]:
piechart = px.pie(X, values='counts', names='rating', title='Distribution of Ratings on Netflix')
piechart.show()

# Obtaining the Directors in the dataset

In [232]:
director_list = df_netflix['director'].str.split(', ', expand=True).stack()
director_list

Unnamed: 0,Unnamed: 1,0
0,0,Kirsten Johnson
1,0,Director not Specified
2,0,Julien Leclercq
3,0,Director not Specified
4,0,Director not Specified
...,...,...
8802,0,David Fincher
8803,0,Director not Specified
8804,0,Ruben Fleischer
8805,0,Peter Hewitt


# Identifying the movies without directors and fill it with Director not specified

In [233]:
df_netflix['director']=df_netflix['director'].fillna('Director not Specified')
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Cast not Specified,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Director not Specified,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Director not Specified,Cast not Specified,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Director not Specified,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# **Changing the title to Directors**

In [234]:
director_list = pd.DataFrame()
#director_list.columns = ['Directors']
director_list

In [235]:
director_list = df_netflix['director'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).rename('Directors')
director_list

Unnamed: 0,Directors
0,Kirsten Johnson
1,Director not Specified
2,Julien Leclercq
3,Director not Specified
4,Director not Specified
...,...
8802,David Fincher
8803,Director not Specified
8804,Ruben Fleischer
8805,Peter Hewitt


In [236]:
director_list.column=['Directors']
director_list

Unnamed: 0,Directors
0,Kirsten Johnson
1,Director not Specified
2,Julien Leclercq
3,Director not Specified
4,Director not Specified
...,...
8802,David Fincher
8803,Director not Specified
8804,Ruben Fleischer
8805,Peter Hewitt


## Obtaining the number of movies and TV show the directors has

In [237]:
directors = director_list.groupby(director_list).size().reset_index(name='Total count')
directors.columns = ['Directors', 'Total count'] # Rename the columns
print(directors)

                Directors  Total count
0             A. L. Vijay            2
1            A. Raajdheep            1
2               A. Salaam            1
3         A.R. Murugadoss            2
4         Aadish Keluskar            1
...                   ...          ...
4989           Éric Warin            1
4990     Ísold Uggadóttir            1
4991  Óskar Thór Axelsson            1
4992     Ömer Faruk Sorak            3
4993         Şenol Sönmez            2

[4994 rows x 2 columns]


# Listing the directors and their total contents

In [238]:
directors = directors[directors.Directors != 'Director not Specified']
directors

Unnamed: 0,Directors,Total count
0,A. L. Vijay,2
1,A. Raajdheep,1
2,A. Salaam,1
3,A.R. Murugadoss,2
4,Aadish Keluskar,1
...,...,...
4989,Éric Warin,1
4990,Ísold Uggadóttir,1
4991,Óskar Thór Axelsson,1
4992,Ömer Faruk Sorak,3


# Sorting the directors according to the number of their contents

In [239]:
director = directors.sort_values(by='Total count', ascending=False)
director


Unnamed: 0,Directors,Total count
3750,Rajiv Chilaka,22
1907,Jan Suter,21
3801,Raúl Campos,19
4458,Suhas Kadav,16
2867,Marcus Raboy,16
...,...,...
635,Brandon Camp,1
2296,Juan Antin,1
2297,Juan Antonio de la Riva,1
2298,Juan Camilo Pinzon,1


# Analysing top 5 Directors

In [240]:
top5Directors = director.head()
top5Directors

Unnamed: 0,Directors,Total count
3750,Rajiv Chilaka,22
1907,Jan Suter,21
3801,Raúl Campos,19
4458,Suhas Kadav,16
2867,Marcus Raboy,16


In [241]:
top5Directors = top5Directors.sort_values(by='Total count', ascending=True)
barchart = px.bar(top5Directors, x='Total count', y='Directors', title='Top 5 Directors on Netflix')
barchart.show()

## **Analyzing Top 5 Actors on Netflix**

In [242]:
df_netflix['cast'] = df_netflix['cast'].fillna('Cast not Specified')
cast_df = pd.DataFrame()
cast_df = df_netflix['cast'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).rename('Cast')
cast_df = cast_df.to_frame()
cast_df.columns = ['Actors']
Top_Cast_Actors = cast_df.head()
Top_Cast_Actors


Unnamed: 0,Actors
0,Cast not Specified
1,Ama Qamata
1,Khosi Ngema
1,Gail Mabalane
1,Thabang Molaba


In [243]:
Actors = cast_df.groupby('Actors').size().reset_index(name='Total count') # Group by the 'Actors' column instead of the entire DataFrame
Actors = Actors[Actors.Actors != 'Cast not Specified']
Actors = Actors.sort_values(by='Total count', ascending=False)
top5Actors = Actors.head()
top5Actors = top5Actors.sort_values(by='Total count', ascending=True)
barchart = px.bar(top5Actors, x='Total count', y='Actors', title='Top 5 Actors on Netflix')
barchart.show()


## **Analyzing Content produced on Netflix based on the year**

In [247]:
df_netflix1 = df_netflix[['type', 'release_year']]  # Remove the leading space from ' release_year'
df_netflix1 = df_netflix1.rename(columns={'release_year': 'Release Year', 'type': 'Type'})  # Remove leading space here as well
df_netflix1 = df_netflix1.groupby(['Release Year', 'Type']).size().reset_index(name='Total count')
df_netflix1

Unnamed: 0,Release Year,Type,Total count
0,1925,TV Show,1
1,1942,Movie,2
2,1943,Movie,3
3,1944,Movie,3
4,1945,Movie,3
...,...,...,...
114,2019,TV Show,397
115,2020,Movie,517
116,2020,TV Show,436
117,2021,Movie,277


In [251]:
#df_netflix = df_netflix[df_netflix[' Release Year'] >= 2000]
df_netflix = df_netflix[df_netflix['release_year'] >= 2000]
graph = px.line(df_netflix1, x='Release Year', y='Total count', color='Type', title='Content produced on Netflix based on the year')
#                                           ^^^ Ensure 'Total count' is the correct column name
# Use df_netflix1 which has the 'Total count' column for the graph
graph.show()

## **SENTIMENT ANALYSIS ON NETFLIX CONTENTS**
### Analyzing which Country has the highest content produced

In [249]:
df2 = df_netflix[['release_year', 'description']] # Assuming your column is named 'release_year'

df2 = df2.rename(columns={'release_year': 'Release Year', 'description': 'Description'}) # Rename ' Release Year' to 'Release Year' and 'description' to 'Description'
for index, row in df2.iterrows():
    description = row['Description']
    testimonial = TextBlob(description)
    p = testimonial.sentiment.polarity
    if p == 0:
        sentiment = 'Neutral'
    elif p > 0:
        sentiment = 'Positive'
    else:
        sentiment = 'Negative'
    df2.at[index, 'Sentiment Score'] = p

df2 = df2.groupby(['Release Year', 'Sentiment Score']).size().reset_index(name='Total Count')
df2 = df2[df2['Release Year'] >= 2005]
df2 = df2.rename(columns={'Release Year': 'Release Year', 'Sentiment Score': 'Sentiment'})
barGraph = px.bar(df2, x='Release Year', y='Total Count', color='Sentiment', title='Sentiment Analysis on Netflix Contents')
barGraph.show()