In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df=pd.read_csv("netflix.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
#length of data
df.shape

(8807, 12)

In [5]:
#checking datatypes
df.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [6]:
#number of unique values in our data
df.nunique()

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [8]:
#checking null values in every column of our data
df.isnull().sum().sort_values(ascending=False)

director        2634
country          831
cast             825
date_added        10
rating             4
duration           3
show_id            0
type               0
title              0
release_year       0
listed_in          0
description        0
dtype: int64

In [9]:
#checking null values percentage in every column of our data
round(df.isnull().sum()/df.shape[0]*100,2).sort_values(ascending=False)

director        29.91
country          9.44
cast             9.37
date_added       0.11
rating           0.05
duration         0.03
show_id          0.00
type             0.00
title            0.00
release_year     0.00
listed_in        0.00
description      0.00
dtype: float64

In [10]:
#unnesting the directors column, i.e- creating separate lines for each director in a movie
constraint1=df['director'].apply(lambda x: str(x).split(', ')).tolist()
df_new1=pd.DataFrame(constraint1,index=df['title'])
df_new1=df_new1.stack()
df_new1=pd.DataFrame(df_new1.reset_index())
df_new1.rename(columns={0:'director'},inplace=True)
df_new1.drop(['level_1'],axis=1,inplace=True)
df_new1.head()

Unnamed: 0,title,director
0,Dick Johnson Is Dead,Kirsten Johnson
1,Blood & Water,
2,Ganglands,Julien Leclercq
3,Jailbirds New Orleans,
4,Kota Factory,


In [11]:
#unnesting the cast column, i.e- creating separate lines for each cast member in a movie
constraint2=df['cast'].apply(lambda x: str(x).split(', ')).tolist()
df_new2=pd.DataFrame(constraint2,index=df['title'])
df_new2=df_new2.stack()
df_new2=pd.DataFrame(df_new2.reset_index())
df_new2.rename(columns={0:'Actors'},inplace=True)
df_new2.drop(['level_1'],axis=1,inplace=True)
df_new2.head()

Unnamed: 0,title,Actors
0,Dick Johnson Is Dead,
1,Blood & Water,Ama Qamata
2,Blood & Water,Khosi Ngema
3,Blood & Water,Gail Mabalane
4,Blood & Water,Thabang Molaba


In [12]:
#unnesting the listed_in column, i.e- creating separate lines for each genre in a movie
constraint3=df['listed_in'].apply(lambda x: str(x).split(', ')).tolist()
df_new3=pd.DataFrame(constraint3,index=df['title'])
df_new3=df_new3.stack()
df_new3=pd.DataFrame(df_new3.reset_index())
df_new3.rename(columns={0:'Genre'},inplace=True)
df_new3.drop(['level_1'],axis=1,inplace=True)
df_new3.head()

Unnamed: 0,title,Genre
0,Dick Johnson Is Dead,Documentaries
1,Blood & Water,International TV Shows
2,Blood & Water,TV Dramas
3,Blood & Water,TV Mysteries
4,Ganglands,Crime TV Shows


In [13]:
#unnesting the country column, i.e- creating separate lines for each country in a movie
constraint4=df['country'].apply(lambda x: str(x).split(', ')).tolist()
df_new4=pd.DataFrame(constraint4,index=df['title'])
df_new4=df_new4.stack()
df_new4=pd.DataFrame(df_new4.reset_index())
df_new4.rename(columns={0:'country'},inplace=True)
df_new4.drop(['level_1'],axis=1,inplace=True)
df_new4.head()

Unnamed: 0,title,country
0,Dick Johnson Is Dead,United States
1,Blood & Water,South Africa
2,Ganglands,
3,Jailbirds New Orleans,
4,Kota Factory,India


In [14]:
#merging the unnested director data with unnested actors data
dfx=df_new1.merge(df_new2,on="title")

In [15]:
#merging the unnested genre data
dfx=dfx.merge(df_new3,on="title")

In [16]:
#merging the unnested country data
dfx=dfx.merge(df_new4,on="title")

In [17]:
dfx.head()

Unnamed: 0,title,director,Actors,Genre,country
0,Dick Johnson Is Dead,Kirsten Johnson,,Documentaries,United States
1,Blood & Water,,Ama Qamata,International TV Shows,South Africa
2,Blood & Water,,Ama Qamata,TV Dramas,South Africa
3,Blood & Water,,Ama Qamata,TV Mysteries,South Africa
4,Blood & Water,,Khosi Ngema,International TV Shows,South Africa


In [18]:
#merging our unnested data with the original data
df_final=df.merge(dfx,how='left',on='title')

In [19]:
df_final.head()

Unnamed: 0,show_id,type,title,director_x,cast,country_x,date_added,release_year,rating,duration,listed_in,description,director_y,Actors,Genre,country_y
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Kirsten Johnson,,Documentaries,United States
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,Ama Qamata,International TV Shows,South Africa
2,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,Ama Qamata,TV Dramas,South Africa
3,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,Ama Qamata,TV Mysteries,South Africa
4,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,Khosi Ngema,International TV Shows,South Africa


In [20]:
df_final.shape

(201991, 16)

In [21]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201991 entries, 0 to 201990
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   show_id       201991 non-null  object
 1   type          201991 non-null  object
 2   title         201991 non-null  object
 3   director_x    151348 non-null  object
 4   cast          199845 non-null  object
 5   country_x     190094 non-null  object
 6   date_added    201833 non-null  object
 7   release_year  201991 non-null  int64 
 8   rating        201924 non-null  object
 9   duration      201988 non-null  object
 10  listed_in     201991 non-null  object
 11  description   201991 non-null  object
 12  director_y    201991 non-null  object
 13  Actors        201991 non-null  object
 14  Genre         201991 non-null  object
 15  country_y     201991 non-null  object
dtypes: int64(1), object(15)
memory usage: 26.2+ MB


In [22]:
df_final.drop(columns=['director_x', 'country_x', 'cast', 'listed_in'], axis=1, inplace=True)

In [23]:
df_final.head()

Unnamed: 0,show_id,type,title,date_added,release_year,rating,duration,description,director_y,Actors,Genre,country_y
0,s1,Movie,Dick Johnson Is Dead,"September 25, 2021",2020,PG-13,90 min,"As her father nears the end of his life, filmm...",Kirsten Johnson,,Documentaries,United States
1,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Ama Qamata,International TV Shows,South Africa
2,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Ama Qamata,TV Dramas,South Africa
3,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Ama Qamata,TV Mysteries,South Africa
4,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Khosi Ngema,International TV Shows,South Africa


In [24]:
new_column_names = {
    'director_y': 'director',
    'country_y': 'country',
    'Actors': 'actor',
    'Genre': 'genre'
}

df_final.rename(columns=new_column_names, inplace=True)

In [25]:
df_final.head()

Unnamed: 0,show_id,type,title,date_added,release_year,rating,duration,description,director,actor,genre,country
0,s1,Movie,Dick Johnson Is Dead,"September 25, 2021",2020,PG-13,90 min,"As her father nears the end of his life, filmm...",Kirsten Johnson,,Documentaries,United States
1,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Ama Qamata,International TV Shows,South Africa
2,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Ama Qamata,TV Dramas,South Africa
3,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Ama Qamata,TV Mysteries,South Africa
4,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2 Seasons,"After crossing paths at a party, a Cape Town t...",,Khosi Ngema,International TV Shows,South Africa


In [26]:
df_final.shape

(201991, 12)

In [27]:
#checking null values in every column of our data
df_final.isnull().sum().sort_values(ascending=False)

date_added      158
rating           67
duration          3
show_id           0
type              0
title             0
release_year      0
description       0
director          0
actor             0
genre             0
country           0
dtype: int64

In [28]:
#checking null values percentage in every column of our data
round(df_final.isnull().sum()/df.shape[0]*100,2).sort_values(ascending=False)

date_added      1.79
rating          0.76
duration        0.03
show_id         0.00
type            0.00
title           0.00
release_year    0.00
description     0.00
director        0.00
actor           0.00
genre           0.00
country         0.00
dtype: float64

In [29]:
def replace_nan(input_str):
    if input_str == "nan":
        return np.nan
    return input_str

In [30]:
df_final["director"]=df_final["director"].apply(replace_nan)

In [31]:
df_final["director"].isnull().sum()

50643

In [32]:
df_final["actor"]=df_final["actor"].apply(replace_nan)
df_final["genre"]=df_final["genre"].apply(replace_nan)
df_final["country"]=df_final["country"].apply(replace_nan)

In [33]:
df_final["actor"].isnull().sum()

2146

In [34]:
df_final["genre"].isnull().sum()

0

In [35]:
df_final["country"].isnull().sum()

11897

In [36]:
df_final["release_year"] = df_final["release_year"].astype(int)

In [37]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201991 entries, 0 to 201990
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   show_id       201991 non-null  object
 1   type          201991 non-null  object
 2   title         201991 non-null  object
 3   date_added    201833 non-null  object
 4   release_year  201991 non-null  int32 
 5   rating        201924 non-null  object
 6   duration      201988 non-null  object
 7   description   201991 non-null  object
 8   director      151348 non-null  object
 9   actor         199845 non-null  object
 10  genre         201991 non-null  object
 11  country       190094 non-null  object
dtypes: int32(1), object(11)
memory usage: 19.3+ MB


In [38]:
df_final["duration"]=df_final["duration"].str.replace("min","")

In [39]:
df_final["duration"]=df_final["duration"].str.replace("Seasons","") # Replacing Seasons with empty string
df_final["duration"]=df_final["duration"].str.replace("Season","")

In [40]:
df_final["duration"].isnull().sum()

3

In [41]:
round(df_final.isnull().sum()/df_final.shape[0]*100,2).sort_values(ascending=False)

director        25.07
country          5.89
actor            1.06
date_added       0.08
rating           0.03
show_id          0.00
type             0.00
title            0.00
release_year     0.00
duration         0.00
description      0.00
genre            0.00
dtype: float64

In [42]:
# Replacing "NaN" values in country column with "Unknown_country"
df_final["country"].replace(np.NaN,"Unknown_country",inplace=True)  

In [43]:
# Replacing "NaN" values in director column with "Unknown_director"
df_final["director"].replace(np.NaN,"Unknown_director",inplace=True) 

In [44]:
# Replacing "NaN" values in actor column with "Unknown_actor"
df_final["actor"].replace(np.NaN,"Unknown_actor",inplace=True) 

In [45]:
df_final.dropna(subset=["rating","duration","date_added"],axis=0,inplace=True)

In [46]:
round(df_final.isnull().sum()/df_final.shape[0]*100,2).sort_values(ascending=False)

show_id         0.0
type            0.0
title           0.0
date_added      0.0
release_year    0.0
rating          0.0
duration        0.0
description     0.0
director        0.0
actor           0.0
genre           0.0
country         0.0
dtype: float64

In [47]:
df_final.head()

Unnamed: 0,show_id,type,title,date_added,release_year,rating,duration,description,director,actor,genre,country
0,s1,Movie,Dick Johnson Is Dead,"September 25, 2021",2020,PG-13,90,"As her father nears the end of his life, filmm...",Kirsten Johnson,Unknown_actor,Documentaries,United States
1,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2,"After crossing paths at a party, a Cape Town t...",Unknown_director,Ama Qamata,International TV Shows,South Africa
2,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2,"After crossing paths at a party, a Cape Town t...",Unknown_director,Ama Qamata,TV Dramas,South Africa
3,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2,"After crossing paths at a party, a Cape Town t...",Unknown_director,Ama Qamata,TV Mysteries,South Africa
4,s2,TV Show,Blood & Water,"September 24, 2021",2021,TV-MA,2,"After crossing paths at a party, a Cape Town t...",Unknown_director,Khosi Ngema,International TV Shows,South Africa


In [48]:
movies=df_final['type']=="movies"

In [49]:
movies

0         False
1         False
2         False
3         False
4         False
          ...  
201986    False
201987    False
201988    False
201989    False
201990    False
Name: type, Length: 201763, dtype: bool

In [50]:
# Remove duplicated rows from the DataFrame
df_final.duplicated.sum()

AttributeError: 'function' object has no attribute 'sum'

In [None]:
#Uni-variate Analysis

In [None]:
df_final.head()

In [None]:
df_final.nunique()

In [None]:
 df_type=df_final.groupby(['type']).agg({"title":"nunique"}).reset_index()
 plt.pie(df_type['title'],explode=(0.05,0.05), labels=df_type['type'],autopct='%.1f%%')
 plt.show()

In [None]:
#We have 70:30 ratio of Movies and TV Shows in our data

In [None]:
#date added
index=np.array(movies_release_in_same_year["date_added"].value_counts().index)
sorted_index=np.sort(index)
sorted_index

In [None]:
# Create a histogram of the release years
plt.figure(figsize=(10, 6))
plt.hist(df_final['release_year'], bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Release Years')
plt.xlabel('Release Year')
plt.ylabel('Number of Titles')
plt.grid(axis='y', alpha=0.75)
plt.tight_layout()
plt.show()

In [None]:
# Create a KDE plot with a rug plot
plt.figure(figsize=(10, 6))
sns.kdeplot(data=df_final, x='release_year', fill=True, palette='viridis')
plt.title('Distribution of Release Years')
plt.xlabel('Release Year')
plt.ylabel('Density')
plt.tight_layout()
plt.show()

In [None]:
#6- Rating
# Calculate frequency counts of each rating
rating_counts = df_final['rating'].value_counts()

# Create a bar plot using Seaborn
plt.figure(figsize=(8, 6))
sns.set_style("whitegrid")
plot = sns.barplot(x=rating_counts.index, y=rating_counts.values, palette="viridis")
plt.title("Distribution of Ratings")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.xticks(rotation=45)

# Annotate the count values on the bars
for index, value in enumerate(rating_counts):
    plot.text(index, value, str(value), ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
#7- Duration

In [None]:
# Summary statistics
summary_stats = df_final['duration'].describe()
print("Summary Statistics for Duration:\n", summary_stats)

In [None]:
#8- Description

In [None]:
#9- Director
df_final["director"].value_counts().head(10)

In [None]:
#10- actor

In [None]:

# Perform univariate analysis on the "actor" column
actor_counts = df_final['actor'].value_counts()
actor_counts.plot(kind='bar', figsize=(10, 6), title='Actor Counts')

In [None]:
#11- Genre
#number of distinct titles on the basis of genre
df_final.groupby(['genre']).agg({"title":"nunique"})

In [None]:
df_genre=df_final.groupby(['genre']).agg({"title":"nunique"}).reset_index().sort_values(by=['title'],ascending=False)[:15]
plt.figure(figsize=(15,8))
plt.barh(df_genre[::-1]['genre'], df_genre[::-1]['title'])
plt.xlabel('Frequency of Genres')
plt.ylabel('Genres')
plt.show()

In [None]:
#country