In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [96]:
# Load the raw catalogue; the path is relative to the notebook's working directory.
DATA_PATH = "Netflix.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
data.shape

In [None]:
# Column dtypes and non-null counts before any cleaning.
data.info()

In [None]:
# Preview the first five rows.
data.head(5)

# **Cleaning Data - Filling NaN**
1. director - "unknown_director"
2. cast - "unknown_cast"
3. country - "unknown_country"
4. rating - "NR"
5. duration - "0 time" (becomes 0 once the column is converted to int)
6. date_added - left as NaN (not filled)

In [97]:
# Drop columns that are completely empty, then fill the remaining gaps with
# explicit sentinel values. `date_added` is deliberately not listed, so its
# missing values stay NaN.
#
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# `data[col]`: that form is chained assignment, and under pandas copy-on-write it
# only modifies a temporary Series and leaves `data` unchanged.
FILL_VALUES = {
    "director": "unknown_director",
    "country": "unknown_country",
    "cast": "unknown_cast",
    "rating": "NR",
    # Keeps the "<number> <unit>" shape so the later split + int cast yields 0.
    "duration": "0 time",
}
data = data.dropna(axis=1, how="all").fillna(FILL_VALUES)

In [None]:
# Re-check the non-null counts after filling the missing values.
data.info()

# **Cleaning Data - Unnesting Columns**
1. Country
2. Cast
3. Listed_in

For now these columns are only split into lists; `explode` is applied later wherever it is needed.






In [98]:
# Turn the comma-separated text columns into Python lists (exploded later on demand).
#
# Splitting on a bare "," would keep the blank that follows each comma, so the
# same value could appear both as "India" and " India" after `explode`. The
# separator therefore includes the surrounding whitespace, and stray commas or
# blanks at either end of the cell are trimmed first.
# NOTE: this cell is not re-runnable on its own; once a column holds lists the
# `.str` accessor no longer applies. Re-run from the load cell instead.
LIST_COLUMNS = ["country", "cast", "listed_in"]
for column in LIST_COLUMNS:
    data[column] = data[column].str.strip(" ,").str.split(r"\s*,\s*", regex=True)

In [None]:
# Inspect the frame after splitting the multi-valued columns into lists.
data.info()

# **Changing the "rating" column from categorical to numerical:**

*   1 --> Very Child Friendly
*   9 --> Strictly adult content
*   10 --> Not rated: NR, UR, NaN, and stray values such as "74 min", "84 min", "66 min"









In [None]:
# Frequency of each raw rating label, before it is mapped to the numeric scale.
data["rating"].value_counts()

In [99]:
# Ordinal encoding of the rating labels: 1 = most child friendly, 9 = strictly
# adult, 10 = not rated or not a real rating (the three "<n> min" strings).
RATING_SCALE = {
    "TV-Y": 1,
    "TV-Y7": 2,
    "TV-Y7-FV": 3,
    "TV-G": 4,
    "G": 4,
    "TV-PG": 5,
    "PG": 5,
    "PG-13": 6,
    "TV-14": 7,
    "R": 8,
    "TV-MA": 9,
    "NC-17": 9,
    "NR": 10,
    "UR": 10,
    "74 min": 10,
    "84 min": 10,
    "66 min": 10,
}
# `replace` leaves any label that is not a key of the mapping untouched.
data["rating"] = data["rating"].replace(RATING_SCALE)

In [None]:
# Check the frame after the rating labels were replaced by numbers.
data.info()

# **Changing "Duration Columns" from Object to Int**

Ex:


*   TV shows are given in seasons, so "2 Seasons" --> 2
*   Movies are given in minutes, so "119 min" --> 119



In [100]:
# Keep only the leading number of each "<number> <unit>" string
# ("2 Seasons" -> "2", "119 min" -> "119") and store it as an integer.
leading_token = data["duration"].str.split().str.get(0)
data["duration"] = leading_token.astype("int64")

In [None]:
# Check the frame after converting `duration` to integers.
data.info()

# **Changing Column "Date_added" Dtype to datetime**

In [101]:
# Parse `date_added` into datetime64[ns]; missing values become NaT.
# `astype("datetime64")` (no unit) raises a TypeError on pandas >= 2.0, so the
# text is parsed with `pd.to_datetime` instead.
# The text is stripped first so that padding around a date cannot break format
# inference. NOTE(review): assumes all dates share one textual format; confirm.
data["date_added"] = pd.to_datetime(data["date_added"].str.strip())

# **Changing Column "release_year" Dtype to datetime**

*   Only the year is given in release_year, so each release is assumed to have happened on **1 January of that year**.



In [102]:
# A bare year parsed with "%Y" lands on 1 January of that year.
year_start = pd.to_datetime(data["release_year"], format="%Y")
data["release_year"] = year_start

In [105]:
# Schema check after all cleaning and dtype conversions.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8807 non-null   object        
 1   type          8807 non-null   object        
 2   title         8807 non-null   object        
 3   director      8807 non-null   object        
 4   cast          8807 non-null   object        
 5   country       8807 non-null   object        
 6   date_added    8797 non-null   datetime64[ns]
 7   release_year  8807 non-null   datetime64[ns]
 8   rating        8807 non-null   int64         
 9   duration      8807 non-null   int64         
 10  listed_in     8807 non-null   object        
 11  description   8807 non-null   object        
dtypes: datetime64[ns](2), int64(2), object(8)
memory usage: 825.8+ KB


In [106]:
# Summary statistics with the default column selection.
data.describe()

Unnamed: 0,rating,duration
count,8807.0,8807.0
mean,7.043147,69.823095
std,2.167461,50.82252
min,1.0,0.0
25%,6.0,2.0
50%,7.0,88.0
75%,9.0,106.0
max,10.0,312.0


In [107]:
# count / unique / top / freq for the object-dtype columns.
data.describe(include=object)

Unnamed: 0,show_id,type,title,director,cast,country,listed_in,description
count,8807,8807,8807,8807,8807,8807,8807,8807
unique,8807,2,8807,4529,7693,749,514,8775
top,s1,Movie,Dick Johnson Is Dead,unknown_director,[unknown_cast],[United States],"[Dramas, International Movies]","Paranormal activity at a lush, abandoned prope..."
freq,1,6131,1,2634,825,2818,362,4


In [108]:
# Summary statistics for the numeric and the datetime columns together.
# The `datetime_is_numeric` keyword was removed in pandas 2.0 (datetimes are
# always summarised numerically there), so the columns are selected with
# `include` instead. On pandas 1.x this summarises datetimes categorically.
data.describe(include=[np.number, "datetime"])

Unnamed: 0,date_added,release_year,rating,duration
count,8797,8807,8807.0,8807.0
mean,2019-05-17 05:59:08.436967168,2014-03-07 16:27:24.873396224,7.043147,69.823095
min,2008-01-01 00:00:00,1925-01-01 00:00:00,1.0,0.0
25%,2018-04-06 00:00:00,2013-01-01 00:00:00,6.0,2.0
50%,2019-07-02 00:00:00,2017-01-01 00:00:00,7.0,88.0
75%,2020-08-19 00:00:00,2019-01-01 00:00:00,9.0,106.0
max,2021-09-25 00:00:00,2021-01-01 00:00:00,10.0,312.0
std,,,2.167461,50.82252


In [None]:
# Number of titles per value of `type`.
data["type"].value_counts()

In [None]:
# Bar chart of how many titles fall under each `type`.
type_ax = sns.countplot(x="type", data=data)
type_ax.set_title("Count of Movies/Shows")
plt.show()

In [None]:
# Number of rows per distinct `director` string (the column is not split into
# individual names, so each distinct string is counted as one value).
data["director"].value_counts()

In [None]:
# Distribution of "number of titles per director value".
# The counts are taken straight from `value_counts()` rather than from a column
# of `value_counts().reset_index()`: the column names of that frame changed in
# pandas 2.0, where selecting "director" returns the names instead of the counts.
titles_per_director = data["director"].value_counts()
director_ax = sns.histplot(titles_per_director.to_numpy(), bins=19)
director_ax.set(xlabel="titles per director", ylabel="number of directors")
plt.show()

In [None]:
# One row per (title, country) pair, then the number of titles per country.
data["country"].explode().value_counts()

In [None]:
# Titles per country, most frequent first.
# The exploded frame is built once and reused for both the plot data and the
# bar order (it was previously computed twice).
titles_by_country = data.explode("country")
country_order = titles_by_country["country"].value_counts().index

fig, ax = plt.subplots(figsize=(40, 8))
sns.countplot(data=titles_by_country, x="country", order=country_order, ax=ax)
# Tick styling is applied after plotting so it targets the final tick labels.
ax.tick_params(axis="x", labelrotation=90, labelsize=10)
ax.set(title="Titles per country", xlabel="country", ylabel="number of titles")
plt.show()

In [None]:
# Top 20 countries by number of titles.
# The columns are named explicitly: `value_counts().reset_index()` yields
# ("index", "country") on pandas 1.x but ("country", "count") on pandas >= 2.0,
# so the old `x="index"` lookup fails on current pandas.
top_countries = (
    data["country"]
    .explode()
    .value_counts()
    .head(20)
    .rename_axis("country")
    .reset_index(name="titles")
)

fig, ax = plt.subplots(figsize=(40, 8))
sns.barplot(data=top_countries, x="country", y="titles", ax=ax)
ax.tick_params(axis="x", labelrotation=45, labelsize=20)
ax.set(title="Top 20 countries by number of titles", xlabel="country", ylabel="number of titles")
plt.show()

In [None]:
# Separate frames for TV shows and movies. `reset_index()` is called without
# `drop=True`, so the original row label is kept in an "index" column.
title_kind = data["type"]
showdata = data.loc[title_kind == "TV Show"].reset_index()
moviedata = data.loc[title_kind == "Movie"].reset_index()

In [None]:
# List the column names of the cleaned frame.
data.columns

In [None]:
# Number of titles added per ISO week of the year.
# `Series.dt.week` was removed in pandas 2.0; `dt.isocalendar().week` replaces it.
# Rows without a `date_added` are dropped and the nullable UInt32 result is cast
# to a plain integer so the x-axis shows 1, 2, ... rather than 1.0, 2.0, ...
week_added = (
    data["date_added"]
    .dt.isocalendar()
    .week
    .dropna()
    .astype("int64")
    .to_frame("week_added")
)

plt.figure(figsize=(20, 8))
week_ax = sns.countplot(data=week_added, x="week_added")
week_ax.set(title="Titles added per week of the year", xlabel="ISO week", ylabel="number of titles")
plt.show()

In [None]:
# Number of titles per individual director, most prolific first.
# `director` is still a plain string column at this point, so calling
# `explode("director")` directly does nothing and a multi-director string would
# be counted as one director. The names are split into a list first.
# NOTE(review): assumes co-directors are comma-separated like `cast`; confirm.
(
    data.assign(
        director=data["director"].str.strip(" ,").str.split(r"\s*,\s*", regex=True)
    )
    .explode("director")
    .groupby("director")["title"]
    .count()
    .sort_values(ascending=False)
)