In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [58]:
# Load the raw catalogue; the path is relative to the notebook's working directory.
DATA_PATH = "Netflix.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Summary statistics; with default arguments describe() covers the numeric columns.
data.describe()

In [None]:
# Summary (count / unique / top / freq) restricted to the object-dtype columns.
data.describe(include = object)

In [None]:
# Preview the first five rows.
data.head(5)

In [None]:
# Non-null counts per column, for reference ahead of the NaN-filling step below.
data.info()

# **Cleaning Data - Filling NaN**
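1. director - filled with `unknown_director`
2. cast - filled with `unknown_cast`
3. country - filled with `unknown_country`
4. date_added - not filled (see item 7)
5. rating - filled with `NR`
6. duration - filled with `0 time`, which becomes 0 once the column is converted to integers
7. date_added - left as NaN (shown as NaT after the datetime conversion)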

In [59]:
# Drop columns that contain no data at all, then replace missing values with
# explicit placeholders. date_added is deliberately not listed, so its missing
# values stay missing.
FILL_VALUES = {
    "director": "unknown_director",
    "country": "unknown_country",
    "cast": "unknown_cast",
    "rating": "NR",
    # Shaped like "<number> <unit>" so the later split-and-cast on duration yields 0.
    "duration": "0 time",
}

# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: with pandas copy-on-write that form updates a temporary
# object and leaves `data` itself unchanged.
data = data.dropna(axis=1, how="all").fillna(FILL_VALUES)

In [60]:
# Check the fill: in the saved output only date_added still reports missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


# **Cleaning Data - Unnesting Columns**
1. Country
2. Cast
3. Listed_in

For now these columns are only split into lists; `explode` is applied later wherever it is needed.






In [61]:
def _split_csv_cell(cell):
    """Turn one comma-separated string into a list of trimmed, non-empty items.

    Anything that is not a string (a missing value, or a list produced by an
    earlier run of this cell) is returned unchanged, so the cell can be re-run.
    """
    if not isinstance(cell, str):
        return cell
    # Strip each item: a value written as "A, B" would otherwise yield " B",
    # which explode()/value_counts() treat as a different label from "B".
    # Empty fragments (e.g. from a leading or doubled comma) are dropped.
    return [item.strip() for item in cell.split(",") if item.strip()]


# Only split into lists here; explode() is applied later where needed.
for column in ("country", "cast", "listed_in"):
    data[column] = data[column].map(_split_csv_cell)

In [62]:
# The split columns still report dtype object in the saved output; their cells now hold lists.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


# **Changing the "rating" column from categorical to numerical:**

*   1 --> Very child friendly
*   9 --> Strictly adult content
*   10 --> Not rated (NR, UR, NaN) and stray values that are not ratings, such as 74 min, 84 min, 66 min









In [None]:
# Frequency of each value currently stored in the rating column.
data["rating"].value_counts()

In [67]:
# Ordinal score per rating label: 1 = very child friendly ... 9 = strictly adult,
# 10 = not rated or not a rating at all (the three "<n> min" strings).
RATING_SCORES = {
    "TV-Y": 1,
    "TV-Y7": 2,
    "TV-Y7-FV": 3,
    "TV-G": 4,
    "G": 4,
    "TV-PG": 5,
    "PG": 5,
    "PG-13": 6,
    "TV-14": 7,
    "R": 8,
    "TV-MA": 9,
    "NC-17": 9,
    "NR": 10,
    "UR": 10,
    "74 min": 10,
    "84 min": 10,
    "66 min": 10,
}
# replace() leaves any value that is not a key of the mapping untouched.
data["rating"] = data["rating"].replace(RATING_SCORES)

In [None]:
# Check the dtype of rating after the replacement.
data.info()

# **Changing "Duration Columns" from Object to Int**

Ex:


*   TV shows are given in seasons, so `2 Seasons` --> 2
*   Movies are given in minutes, so `119 min` --> 119



In [None]:
# Keep the first whitespace-separated token of each duration string (the number)
# and store it as int64.
leading_token = data["duration"].str.split().str[0]
data["duration"] = leading_token.astype("int64")

In [76]:
# Check that rating and duration are now int64.
# NOTE(review): the saved output of this cell lists a 13th column, `numrating`,
# that no visible cell creates - presumably left over from a deleted or
# out-of-order cell; confirm with Restart Kernel -> Run All.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   int64 
 9   duration      8807 non-null   int64 
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
 12  numrating     8807 non-null   int64 
dtypes: int64(4), object(9)
memory usage: 894.6+ KB


# **Changing Column "Date_added" Dtype to datetime**

In [77]:
# Parse date_added into datetime64[ns]; missing values become NaT.
# pd.to_datetime is used instead of astype("datetime64") because casting to a
# unit-less "datetime64" dtype is rejected by pandas >= 2.0.
date_added = data["date_added"]
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Strip surrounding whitespace first so a stray leading/trailing space
    # cannot break date parsing. The dtype guard keeps the cell re-runnable:
    # the .str accessor is not available once the column is datetime.
    date_added = pd.to_datetime(date_added.str.strip())
data["date_added"] = date_added

In [None]:
# Check the dtype of date_added after the conversion.
data.info()

In [79]:
# Rows whose date_added is missing (NaT after the datetime conversion).
missing_date_mask = data["date_added"].isna()
data.loc[missing_date_mask]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,numrating
6066,s6067,TV Show,A Young Doctor's Notebook and Other Stories,unknown_director,"[Daniel Radcliffe, Jon Hamm, Adam Godley, C...",[United Kingdom],NaT,2013,9,2,"[British TV Shows, TV Comedies, TV Dramas]","Set during the Russian Revolution, this comic ...",9
6174,s6175,TV Show,Anthony Bourdain: Parts Unknown,unknown_director,[Anthony Bourdain],[United States],NaT,2018,5,5,[Docuseries],This CNN original series has chef Anthony Bour...,5
6795,s6796,TV Show,Frasier,unknown_director,"[Kelsey Grammer, Jane Leeves, David Hyde Pie...",[United States],NaT,2003,5,11,"[Classic & Cult TV, TV Comedies]",Frasier Crane is a snooty but lovable Seattle ...,5
6806,s6807,TV Show,Friends,unknown_director,"[Jennifer Aniston, Courteney Cox, Lisa Kudro...",[United States],NaT,2003,7,10,"[Classic & Cult TV, TV Comedies]",This hit sitcom follows the merry misadventure...,7
6901,s6902,TV Show,Gunslinger Girl,unknown_director,"[Yuuka Nanri, Kanako Mitsuhashi, Eri Sendai,...",[Japan],NaT,2008,7,2,"[Anime Series, Crime TV Shows]","On the surface, the Social Welfare Agency appe...",7
7196,s7197,TV Show,Kikoriki,unknown_director,[Igor Dmitriev],[unknown_country],NaT,2010,1,2,[Kids' TV],A wacky rabbit and his gang of animal pals hav...,1
7254,s7255,TV Show,La Familia P. Luche,unknown_director,"[Eugenio Derbez, Consuelo Duval, Luis Manuel...",[United States],NaT,2012,7,3,"[International TV Shows, Spanish-Language TV ...","This irreverent sitcom featues Ludovico, Feder...",7
7406,s7407,TV Show,Maron,unknown_director,"[Marc Maron, Judd Hirsch, Josh Brener, Nora...",[United States],NaT,2016,9,4,[TV Comedies],"Marc Maron stars as Marc Maron, who interviews...",9
7847,s7848,TV Show,Red vs. Blue,unknown_director,"[Burnie Burns, Jason Saldaña, Gustavo Sorola...",[United States],NaT,2015,10,13,"[TV Action & Adventure, TV Comedies, TV Sci-...","This parody of first-person shooter games, mil...",10
8182,s8183,TV Show,The Adventures of Figaro Pho,unknown_director,"[Luke Jurevicius, Craig Behenna, Charlotte H...",[Australia],NaT,2015,2,2,"[Kids' TV, TV Comedies]","Imagine your worst fears, then multiply them: ...",2


In [None]:
# Number of titles per value of the type column.
data["type"].value_counts()

In [None]:
# Bar chart of the number of titles per type.
type_axes = sns.countplot(x="type", data=data)
type_axes.set_title("Count of Movies/Shows")
plt.show()

In [None]:
# Titles per distinct director string; rows that had no director are all
# counted under the "unknown_director" placeholder.
data["director"].value_counts()

In [None]:
# Histogram of catalogue size per director string: how many directors have
# 1, 2, 3, ... titles.
# The counts are taken straight from value_counts(). Going through
# reset_index() and selecting the "director" column only yields the counts on
# pandas < 2.0; from 2.0 on that column holds the director names instead.
titles_per_director = data["director"].value_counts()
director_axes = sns.histplot(x=titles_per_director.to_numpy(), bins=19)
director_axes.set(
    xlabel="Titles per director",
    ylabel="Number of directors",
    title="Distribution of titles per director",
)
plt.show()

In [None]:
# Titles per country: explode() gives one row per item of each country list,
# so a title with several countries is counted once for each of them.
data.explode("country")["country"].value_counts()

In [None]:
# Titles per country for every country, bars ordered from most to fewest titles.
country_rows = data.explode("country")
country_order = country_rows["country"].value_counts().index

plt.figure(figsize=(40, 8))
plt.xticks(rotation=90, fontsize=10)
sns.countplot(x="country", data=country_rows, order=country_order)

In [None]:
# Top 20 countries by number of titles.
# Both output columns are named explicitly: a bare value_counts().reset_index()
# yields "index"/"country" on pandas < 2.0 but "country"/"count" from 2.0 on,
# so plotting x="index" only works on the older versions.
top_countries = (
    data.explode("country")["country"]
    .value_counts()
    .head(20)
    .rename_axis("country")
    .reset_index(name="titles")
)

plt.figure(figsize=(40, 8))
plt.xticks(rotation=45, fontsize=20)
sns.barplot(data=top_countries, x="country", y="titles")

In [None]:
# Separate frames for TV shows and movies. reset_index() without drop=True
# keeps the original row label as an extra "index" column.
is_tv_show = data["type"] == "TV Show"
is_movie = data["type"] == "Movie"

showdata = data.loc[is_tv_show].reset_index()
moviedata = data.loc[is_movie].reset_index()

In [None]:
# Column labels of the frame.
data.columns

In [None]:
# Number of titles added per ISO week of the year.
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement. Rows without a date_added are
# dropped so the week numbers can be plotted as plain integers.
week_added = data["date_added"].dt.isocalendar().week.dropna().astype("int64")

plt.figure(figsize=(20, 8))
week_axes = sns.countplot(x=week_added)
week_axes.set(
    xlabel="ISO week of date_added",
    ylabel="Number of titles",
    title="Titles added per week of the year",
)
plt.show()

In [None]:
# Titles per individual director, most prolific first.
# explode() only expands list-like cells, and director is still a plain string
# in the visible cells above, so exploding it directly changes nothing. The
# column is split on commas for this computation only; `data` is not modified.
(
    data.assign(director=data["director"].str.split(","))
    .explode("director")
    # Trim the whitespace left around each name by the comma split.
    .assign(director=lambda frame: frame["director"].str.strip())
    .groupby("director")["title"]
    .count()
    .sort_values(ascending=False)
)