# Understanding data

In [None]:
# Importing libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AffinityPropagation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly as py
import plotly.graph_objs as go
import datetime as dt
import missingno as msno
from wordcloud import WordCloud
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot

In [None]:
# Load the dataset from "netflix.csv" into the DataFrame `df`.
# The path is relative, so the file must be in the current working directory.
df = pd.read_csv("netflix.csv")

In [None]:
# Preview the top 5 rows of the dataset
df.head(5)

In [None]:
# Summary statistics for the dataset, using DataFrame.describe() with its
# default settings
df.describe()

In [None]:
# Print a concise summary of the DataFrame using DataFrame.info()
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Visualise where values are missing with missingno's matrix plot,
# drawn with figsize (10, 2)
msno.matrix(df,figsize = (10,2))

# Working with missing data

In [None]:
# Fill missing 'country' values with the most frequent country in the column.
df['country'] = df['country'].fillna(df['country'].mode()[0])

# Fill missing 'cast' and 'director' with the placeholder "No Data" so that
# those rows are kept rather than dropped.
# The filled column is assigned back to df instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form is deprecated in pandas 2.x and does not update df when
# Copy-on-Write is enabled.
df['cast'] = df['cast'].fillna('No Data')
df['director'] = df['director'].fillna('No Data')

# Drop every row that still has a missing value in any remaining column.
df.dropna(inplace=True)

# Drop exact duplicate rows.
df.drop_duplicates(inplace=True)

In [None]:
# Re-count missing values per column to confirm none are left
df.isna().sum()

# Visualizations

In [None]:
# Bar chart comparing how many entries are TV shows and how many are movies
sns.set()
ax = sns.countplot(data=df, x="type", palette="Set2")

In [None]:
# Bar chart of the number of movies/shows per age restriction (rating),
# limited to the 15 most frequent ratings, most frequent first
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
top_ratings = df["rating"].value_counts().index[:15]
ax = sns.countplot(data=df, x="rating", order=top_ratings, palette="Set2")

In [None]:
# Horizontal bar chart of the number of movies/shows per release year,
# limited to the 15 most frequent years, most frequent first
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
top_years = df["release_year"].value_counts().index[:15]
ax = sns.countplot(data=df, y="release_year", order=top_years, palette="Set2")

In [None]:
# Join every row's 'listed_in' text into one space-separated string.
# Each value is already a string, so it is joined directly; the previous
# per-row "".join(x) returned the same string unchanged.
all_genres = " ".join(df["listed_in"])

# Build the word cloud from the combined genre text (default WordCloud settings)
wordcloud = WordCloud().generate(all_genres)

# Render the word cloud as an image with the axes hidden.
# plt is already imported at the top of the notebook.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()