In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Exploration

In [2]:
# Load the user-base CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('Netflix Userbase.csv')

In [3]:
# Preview the first 5 rows of the dataset.
df.head(n=5)

In [4]:
# Preview the last 5 rows of the dataset.
df.tail(n=5)

In [5]:
# Dataset shape as a (rows, columns) tuple
df.shape

In [6]:
# Summary of the dataset: column names, non-null counts and dtypes
df.info()

In [7]:
# Recast both date columns to datetime64[ns] so they support date arithmetic
# and min/max comparisons further down.
# NOTE(review): the cast relies on pandas inferring the date format from the
# text. If the CSV stores day-first dates, ambiguous values could be read
# month-first — confirm against the raw file.
for date_col in ('Join Date', 'Last Payment Date'):
    df[date_col] = df[date_col].astype('datetime64[ns]')

In [8]:
# Alternative (kept for reference, not executed): recast with pd.to_datetime
# df['Join Date'] = pd.to_datetime(df['Join Date'])
# df['Last Payment Date'] = pd.to_datetime(df['Last Payment Date'])

In [9]:
# Confirm the recast: look up the dtype now recorded for 'Join Date'.
df.dtypes['Join Date']

In [10]:
# Mark every row that exactly repeats an earlier row, then report whether
# any such row exists.
duplicate_mask = df.duplicated()
print(duplicate_mask.any())

In [11]:
# Summary statistics for the Monthly Revenue and Age columns.
numeric_cols = ['Monthly Revenue', 'Age']
df.loc[:, numeric_cols].describe()

In [12]:
# Summary (count, unique, top, freq) restricted to object-dtype columns.
df.describe(include=["O"])

In [13]:
# Check outliers: side-by-side box plots of Monthly Revenue and Age.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
ax[0].boxplot(df['Monthly Revenue'])
ax[0].set_title("Monthly Revenue")
ax[1].boxplot(df['Age'])
ax[1].set_title("Age")
# Render explicitly so the cell does not echo the repr of the last
# set_title() call; this matches the other plotting cells in the notebook.
plt.show()

In [14]:
# Number of distinct values in each column
df.nunique()

In [15]:
# Most common subscriber age: tally each age, then take the age with the
# highest tally.
age_counts = df['Age'].value_counts()
age_counts.idxmax()

In [16]:
# Number of subscribers in each subscription type, most common first
df['Subscription Type'].value_counts()

In [17]:
# Number of subscribers per device, most common first
df['Device'].value_counts()

In [18]:
# Number of subscribers per value of the Gender column, most common first
df['Gender'].value_counts()

In [19]:
# Country with the most subscribers: count rows per country and take the
# label with the largest count.
country_counts = df['Country'].value_counts()
country_counts.idxmax()

In [20]:
# Country with the fewest subscribers: count rows per country and take the
# label with the smallest count.
country_counts = df['Country'].value_counts()
country_counts.idxmin()

In [21]:
# The most recent subscription (join) date in the dataset
df['Join Date'].max()

In [22]:
# The earliest subscription (join) date in the dataset
df['Join Date'].min()

In [23]:
# The most used device: count rows per device and take the label with the
# largest count.
device_counts = df['Device'].value_counts()
device_counts.idxmax()

## Visualizing

In [24]:
# Total monthly revenue per subscription type, computed in one grouped pass
# instead of three copy-pasted boolean-mask filters.
# reindex(..., fill_value=0) fixes the Basic/Standard/Premium order and keeps
# a total of 0 for any plan that has no rows, as the filtered sums did.
revenue_by_type = (
    df.groupby('Subscription Type')['Monthly Revenue']
      .sum()
      .reindex(['Basic', 'Standard', 'Premium'], fill_value=0)
)
Basic, Standard, Premium = revenue_by_type

In [25]:
# Column-oriented table for plotting: one entry per subscription type, paired
# with its total monthly revenue.
RevData = {
    'Subscription Type': ['Basic', 'Standard', 'Premium'],
    'Monthly Revenue': [Basic, Standard, Premium],
}

In [26]:
# Bar chart of total monthly revenue per subscription type; the title is set
# on the Axes that seaborn returns.
revenue_ax = sns.barplot(data=RevData, x='Subscription Type', y='Monthly Revenue')
revenue_ax.set_title('Total monthly revenue of each Netflix subscription type')
plt.show()

In [27]:
# Number of subscribers per device; used as the wedge sizes and labels of the
# device pie chart.
devices = df['Device'].value_counts()

In [28]:
# Share of subscribers per device. '%%' is an escaped percent sign, so each
# wedge label is a percentage with a trailing '%' rather than a bare number.
plt.pie(devices, labels=devices.index, autopct='%.2f%%')
plt.title('Devices Which is used for netflix')
plt.show()

In [29]:
# Number of subscribers per value of the Gender column; used as the wedge
# sizes and labels of the gender pie chart.
gender = df['Gender'].value_counts()

In [30]:
# Share of subscribers per gender. '%%' is an escaped percent sign, so each
# wedge label is a percentage with a trailing '%' rather than a bare number.
plt.pie(gender, labels=gender.index, autopct='%.2f%%')
plt.title('Gender')
plt.show()

In [31]:
# Subscription type counts by country.
# Count User IDs per (country, subscription type) pair, then pivot the
# subscription types into columns so each country becomes one stacked bar.
subscription_counts = (
    df.groupby(['Country', 'Subscription Type'])['User ID']
      .count()
      .unstack()
)
subscription_counts.plot.bar(stacked=True)
plt.title('Subscription types counts by country')
plt.xlabel('Country')
plt.ylabel('Number of subscriptions')
plt.legend(title='Subscription Type')
plt.show()

In [32]:
# Distribution of subscribers by age.
# value_counts() orders its result by frequency, which would scatter the ages
# along the x-axis. Sort by the age value before plotting so the axis runs in
# ascending age; `Age` itself keeps the frequency-ordered counts.
Age = df['Age'].value_counts()
Age.sort_index().plot(kind='bar')
plt.xlabel('Age')
plt.ylabel('Number of subscribers')
plt.title('Distribution of subscribers')
plt.show()