# **Best Streaming Service Analysis**

In this project, we will analyse the main streaming services: Netflix, Amazon Prime Video, Disney+ and Hulu.

**Import the necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.offline as pyo
# Set notebook mode to work in offline
pyo.init_notebook_mode()
import plotly.express as px 

%matplotlib inline

In [None]:
# Read the movie catalogue CSV; the path is resolved relative to the
# directory the notebook kernel was started in.
dataset_path = 'Documents/datasets/moviestreams.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Column names as a plain Python list, for a quick look at the schema.
cols = list(df.columns)
cols

In [None]:
# Remove the two identifier-like columns that carry no analytical value
# ('Unnamed: 0' is presumably a leftover CSV index -- confirm against the file).
# errors='ignore' keeps this cell safe to re-run: the previous in-place drop
# raised KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['Unnamed: 0', 'ID'], errors='ignore')
cols = df.columns.tolist()
cols

**Check For Missing Values**

In [None]:
# Number of missing entries in each column (isnull is an alias of isna).
df.isnull().sum()

## removing the **'+'** sign attached to the values in the **Age** column.

In [None]:
# Show the raw Age labels before they are mapped to numbers in the next cell.
df['Age']

In [None]:
# Translate each Age label into the minimum age it stands for ('all' -> 0).
# Labels that are missing from the table (including NaN) map to NaN.
age_map = {
    '18+': 18,
    '7+': 7,
    '13+': 13,
    'all': 0,
    '16+': 16,
}
numeric_age = df['Age'].map(age_map)
df['AgeCopy'] = numeric_age
df['AgeCopy']

## removing the **'%'** sign attached to the values in the **Rotten Tomatoes** column.

In [None]:
# Strip the trailing '%' and store the score as a number.
# The earlier loop never converted anything: `i == str` compares a value with
# the `str` type (always False), `str` has no `.astype`, and the result was
# discarded anyway, so the column stayed text.
# errors='coerce' turns anything that is still not numeric into NaN instead of
# aborting the cell; missing scores stay NaN, so the dtype is float.
rt_without_percent = df['Rotten Tomatoes'].str.replace("%", "", regex=False)
df['New_Rotten_Tomatoes'] = pd.to_numeric(rt_without_percent, errors='coerce')
df['New_Rotten_Tomatoes']

# **Visualisations**

# What Is The Number Of Movies For Each Age Group?

In [None]:
# How many titles carry each Age label, most frequent first.
age_column = df['Age']
age_column.value_counts()

**Number of Movies in specific age group in All services**

In [None]:
# Bar chart of title counts per Age label across the whole catalogue.
age_counts = df['Age'].value_counts()
plt.figure(figsize=(5, 3))
plt.title('Age group to which movie belongs')
sns.barplot(x=age_counts.index, y=age_counts)

**Number of Movies in specific age group in Netflix**

In [None]:
from IPython.display import HTML
import plotly.express as px

# Rows flagged as available on Netflix (Netflix == 1); later cells reuse this frame.
netflix_df = df.loc[df['Netflix'] == 1]
# Title count per Age label; drives bar positions, heights and labels.
netflix_age_counts = netflix_df['Age'].value_counts()
fig = px.bar(
    netflix_df,
    x=netflix_age_counts.index,
    y=netflix_age_counts,
    title="Number of Movies in specific age group in Netflix",
    text=netflix_age_counts,
    height=600,
)
# Draw each label above its bar using the '.2s' number format.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
HTML(fig.to_html())

**Number of Movies in specific age group in Amazon Prime Video**

In [None]:
from IPython.display import HTML
import plotly.express as px

# Rows flagged as available on Prime Video (== 1); later cells reuse this frame.
prime_df = df.loc[df['Prime Video'] == 1]
# Title count per Age label for the Prime Video subset.
prime_age_counts = prime_df['Age'].value_counts()
fig = px.bar(
    prime_df,
    x=prime_age_counts.index,
    y=prime_age_counts,
    title="Number of Movies in specific age group in Amazon Prime Video",
    text=prime_age_counts,
    height=600,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='lightsalmon',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

**Number of Movies in specific age group in Disney+**

In [None]:
from IPython.display import HTML
import plotly.express as px

# Rows flagged as available on Disney+ (== 1); later cells reuse this frame.
Disney_df = df.loc[df['Disney+'] == 1]
# Title count per Age label for the Disney+ subset.
disney_age_counts = Disney_df['Age'].value_counts()
fig = px.bar(
    Disney_df,
    x=disney_age_counts.index,
    y=disney_age_counts,
    title="Number of Movies in specific age group in Disney+ Video",
    text=disney_age_counts,
    height=600,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='red',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

In [None]:
from IPython.display import HTML
import plotly.express as px

# Rows flagged as available on Hulu (== 1); later cells reuse this frame.
Hulu_df = df.loc[df['Hulu'] == 1]
# Title count per Age label for the Hulu subset.
hulu_age_counts = Hulu_df['Age'].value_counts()
fig = px.bar(
    Hulu_df,
    x=hulu_age_counts.index,
    y=hulu_age_counts,
    title="Number of Movies in specific age group in Hulu Video",
    text=hulu_age_counts,
    height=600,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='black',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

**Top 10 languages in Streaming Services**

In [None]:
#counting and assigning the 10 top values to a variable
languages = df.Language.value_counts().head(10)
 
plt.figure(figsize=(12,8))
plt.title('Top 10 languages in Streaming Services')
sns.barplot(x=languages.index, y=languages.values)

In [None]:
from IPython.display import HTML
import plotly.express as px

# Pie chart of the same top-ten language tally: one slice per language value,
# sized by how many titles list it.
language_names = languages.index
language_totals = languages.values
fig = px.pie(
    df,
    values=language_totals,
    names=language_names,
    title='Top 10 languages in Streaming Services',
    height=600,
)
HTML(fig.to_html())

# **Rotten Tomatoes Score**

In [None]:
from IPython.display import HTML
import plotly.express as px

# Frequency of every distinct Rotten Tomatoes value in the whole catalogue.
rt_value_counts = df['Rotten Tomatoes'].value_counts()
fig = px.bar(
    df,
    x=rt_value_counts.index,
    y=rt_value_counts,
    title="Overall Rotten Tomato Ratings",
    text=rt_value_counts,
    height=600,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='blue',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

## **Rotten Tomato Ratings For Each Service**

In [None]:
# For each service, take the first entry of the Rotten Tomatoes value counts.
# value_counts() is ordered most-frequent first, so this is the NUMBER OF TITLES
# sharing that service's most common rating -- a count, not a score.
# `.iloc[0]` makes the positional intent explicit: `[0]` on a Series whose index
# holds rating strings relies on a deprecated integer-key fallback.
service_frames = {
    "Prime Video": prime_df,
    "Hulu": Hulu_df,
    "Disney+": Disney_df,
    "NetFlix": netflix_df,
}
rt_scores = pd.DataFrame({
    'Streaming Service': list(service_frames),
    'Rotten Tomato Score': [
        frame['Rotten Tomatoes'].value_counts().iloc[0]
        for frame in service_frames.values()
    ],
})
rt_scores.head()

In [None]:
# Order the services from the largest to the smallest 'Rotten Tomato Score'.
sorted_rt_score = rt_scores.sort_values(by="Rotten Tomato Score", ascending=False)
sorted_rt_score

In [None]:
# One bar per service, in the order established by sorted_rt_score.
service_names = sorted_rt_score['Streaming Service']
service_scores = sorted_rt_score['Rotten Tomato Score']
fig = px.bar(
    sorted_rt_score,
    x=service_names,
    y=service_scores,
    title="Rotten Tomato Ratings For Each Services",
    text=service_scores,
    height=600,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='purple',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

In [None]:
# Frequency of every distinct IMDb rating in the whole catalogue.
imdb_value_counts = df['IMDb'].value_counts()
fig = px.bar(
    df,
    x=imdb_value_counts.index,
    y=imdb_value_counts,
    title="Overall IMDb Ratings For All Services",
    text=imdb_value_counts,
    height=600,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='purple',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

In [None]:
# Ten most common runtimes and how many titles have each.
# value_counts() already orders by frequency (descending), so head(10) is the
# top ten. head() is strictly positional; the earlier `[:10]` slice is
# interpreted as a *label* slice by several pandas versions when the index is
# float-typed (as Runtime's presumably is, given missing values -- confirm),
# which would not select "the first ten rows".
RuntimeCount = (
    df['Runtime']
    .value_counts()
    .head(10)
    .rename_axis('Runtime')
    .reset_index(name='Count')
)

RuntimeCount

In [None]:
# Plot straight from RuntimeCount (the frame that actually holds the data) and
# label every bar with its Count. The earlier version labelled the bars with the
# Runtime value -- i.e. it repeated the x position instead of showing the bar
# height -- and passed the unrelated full catalogue `df` as the data frame.
fig = px.bar(
    RuntimeCount,
    x='Runtime',
    y='Count',
    title="Count Of Runtimes Of Movies",
    text='Count',
    height=600,
)
fig.update_traces(marker_color='purple', texttemplate='%{text:.2s}', textposition='outside')
HTML(fig.to_html())

In [None]:
# Tally how many titles each director is credited on. A 'Directors' cell may
# list several names separated by commas; each name is counted once per title.
# NOTE(review): `!= np.nan` is True for every row (NaN never compares equal to
# anything), so this filter keeps all rows. Missing directors are therefore
# stringified to 'nan' below and counted as a director; a later cell removes a
# row from the ranking by a hard-coded index label. Changing the filter here
# would shift the resulting row labels, so behaviour is deliberately preserved.
new_data = df[df['Directors'] != np.nan]
direc_in_data = list(new_data['Directors'])
directors_count = {}
for directors_field in direc_in_data:
    for director in str(directors_field).split(","):
        directors_count[director] = directors_count.get(director, 0) + 1

In [None]:
# Turn the tally into a two-column frame and keep the 20 most credited names.
DirCount = (
    pd.DataFrame(directors_count.items(), columns=['Director', 'Count'])
    .sort_values(by='Count', ascending=False)
    .head(20)
)
DirCount

In [None]:
# Remove the placeholder entry produced by titles without a director: missing
# values are stringified to 'nan' when the tally is built.
# Filtering on the label replaces a hard-coded `drop(56)` (presumably the index
# of that 'nan' row -- confirm against the table above), which depended on the
# dataset's row order and raised KeyError when the cell was run a second time.
DirCount = DirCount[DirCount['Director'] != 'nan']
DirCount

In [None]:
# One bar per director in the ranking, labelled with the number of credited titles.
director_names = DirCount['Director']
director_totals = DirCount['Count']
fig = px.bar(
    DirCount,
    x=director_names,
    y=director_totals,
    title="Directors And The Count Of Movies They Have Directed",
    text=director_totals,
    height=600,
)
# textposition='outside' draws each label above its bar.
fig.update_traces(
    marker_color='purple',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

### **Exploring Genres**

In [None]:
# Count how many titles belong to each individual genre.
# `genres_` maps every distinct comma-separated genre combination to the number
# of titles that carry it. Each genre in a combination must be credited with
# that number of titles; the earlier loop added 1 per combination and ignored
# `count`, so it measured "distinct combinations containing the genre" rather
# than "movies in the genre".
genres_ = dict(df['Genres'].value_counts())
count_genres = dict()
for genre_combo, n_titles in genres_.items():
    for genre in genre_combo.split(","):
        # int() keeps plain Python integers in the tally instead of numpy scalars.
        count_genres[genre] = count_genres.get(genre, 0) + int(n_titles)

In [None]:
# Two-column frame (Genre, Count) in the order the genres were first encountered.
count_genres_df = pd.DataFrame({
    'Genre': list(count_genres.keys()),
    'Count': list(count_genres.values()),
})
count_genres_df

In [None]:
# One bar per genre, labelled with its tally.
genre_names = count_genres_df['Genre']
genre_totals = count_genres_df['Count']
fig = px.bar(
    count_genres_df,
    x=genre_names,
    y=genre_totals,
    title="Genre And their Counts",
    text=genre_totals,
    height=600,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='lightsalmon',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

# **Top movies on each platform**

In [None]:
# Netflix titles rated strictly above 8.5 on IMDb, best first.
netflix_top_movies = netflix_df[netflix_df['IMDb'] > 8.5]
netflix_top_movies_df = (
    netflix_top_movies[['Title', 'IMDb']]
    .rename(columns={'Title': 'movie_name', 'IMDb': 'imdb_ratings'})
    .sort_values(by='imdb_ratings', ascending=False)
)
netflix_top_movies_df

In [None]:
# Horizontal bars: one per top-rated Netflix title, length = IMDb rating.
# The chart title previously misspelled the service name as "Netfilx".
plt.figure(figsize=(5, 10))
plt.title('Highest rated movies on Netflix')
sns.barplot(x=netflix_top_movies_df['imdb_ratings'], y=netflix_top_movies_df['movie_name'])

In [None]:
# Prime Video titles rated strictly above 8.8 on IMDb, best first.
# NOTE(review): this cut-off differs from the 8.5 used for the other services.
prime_top_movies = prime_df[prime_df['IMDb'] > 8.8]
prime_top_movies_df = (
    prime_top_movies[['Title', 'IMDb']]
    .rename(columns={'Title': 'movie_name', 'IMDb': 'imdb_ratings'})
    .sort_values(by='imdb_ratings', ascending=False)
)
prime_top_movies_df

In [None]:
# Horizontal bars: one per top-rated Prime Video title, length = IMDb rating.
prime_ratings = prime_top_movies_df['imdb_ratings']
prime_titles = prime_top_movies_df['movie_name']
plt.figure(figsize=(5, 10))
plt.title('Highest rated movies on Prime')
sns.barplot(x=prime_ratings, y=prime_titles)

In [None]:
# Disney+ titles rated strictly above 8.5 on IMDb (title and rating only), best first.
disney_high_rated = Disney_df['IMDb'] > 8.5
disney_top = (
    Disney_df.loc[disney_high_rated, ['Title', 'IMDb']]
    .sort_values(by='IMDb', ascending=False)
)
disney_top

In [None]:
# One bar per top-rated Disney+ title, labelled with its IMDb rating.
disney_titles = disney_top['Title']
disney_ratings = disney_top['IMDb']
fig = px.bar(
    disney_top,
    x=disney_titles,
    y=disney_ratings,
    title="Top Movies On Disney+",
    text=disney_ratings,
    height=800,
)
# Colour the bars and draw each label above its bar.
fig.update_traces(
    marker_color='lightblue',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

In [None]:
# Hulu titles rated strictly above 8.5 on IMDb (title and rating only), best first.
hulu_high_rated = Hulu_df['IMDb'] > 8.5
hulu_top = (
    Hulu_df.loc[hulu_high_rated, ['Title', 'IMDb']]
    .sort_values(by='IMDb', ascending=False)
)
hulu_top

In [None]:
# One bar per top-rated Hulu title, labelled with its IMDb rating.
hulu_titles = hulu_top['Title']
hulu_ratings = hulu_top['IMDb']
fig = px.bar(
    hulu_top,
    x=hulu_titles,
    y=hulu_ratings,
    title="Top Movies On Hulu",
    text=hulu_ratings,
    height=800,
)
# textposition='outside' draws each label above its bar.
fig.update_traces(
    marker_color='purple',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
HTML(fig.to_html())

# **Average movie runtime on each platform**

In [None]:
# Mean of the Runtime column for each service's subset, one row per service.
runtime_sources = {
    'Netflix_avg': netflix_df,
    'Prime_avg': prime_df,
    'Disney_avg': Disney_df,
    'Hulu_avg': Hulu_df,
}
avg_runtime = pd.DataFrame(
    {'avg_across_platforms': [frame['Runtime'].mean() for frame in runtime_sources.values()]},
    index=list(runtime_sources),
)
avg_runtime

In [None]:
# Plain-text view of the four averages as a NumPy array.
platform_averages = avg_runtime['avg_across_platforms'].values
print(platform_averages)

In [None]:
# Row labels of avg_runtime; these become the x-axis categories in the chart below.
avg_runtime.index

In [None]:
# One bar per service showing the mean runtime of its titles.
platform_averages = avg_runtime['avg_across_platforms'].values
fig = px.bar(
    avg_runtime,
    x=avg_runtime.index,
    y=platform_averages,
    title="average runtime of movies across platforms",
    text=platform_averages,
    height=800,
)
# Label with one fixed decimal. The '.2s' format used elsewhere keeps only two
# significant digits, so an average such as 104.3 would be printed as "100" and
# services with different averages could show identical labels.
fig.update_traces(marker_color='purple', texttemplate='%{text:.1f}', textposition='outside')
HTML(fig.to_html())

# **Movies Released after 1990**

In [None]:
# Titles whose Year is strictly greater than 1990, reduced to name and release year.
released_after_1990 = df[df['Year'] > 1990]
released_after_1990_df = released_after_1990[['Title', 'Year']].rename(
    columns={'Title': 'movie_name', 'Year': 'release_year'}
)
released_after_1990_df