# **Project : Top Movie Streaming**

***Analyzing Main Streaming Services***

## Skills Involved :

**DATA ANALYSIS**

**DATA VISUALISING**

**DATA CLEANSING**

**PYTHON**

## Starting :

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML
import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.express as px
import random
%matplotlib inline

Output hidden; open in https://colab.research.google.com to view.

### Coding Ground

#### Data Cleansing

In [2]:
# Load the streaming catalogue from the CSV in the working directory.
df = pd.read_csv('moviestreams.csv')
# 'Unnamed: 0' is presumably a saved index column left over from an earlier
# export. errors='ignore' keeps this cell re-runnable once the file has been
# re-saved without it (see the commented-out to_csv line below).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# df.to_csv('moviestreams.csv',index=False)
df

FileNotFoundError: ignored

In [None]:
# Palette from which the chart helper functions pick a random bar colour.
colors=['brown','red','orange','salmon','purple','blue','green','lightblue','lightsalmon']
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
df.count()

In [None]:
cols = df.columns.to_list()
cols

In [None]:
df.isna().sum()

REMOVING '+' IN AGE (step currently disabled: the code below is commented out) :

In [None]:
#Age={'18+':18,'7+':7,'13+':13,'all':0,'16':16}
#df.Age=df.Age.map(Age)
#df

REMOVING '%' IN ROTTEN TOMATOES

In [None]:
# Strip the '%' sign from the Rotten Tomatoes strings and store them as floats.
df['Rotten Tomatoes'] = (
    df['Rotten Tomatoes']
    .str.replace('%', '')
    .astype(float)
)
df

#### Top 10 languages in Streaming Movies

In [None]:
# Ten most frequent values of the Language column as a two-column frame.
language = pd.DataFrame(
    dict(df.Language.value_counts().head(10)).items(),
    columns=['Languages', 'No. Of Movies'],
)
# Bar chart with the count printed above each bar.
fig = px.bar(
    language,
    x=language['Languages'],
    y=language['No. Of Movies'],
    text=language['No. Of Movies'],
    height=600,
    title='Top 10 languages in Streaming Movies',
)
fig.update_traces(textposition='outside', texttemplate='%{text:.4s}')
fig.show()
#HTML(fig.to_html())

In [None]:
# Pie chart of the `language` frame: one slice per language, sized by count.
fig = px.pie(
    language,
    names=language['Languages'],
    values=language['No. Of Movies'],
    height=600,
    title='Top 10 languages in Streaming Services',
)
#fig.update_traces(textposition='outside')
fig.show()
#HTML(fig.to_html())

#### Number of Movies in specific age group in All services

In [None]:
#Age Graph Functions
def making_ageGraph(df:pd.DataFrame,stream:str,height:float=600):
    color=random.choice(colors)
    df={'Age':df.Age.value_counts().index,'Counts':df.Age.value_counts()}
    fig = px.bar(df, 
             x='Age',
             y='Counts',
             title=f"Number of Movies in specific age group in {stream} service",
             text='Counts', 
             height=height)
    fig.update_traces(marker_color=color,texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
    fig.show()
    #return HTML(fig.to_html())

In [None]:
making_ageGraph(df,'All')

#### Number of Movies in specific age group in Netflix

In [None]:
# Rows whose 'Netflix' column equals 1, then their age-rating chart.
netflix_df = df.loc[df['Netflix'] == 1]
making_ageGraph(netflix_df, stream='Netflix')

#### Number of Movies in specific age group in Amazon Prime Video

In [None]:
# Rows whose 'Prime Video' column equals 1, then their age-rating chart.
prime_df = df.loc[df['Prime Video'] == 1]
making_ageGraph(prime_df, stream='Amazon Prime Video')

#### Number of Movies in specific age group in Disney+

In [None]:
# Rows whose 'Disney+' column equals 1, then their age-rating chart.
Disney_df = df.loc[df['Disney+'] == 1]
making_ageGraph(Disney_df, stream='Disney+')

#### Number of Movies in specific age group in Hulu

In [None]:
# Rows whose 'Hulu' column equals 1, then their age-rating chart.
Hulu_df = df.loc[df['Hulu'] == 1]
making_ageGraph(Hulu_df, stream='Hulu')

#### Rotten Tomatoes Score

A Tomatometer score is calculated for a movie or TV show after it receives at least five reviews. When at least 60% of reviews for a movie or TV show are positive, a red tomato is displayed to indicate its Fresh status.

Rotten Tomatoes gives films a score out of 100 equal to the percentage of professional critics' reviews that are positive. If a film scores 60 or more it gets a 'fresh' red tomato on the site; below 60 it gets a rotten tomato.

In [None]:
# How often each distinct Rotten Tomatoes score occurs in the whole frame.
fig = px.bar(
    df,
    x=df['Rotten Tomatoes'].value_counts().index,
    y=df['Rotten Tomatoes'].value_counts(),
    text=df['Rotten Tomatoes'].value_counts(),
    height=600,
    title="Overall Rotten Tomato Ratings",
)
# Print the count above each bar.
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.show()
#HTML(fig.to_html())

In [None]:
# Per-service summary of the most frequent Rotten Tomatoes score:
#   'Highest Value'       - the score that occurs most often on the service
#   'Rotten Tomato Score' - how many titles on the service have that score
# Labels and frames live in one mapping so a service name can never be paired
# with another service's figures.
service_frames = {
    "NetFlix": netflix_df,
    "Prime Video": prime_df,
    "Disney+": Disney_df,
    "Hulu": Hulu_df,
}
rt_counts = {
    service: frame['Rotten Tomatoes'].value_counts()
    for service, frame in service_frames.items()
}
rt_scores = pd.DataFrame({
    'Streaming Service': list(rt_counts),
    # value_counts() sorts by descending frequency, so position 0 is the mode.
    'Rotten Tomato Score': [counts.iloc[0] for counts in rt_counts.values()],
    'Highest Value': [counts.index[0] for counts in rt_counts.values()],
})
rt_scores.head()

In [None]:
# Matplotlib bar chart of the per-service counts, largest first.
(
    rt_scores
    .sort_values(by="Rotten Tomato Score", ascending=False)
    .plot.bar(
        x='Streaming Service',
        y='Rotten Tomato Score',
        color='Violet',
        title="Streaming Service with 100% Rotten Tomato Score",
    )
)
plt.show()

In [None]:
# Same per-service counts, largest first, drawn with plotly.
sorted_rt_score = rt_scores.sort_values(by="Rotten Tomato Score", ascending=False)
fig = px.bar(
    sorted_rt_score,
    x=sorted_rt_score['Streaming Service'],
    y=sorted_rt_score['Rotten Tomato Score'],
    text=sorted_rt_score['Rotten Tomato Score'],
    height=600,
    title="Rotten Tomato Ratings For Each Services",
)
# Print the value above each bar.
fig.update_traces(
    marker_color='purple',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
fig.show()
#HTML(fig.to_html())

#### IMDB Ratings

In [None]:
#IMDb Graph Functions
def making_IMDbGraph(df:pd.DataFrame,stream:str,height:float=600):
    color=random.choice(colors)
    df=pd.DataFrame(dict(df['IMDb'].value_counts()).items(),columns=['IMDb','Counts'])
    fig = px.bar(df, 
             x=df['IMDb'],
             y=df['Counts'],
             title=f"Overall IMDb Ratings For {stream} Service",
             text=df['IMDb'].value_counts(), 
             height=height)
    fig.update_traces(marker_color=color,texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
    fig.show()
    #return HTML(fig.to_html())

##### For All Service

In [None]:
making_IMDbGraph(df,'All')

##### For Netflix

In [None]:
making_IMDbGraph(netflix_df,'Netflix')

##### For Amazon Prime

In [None]:
making_IMDbGraph(prime_df,'Amazon Prime')

##### For Disney+

In [None]:
# The second argument is interpolated into the chart title.
making_IMDbGraph(Disney_df,'Disney+')

##### For Hulu

In [None]:
making_IMDbGraph(Hulu_df,'Hulu')

#### Count of Runtime of Movies

In [None]:
# Ten most common runtimes and how many titles have each.
# value_counts() already orders by descending count, and head(10) takes the
# first ten rows by position whatever the dtype of the Runtime labels. A
# [:10] slice on a float-labelled index is label-based and is not guaranteed
# to return ten rows.
RuntimeCount = (
    df.Runtime.value_counts()
    .head(10)
    .rename_axis('Runtime')
    .reset_index(name='Counts')
)
RuntimeCount

In [None]:
# Bar per runtime, bar height = number of titles, labelled with the runtime.
fig = px.bar(
    RuntimeCount,
    x='Runtime',
    y='Counts',
    text=RuntimeCount['Runtime'],
    height=600,
    title="Count Of Runtimes Of Movies",
)
fig.update_traces(
    marker_color='purple',
    textposition='outside',
    texttemplate='%{text:.2s}',
)
fig.show()
#HTML(fig.to_html())

#### Name of Directors and No. of movies directed by them

In [None]:
df.Directors.value_counts()

In [None]:
# Frequency of each distinct Directors value, trimmed to the 20 most frequent.
DirCount = (
    pd.DataFrame(
        dict(df.Directors.value_counts()).items(),
        columns=['Director', 'No. Of Movies'],
    )
    .sort_values(by='No. Of Movies', ascending=False)
    .head(20)
)
DirCount

In [None]:
# One bar per director in DirCount, labelled with the movie count.
fig = px.bar(
    DirCount,
    x=DirCount['Director'],
    y=DirCount['No. Of Movies'],
    text=DirCount['No. Of Movies'],
    height=600,
    title="Directors And The Count Of Movies They Have Directed",
)
fig.update_traces(textposition='outside')
fig.show()
#HTML(fig.to_html())

#### Movies Directed By Director

In [None]:
def movieDirectedBy(df:pd.DataFrame,name:str):
    """Display a bar chart of the IMDb rating of every title directed by ``name``.

    Args:
        df: frame with ``Directors``, ``Title``, ``IMDb`` and ``Genres`` columns.
        name: director to look up; compared for exact equality with the
            ``Directors`` value, so rows listing several directors in one
            cell are not matched.

    Nothing is returned; the figure is shown with ``fig.show()``.
    """
    # Filter on the requested director. fillna() returns a new frame, so the
    # caller's data is left untouched and pandas raises no chained-assignment
    # warning. Missing values are displayed as the text 'null'.
    dfn = df[df.Directors == name].fillna('null')
    fig = px.bar(
        dfn,
        x='Title',
        y='IMDb',
        title=f"Movies Directed By {name}",
        text='Genres',
        height=600,
    )
    fig.update_traces(marker_color='salmon', textfont_size=10, textposition='inside')
    fig.show()
    #return HTML(fig.to_html())

In [None]:
movieDirectedBy(df,'Joseph Kane')

#### No. Of Movies Of Different Genres

In [None]:
# Count how many movies belong to each genre.
# Each Genres cell is a comma-separated list, so split it, give every genre
# its own row (explode) and count the rows. Counting is per movie, so a genre
# combination shared by many titles contributes once for each of them.
genres_df = (
    df.Genres.dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()
    .rename_axis('Genres')
    .reset_index(name='No. Of Movies')
)
genres_df

In [None]:
# One bar per genre in genres_df, labelled with its count.
fig = px.bar(
    genres_df,
    x=genres_df['Genres'],
    y=genres_df['No. Of Movies'],
    text=genres_df['No. Of Movies'],
    height=600,
    title='Movies In Different Genres',
)
fig.update_traces(
    marker_color='brown',
    textposition='outside',
    textfont_size=10,
)
fig.show()
#HTML(fig.to_html())

#### Top Movies

In [None]:
def topMoviesIn(df:pd.DataFrame,stream:str,over:float=8.5):
    """Display a bar chart of the titles in ``df`` whose IMDb rating exceeds ``over``.

    Args:
        df: frame with ``Title``, ``IMDb`` and ``Genres`` columns.
        stream: service name interpolated into the chart title.
        over: strict lower bound on the IMDb rating.

    Bars are ordered by descending rating and carry the genre text inside.
    The bar colour is drawn at random from the notebook-level ``colors`` list.
    Nothing is returned; the figure is shown with ``fig.show()``.
    """
    bar_color = random.choice(colors)
    above_cutoff = df.loc[df['IMDb'] > over, ['Title', 'IMDb', 'Genres']]
    data = above_cutoff.sort_values(by='IMDb', ascending=False)
    fig = px.bar(
        data,
        x=data['Title'],
        y=data['IMDb'],
        text=data['Genres'],
        height=600,
        title=f'Top Movies in {stream}',
    )
    fig.update_traces(textposition='inside', marker_color=bar_color)
    fig.show()
    #return HTML(fig.to_html())

##### On Netflix

In [None]:
topMoviesIn(netflix_df,'Netflix')

##### On Amazon Prime

In [None]:
topMoviesIn(prime_df,'Amazon Prime',8.8)

##### On Disney+

In [None]:
topMoviesIn(Disney_df,'Disney+',8)

##### On Hulu

In [None]:
topMoviesIn(Hulu_df,'Hulu',8)

#### Movies Before 1990

In [None]:
def MoviesBefore(df:pd.DataFrame,stream:str,before:int=1990):
    """Display the 20 highest-rated titles in ``df`` released before ``before``.

    Args:
        df: frame with ``Year``, ``IMDb`` and ``Title`` columns.
        stream: service name interpolated into the chart title.
        before: exclusive upper bound on the release year; also shown in the
            chart title.

    The bar colour is drawn at random from the notebook-level ``colors`` list.
    Nothing is returned; the figure is shown with ``fig.show()``.
    """
    color = random.choice(colors)
    # Filter by year, then keep the 20 best IMDb ratings (first occurrence
    # wins on ties).
    older = df[df.Year.astype(int) < before].nlargest(20, 'IMDb', keep='first')
    fig = px.bar(
        older,
        y='Title',
        x='Year',
        # Use the actual cut-off so the title stays correct for non-default
        # values of `before`.
        title=f'Movies Before {before} On {stream} Stream',
        text='Year',
        height=600,
    )
    fig.update_traces(marker_color=color, textposition='inside')
    fig.show()
    #return HTML(fig.to_html())
    

##### On All Streams

In [None]:
MoviesBefore(df,'All')

##### On Netflix

In [None]:
MoviesBefore(netflix_df,'Netflix')

##### On Amazon Prime

In [None]:
MoviesBefore(prime_df,'Amazon Prime')

##### On Disney+

In [None]:
MoviesBefore(Disney_df,'Disney+')

##### On Hulu

In [None]:
MoviesBefore(Hulu_df,'Hulu')

#### Average Screen Time

In [None]:
# Mean Runtime per service frame, plus the mean over the whole catalogue.
netflix_avg, prime_avg, Disney_avg, Hulu_avg = (
    frame.Runtime.mean()
    for frame in (netflix_df, prime_df, Disney_df, Hulu_df)
)
avg = [df.Runtime.mean(), netflix_avg, prime_avg, Disney_avg, Hulu_avg]
# Labels are listed in the same order as the values in `avg`.
AvgDf = pd.DataFrame(
    {
        'Streaming Service': 'Overall Netflix Amazon Disney+ Hulu'.split(' '),
        'Screen Time': avg,
    }
)
AvgDf

In [None]:
# Horizontal bars: one per service, length = mean runtime, labelled in minutes.
fig = px.bar(
    AvgDf,
    x=AvgDf['Screen Time'],
    y=AvgDf['Streaming Service'],
    text=AvgDf['Screen Time'],
    height=600,
    title='Screen Time On Each Stream',
)
fig.update_traces(textposition='inside', texttemplate='%{text:.4s} mins')
fig.show()
#HTML(fig.to_html())

In [None]:
!jupyter nbconvert --to html TopMovieStreaming.ipynb