In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

Presenting in-depth interviews with a range of fascinating and accomplished guests, Sanyam Bhutani's Chai Time Data Science is a trailblazing show for the data science and machine learning community.

After having a dream of seeing a CTDS video inside a Kaggle Notebook, Sanyam heeded the sign from the data gods and has prepared a package on the first 85 episodes for us to explore.

I was curious to see what insights the data might reveal about the audience of the featured episodes.
What patterns are there, if any, with regard to which shows were best received?

# Summary of Findings

In [5]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
import glob
import re
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Exploration

**Note about visualisations**

In this exploration I will practice using the Plotly library to visualise my analysis. In addition to producing clean, well-formatted default graphics that are easy to polish, Plotly has a neat hover feature, displaying customisable information when hovering the cursor over a data point.

**Preparing the data for use**

As our data is in `.csv` format let us first pull it into pandas DataFrames so it is ready to work with:

In [6]:
anchor_thumb_types = pd.read_csv('Anchor Thumbnail Types.csv')
yt_thumb_types = pd.read_csv('YouTube Thumbnail Types.csv')
descrip = pd.read_csv('Description.csv')
episodes = pd.read_csv('Episodes.csv')
sub_files = glob.glob('Cleaned Subtitles/*')
subs_combined = pd.concat([pd.read_csv(file) for file in sub_files])
all_sets = [anchor_thumb_types, yt_thumb_types, descrip, episodes, subs_combined]

FileNotFoundError: [Errno 2] File Anchor Thumbnail Types.csv does not exist: 'Anchor Thumbnail Types.csv'

In [None]:
# 1) 'Description.csv'
print(descrip.shape)
print(descrip.head(5))
print(descrip.info())

This is a text-rich set summarising each episode. Valuable for content analysis but less so for high-level show statistics.

In [None]:
# 1) 'Episodes.csv'
print(episodes.shape)
print(episodes.head(5))
print(episodes.info())

This looks juicier! The dataset contains several interesting features about the guests,  for each episode

In [None]:
episodes['yt_likes_per_view'] = episodes.apply(lambda row: row['youtube_likes']/row['youtube_views'], axis=1) 

In [None]:
episodes[episodes['heroes'].isna()]

In [None]:
episodes.heroes.fillna('Non-guest episode; '+episodes.episode_name, inplace=True)

Histograms in subplots for each platform

In [None]:
platforms = ['Anchor Plays', 'YouTube Views', 'Spotify Streams', 'Apple Listeners' ]

fig5_data = go.Bar(x=platforms, y=episodes[[
    'anchor_plays',
    'youtube_views',
    'spotify_streams',
    'apple_listeners'
     ]].sum(),
    marker_color=['#8C44FC','#FC0808','#1CD463','#1CB7C7']
                  )
fig5 = go.Figure(data=fig5_data)
fig5.update_layout(title='Total Plays by Platform')

In [None]:
from plotly.subplots import make_subplots


hist_subplots = make_subplots(rows=2, cols=2)
hist_subplots.add_histogram(x=episodes.youtube_views,
                            marker_color='#FC0808',
                            row=1, col=1,
                            name='YouTube Views')

hist_subplots.add_histogram(x=episodes.spotify_streams,
                            marker_color='#1CD463',
                            row=1, col=2,
                            name='Spotify Streams')

hist_subplots.add_histogram(x=episodes.anchor_plays,
                            marker_color='#8C44FC',
                            row=2, col=1,
                            name='Anchor Plays')

hist_subplots.add_histogram(x=episodes.apple_listeners,
                            marker_color='#1CB7C7', row=2, col=2,
                            name='Apple Listeners')

hist_subplots.update_layout(title_text='Distribution of Total Plays per Platform')

In [None]:
avg_play_subplots = make_subplots(rows=2, cols=1)
avg_play_subplots.add_histogram(x=episodes.youtube_avg_watch_duration,
                            marker_color='#FC0808',
                            row=1, col=1,
                            name='YouTube')

avg_play_subplots.add_histogram(x=episodes.apple_avg_listen_duration,
                            marker_color='#1CB7C7',
                            row=2, col=1,
                            name='Apple')
avg_play_subplots.update_layout(title_text='Distribution of Average Play Length per Platform (in Seconds; All Available Data)')

In [None]:
heroes_cat = list(episodes.category.unique())

fig6_data = go.Bar(x=heroes_cat,
                   y=episodes.groupby('category').yt_likes_per_view.mean(),
                   marker_color=['#07F9EC','#FCF9BA','#6FBEE2','#F5BAD2'])
fig6 = go.Figure(data=fig6_data)
fig6.update_layout(title='YouTube Likes per View by Guest Category')

In [None]:
heroes_gender= ['Female', 'Male']

fig4_data = go.Bar(x=heroes_gender,
                   y=episodes.groupby('heroes_gender').yt_likes_per_view.mean(),
                   marker_color=['#FBCC06','#04CCFB'])
fig4 = go.Figure(data=fig4_data)
fig4.update_layout(title='YouTube Likes per View by Guest Gender')

In [None]:
def SetColor(x):
    if x == 'Female':
        return '#FBCC06'
    elif x == 'Male':
        return '#04CCFB'
    else:
        return '#BCBABD'
        

fig5 = go.Figure(data=go.Scatter(x=episodes.youtube_views, y=episodes.yt_likes_per_view,
                                 mode='markers', marker=dict(
                                     size=12,
                                     color=list(map(SetColor, episodes.heroes_gender)),
                                 ),
                                 customdata=episodes[['heroes', 'episode_id', 'youtube_views', 'yt_likes_per_view','flavour_of_tea']],
                                 hovertemplate=
                                 '<b>Guest</b>: %{customdata[0]} <br>YouTube Views: %{customdata[2]}<br>Likes per View: %{customdata[3]: .2f}<br><b>Tea:</b> %{customdata[4]}<extra></extra>'
                                ))
fig5.update_layout(title='<b>YouTube Views & Likes per View</b>\
<br>(Hover over markers for episode details)',
                   xaxis_title='YouTube Views',
                   yaxis_title='Likes per View')

In [None]:
heroes_loc = ['USA', 'Ex-USA']

fig7_data = go.Bar(x=heroes_loc,
                   y=[episodes[episodes['heroes_location']!='USA'].yt_likes_per_view.mean(),
                     episodes[episodes['heroes_location']=='USA'].yt_likes_per_view.mean()],
                   marker_color=['#112C54', '#2CBCEC'])


fig7 = go.Figure(data=fig7_data)
fig7.update_layout(title='YouTube Likes per View by Guest Location')

In [None]:
def SetColor(x):
    if x == 'USA':
        return '#112C54'
    else:
        return '#2CBCEC'
        

fig5 = go.Figure(data=go.Scatter(x=episodes.youtube_views, y=episodes.yt_likes_per_view,
                                 mode='markers', marker=dict(
                                     size=12,
                                     color=list(map(SetColor, episodes.heroes_location)),
                                 ),
                                 customdata=episodes[['heroes', 'episode_id', 'youtube_views', 'yt_likes_per_view', 'flavour_of_tea']],
                                 hovertemplate=
                                 '<b>Guest</b>: %{customdata[0]} <br>YouTube Views: %{customdata[2]} <br>Likes per View: %{customdata[3]: .2f}<br><b>Tea:</b> %{customdata[4]}<extra></extra>'
                                ))
fig5.update_layout(title='<b>YouTube Views & Likes per View</b>\
<br>(hover over markers for episode details)',
                   xaxis_title='YouTube Views',
                   yaxis_title='Likes per View')

Add legend