## Exploratory Data Analysis

### Import libraries


In [1]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from ipywidgets import interactive, widgets, interact

import warnings
warnings.filterwarnings('ignore')

### Load the data

In [2]:
def load_data(path, language = ""):
    """Load every CSV file in a folder into one tidy viewminutes table.

    Parameters
    ----------
    path : str
        Directory that directly contains the ``*.csv`` files.
    language : str, optional
        When non-empty, a constant ``language`` column holding this value is
        added to the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``channel`` (the CSV ``url`` column, renamed), ``date``
        (parsed from ``year`` plus the full month name in ``month``) and
        ``viewminutes``; plus ``language`` when requested. Rows follow the
        alphabetical order of the file names.

    Raises
    ------
    ValueError
        If the folder contains no CSV files.
    KeyError
        If one of ``url``, ``year``, ``month`` or ``viewminutes`` is missing.
    """
    # sorted() makes the row order independent of the filesystem listing order
    filenames = sorted(glob.glob(path + "/*.csv"))
    if not filenames:
        raise ValueError(f"No CSV files found in {path!r}")

    # concatenate all data into one DataFrame
    df = pd.concat([pd.read_csv(filename) for filename in filenames], ignore_index=True)

    required = ["url", "year", "month", "viewminutes"]
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) {missing} in the CSV files under {path!r}")

    # Pick the needed columns by name instead of pruning constant columns:
    # a folder that covers a single year has a constant `year` column, and
    # that column is still required to build the date.
    df_final = pd.DataFrame({
        "channel": df["url"],
        "date": pd.to_datetime(df["year"].astype(str) + df["month"], format="%Y%B"),
        "viewminutes": df["viewminutes"],
    })

    # Add language (df_final is a fresh frame, so this is not a write into a slice)
    if language != "":
        df_final["language"] = language

    return df_final

In [3]:
# Root folder holding one sub-folder of CSV exports per dataset; edit this
# single line when running on another machine.
DATA_DIR = r'/home/mogan/Desktop'

english = load_data(f'{DATA_DIR}/English', 'English')
french = load_data(f'{DATA_DIR}/French', 'French')
german = load_data(f'{DATA_DIR}/German', 'German')
italian = load_data(f'{DATA_DIR}/Italian', 'Italian')
# Loaded without a language label, so `all_` has no `language` column
all_ = load_data(f'{DATA_DIR}/All')

# For chess we need the data with the languages annotated.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index
# gives the stacked frame a fresh 0..n-1 index.
en_fr_ge_it = pd.concat([english, german, french, italian], ignore_index=True)

### Some basic stats about our dataset (all languages together)

In [4]:
# log(1 + viewminutes): the +1 maps rows with zero viewminutes to 0 instead of -inf.
# np.log1p is the dedicated NumPy routine for this expression.
all_["logviewminutes"] = np.log1p(all_["viewminutes"])
all_.describe()

Unnamed: 0,viewminutes,logviewminutes
count,1234224.0,1234224.0
mean,2761904.0,7.861253
std,92080770.0,3.393106
min,0.0,0.0
25%,300.0,5.70711
50%,1860.0,7.528869
75%,19560.0,9.881293
max,20673120000.0,23.7521


In [5]:
# Sum per (date, channel), then show the five largest entries of the most
# recent date (both sort keys descending).
(
    all_
    .groupby(["date", "channel"])
    .sum()
    .sort_values(by=["date", "viewminutes"], ascending=False)
    .head(5)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,viewminutes,logviewminutes
date,channel,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-01,Just_Chatting,16223722140,23.50974
2021-03-01,Grand_Theft_Auto_V,11074670280,23.127926
2021-03-01,League_of_Legends,8987191080,22.919066
2021-03-01,Fortnite,7040576820,22.674956
2021-03-01,Call_of_Duty_Modern_Warfare,5456146920,22.420009


### Extract the n most popular channels over all languages

In [6]:
def get_n_most_popular(df, n = 5, language = True, reference = None):
    """Return the rows of ``df`` that belong to the ``n`` most viewed channels.

    Popularity is ranked on ``reference`` (total ``viewminutes`` per
    ``channel``), not on ``df`` itself, so every frame passed in is filtered
    against the same set of channels.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``viewminutes``, ``date`` and ``channel`` columns, plus
        ``language`` when ``language`` is true.
    n : int, optional
        Number of top channels to keep.
    language : bool, optional
        Whether to include the ``language`` column in the result.
    reference : pandas.DataFrame, optional
        Frame used for the ranking; defaults to the notebook-level ``all_``.

    Returns
    -------
    pandas.DataFrame
        The selected columns of the matching rows, sorted by ``channel``.
    """
    if reference is None:
        reference = all_

    # Sum only `viewminutes`: summing the whole frame would also try to
    # aggregate the datetime/string columns, which fails on pandas >= 2.
    totals = reference.groupby("channel")["viewminutes"].sum()
    top_n = list(totals.sort_values(ascending=False).head(n).index.values)

    columns = ['viewminutes', 'date', 'channel']
    if language:
        columns.append('language')

    return df.loc[df['channel'].isin(top_n), columns].sort_values(by=["channel"])

In [7]:
n = 5
pop_en_fr_ge = get_n_most_popular(en_fr_ge_it, n)
# .assign returns a new frame, so the constant "All" label is not written
# into a filtered slice of all_
pop_all = get_n_most_popular(all_, n, language = False).assign(language="All")

# Stack the per-language and the overall rows for plotting.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pop = pd.concat([pop_en_fr_ge, pop_all])

### Plot time series of most popular channels

In [8]:
def plot_most_popular(language):
    """Show a line chart of the most popular channels for one language.

    ``language`` is the selection passed in by the widget (a sequence of
    labels); only its first entry is plotted. Nothing is drawn when the
    selection is empty. Data comes from the notebook-level ``pop`` frame.
    """
    if not language:
        return

    selected = language[0]

    # One column per channel, one row per date
    per_channel = (
        pop[pop.language.isin([selected])]
        .drop_duplicates(keep='first')
        .pivot(index='date', columns='channel', values = 'viewminutes')
    )

    fig = px.line(per_channel)
    fig.update_layout(
        hovermode="x",
        title=f"Viewminutes of the most popular channels on Twitch.tv in {selected}",
        xaxis_title="Year",
        yaxis_title="Viewminutes",
        legend_title="Channel",
    )

    # Vertical marker for the lockdown start (original note: Italy), with a rotated label
    lockdown_date = "2020-03-09"
    fig.add_vline(x=lockdown_date)
    fig.add_annotation(
        x=lockdown_date,
        y=100,
        text="Lockdowns in Europe start",
        textangle=-90,
    )

    fig.show()

In [9]:
# Offer every language label present in `pop`, in sorted order
available_languages = np.sort(pop.language.unique())
language_selector = widgets.SelectMultiple(
    description = 'Languages: ',
    options = available_languages,
)

# Re-draw the chart whenever the selection changes; the return value is discarded
_ = interact(plot_most_popular, language = language_selector)

interactive(children=(SelectMultiple(description='Languages: ', options=('All', 'English', 'French', 'German',…

### Plot time series of Chess

In [None]:
def plot_chess(df):
    """Build a line chart of Chess viewminutes per language and overall.

    Parameters
    ----------
    df : pandas.DataFrame
        Language-annotated data with ``channel``, ``date``, ``viewminutes``
        and ``language`` columns.

    Returns
    -------
    The plotly figure created by ``px.line``: one line per language found in
    ``df``, plus an ``All`` line taken from the notebook-level ``all_`` frame.
    Only dates present in both sources are plotted (inner join on date).
    """
    # Per-language series: one column per language, indexed by date
    chess_s = (
        df[df["channel"] == 'Chess'][["viewminutes", "date", 'language']]
        .sort_values(by=["date"])
        .drop_duplicates(keep='first')
        .pivot(index='date', columns='language', values = 'viewminutes')
    )

    # compute over all languages: a single "Chess" column indexed by date
    chess_a = (
        all_[all_["channel"] == 'Chess']
        .sort_values(by=["date"])
        .drop_duplicates(keep='first')
        .pivot(index='date', columns='channel', values = 'viewminutes')
    )

    # merge and reorder and rename for plotting
    chess = pd.merge(chess_s, chess_a, left_index = True, right_index = True)
    chess = chess.rename({"Chess":"All"}, axis='columns')
    # "All" first, then whichever languages df contains (the pivot already
    # sorted them), instead of a fixed list that fails when one is missing
    chess = chess[["All"] + [column for column in chess.columns if column != "All"]]

    fig = px.line(chess)
    fig.update_layout(
        hovermode="x",
        title="Viewminutes of the Chess channel on Twitch.tv in different languages",
        xaxis_title="Year",
        yaxis_title="Viewminutes",
        legend_title="Language",
    )

    # Vertical markers with rotated labels: the Queen's Gambit TV show release
    # day and the lockdown start (original note: Italy)
    events = (
        ("2020-10-23", "Release of Queen's Gambit"),
        ("2020-03-09", "Lockdowns in Europe start"),
    )
    for event_date, label in events:
        fig.add_vline(x=event_date)
        fig.add_annotation(x=event_date, y=100, text=label, textangle=-90)

    return fig

In [None]:
# Build the figure from the language-annotated frame and keep a handle to it;
# the bare name on the last line makes the cell render it.
chess_fig = plot_chess(df=en_fr_ge_it)
chess_fig