## Libraries Imports

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from tqdm import tqdm
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import warnings
import numpy as np
# Silence every warning whose message matches the regex '.*do not.*'.
warnings.filterwarnings('ignore', '.*do not.*', )

## Load data and display dataset's info

In [None]:
# Read the dataset from the CSV file into the DataFrame used by every cell below.
data = pd.read_csv("compressed_dataset.csv")

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
data.info()

In [None]:
# Preview the first rows of the raw data.
data.head()

## Change categorical values to numerical and extract dates from timestamp

In [None]:
# Replace the viewer_type labels with the integer codes assigned by LabelEncoder.
labelencoder = LabelEncoder()
encoded_viewer_type = labelencoder.fit_transform(data['viewer_type'])
data['viewer_type'] = encoded_viewer_type

# 'timestamp' is interpreted as milliseconds since the epoch; the calendar
# date and the weekday number are both derived from the parsed column.
data['date/time'] = pd.to_datetime(data['timestamp'], unit='ms')
parsed_time = data['date/time'].dt
data['date'] = parsed_time.date
data['day_of_week'] = parsed_time.weekday

In [None]:
# Preview the first rows again, now with the encoded and derived columns.
data.head()

## Engagement Distribution Over Time (Question 1)

### Customer_id

In [None]:
# Plot, for every customer, the daily mean (top axes) and daily standard
# deviation (bottom axes) of engagement, several customers per figure.
customers = data.customer_id.drop_duplicates().tolist()
customers_per_figure = 5
# Walk the list in slices so the final, shorter slice is plotted as well;
# looping over len(customers) / 5 whole batches dropped the remainder.
for start in range(0, len(customers), customers_per_figure):
    fig, ax = plt.subplots(2)
    for customer in customers[start:start + customers_per_figure]:
        daily = data.loc[data.customer_id == customer].groupby('date')['engagement']
        daily_mean = daily.mean()
        daily_std = daily.std()
        # Use each result's own index for x so dates and values always line up.
        ax[0].plot(daily_mean.index.tolist(), daily_mean.values, label=str(customer) + " mean")
        ax[1].plot(daily_std.index.tolist(), daily_std.values, label=str(customer) + " std")
    # One legend call per axes is enough once all lines are drawn.
    ax[0].legend()
    ax[1].legend()
    plt.show()

### Country_id

In [None]:
# Plot, for every country, the daily mean (top axes) and daily standard
# deviation (bottom axes) of engagement, several countries per figure.
countries = data.country_id.drop_duplicates().tolist()
countries_per_figure = 2
# Walk the list in slices so the final, shorter slice is plotted as well;
# looping over len(countries) / 2 whole batches dropped the remainder.
for start in range(0, len(countries), countries_per_figure):
    fig, ax = plt.subplots(2)
    for country in countries[start:start + countries_per_figure]:
        daily = data.loc[data.country_id == country].groupby('date')['engagement']
        daily_mean = daily.mean()
        daily_std = daily.std()
        # Use each result's own index for x so dates and values always line up.
        ax[0].plot(daily_mean.index.tolist(), daily_mean.values, label=str(country) + " mean")
        ax[1].plot(daily_std.index.tolist(), daily_std.values, label=str(country) + " std")
    # One legend call per axes is enough once all lines are drawn.
    ax[0].legend()
    ax[1].legend()
    plt.show()

### City_id

In [None]:
# Plot, for every city, the daily mean (top axes) and daily standard
# deviation (bottom axes) of engagement, several cities per figure.
cities = data.city_id.drop_duplicates().tolist()
cities_per_figure = 3
# Walk the list in slices so the final, shorter slice is plotted as well;
# looping over len(cities) / 3 whole batches dropped the remainder.
for start in range(0, len(cities), cities_per_figure):
    fig, ax = plt.subplots(2)
    for city in cities[start:start + cities_per_figure]:
        daily = data.loc[data.city_id == city].groupby('date')['engagement']
        daily_mean = daily.mean()
        daily_std = daily.std()
        # Use each result's own index for x so dates and values always line up.
        ax[0].plot(daily_mean.index.tolist(), daily_mean.values, label=str(city) + " mean")
        ax[1].plot(daily_std.index.tolist(), daily_std.values, label=str(city) + " std")
    # One legend call per axes is enough once all lines are drawn.
    ax[0].legend()
    ax[1].legend()
    plt.show()

## QoE Distribution Over Time (Question 2)

### Customer_id

In [None]:
# Plot, for every customer, the daily mean (top axes) and daily standard
# deviation (bottom axes) of QoE, several customers per figure.
customers = data.customer_id.drop_duplicates().tolist()
customers_per_figure = 5
# Walk the list in slices so the final, shorter slice is plotted as well;
# looping over len(customers) / 5 whole batches dropped the remainder.
for start in range(0, len(customers), customers_per_figure):
    fig, ax = plt.subplots(2)
    for customer in customers[start:start + customers_per_figure]:
        daily = data.loc[data.customer_id == customer].groupby('date')['qoe']
        daily_mean = daily.mean()
        daily_std = daily.std()
        # Use each result's own index for x so dates and values always line up.
        ax[0].plot(daily_mean.index.tolist(), daily_mean.values, label=str(customer) + " mean")
        ax[1].plot(daily_std.index.tolist(), daily_std.values, label=str(customer) + " std")
    # One legend call per axes is enough once all lines are drawn.
    ax[0].legend()
    ax[1].legend()
    plt.show()

### Country_id

In [None]:
# Plot, for every country, the daily mean (top axes) and daily standard
# deviation (bottom axes) of QoE, several countries per figure.
# The ids must come from country_id: they are compared against country_id
# below, so listing customer_id values here filtered on the wrong ids.
countries = data.country_id.drop_duplicates().tolist()
countries_per_figure = 2
# Walk the list in slices so the final, shorter slice is plotted as well;
# looping over len(countries) / 2 whole batches dropped the remainder.
for start in range(0, len(countries), countries_per_figure):
    fig, ax = plt.subplots(2)
    for country in countries[start:start + countries_per_figure]:
        daily = data.loc[data.country_id == country].groupby('date')['qoe']
        daily_mean = daily.mean()
        daily_std = daily.std()
        # Use each result's own index for x so dates and values always line up.
        ax[0].plot(daily_mean.index.tolist(), daily_mean.values, label=str(country) + " mean")
        ax[1].plot(daily_std.index.tolist(), daily_std.values, label=str(country) + " std")
    # One legend call per axes is enough once all lines are drawn.
    ax[0].legend()
    ax[1].legend()
    plt.show()

### City_id

In [None]:
# Plot, for every city, the daily mean (top axes) and daily standard
# deviation (bottom axes) of QoE, several cities per figure.
cities = data.city_id.drop_duplicates().tolist()
cities_per_figure = 3
# Walk the list in slices so the final, shorter slice is plotted as well;
# looping over len(cities) / 3 whole batches dropped the remainder.
for start in range(0, len(cities), cities_per_figure):
    fig, ax = plt.subplots(2)
    for city in cities[start:start + cities_per_figure]:
        # This cell belongs to the QoE section, so aggregate 'qoe'
        # (it previously re-plotted 'engagement').
        daily = data.loc[data.city_id == city].groupby('date')['qoe']
        daily_mean = daily.mean()
        daily_std = daily.std()
        # Use each result's own index for x so dates and values always line up.
        ax[0].plot(daily_mean.index.tolist(), daily_mean.values, label=str(city) + " mean")
        ax[1].plot(daily_std.index.tolist(), daily_std.values, label=str(city) + " std")
    # One legend call per axes is enough once all lines are drawn.
    ax[0].legend()
    ax[1].legend()
    plt.show()

## Engagement/QoE distribution differences based on viewer_type (Question 3)

In [None]:
# Split the two metrics by encoded viewer_type and compare the two groups
# with a two-sample Kolmogorov-Smirnov test; only the statistic is printed.
# NOTE(review): the office/home names assume LabelEncoder mapped those
# categories to 0 and 1 respectively - confirm against labelencoder.classes_.
metric_columns = ['engagement', 'qoe']
office = data.loc[data.viewer_type == 0, metric_columns]
home = data.loc[data.viewer_type == 1, metric_columns]

engagement_result = ks_2samp(office['engagement'], home['engagement'])
qoe_result = ks_2samp(office['qoe'], home['qoe'])
print("Kolmogorov–Smirnov test for engagement is " + str(engagement_result.statistic))
print("Kolmogorov–Smirnov test for QoE is " + str(qoe_result.statistic))

# Release the two subsets; they are not needed past this cell.
del office
del home

## Viewers engagement level duration over country/city/viewer_type (Question 4)

In [None]:
cols = ['customer_id','country_id','city_id','engagement','viewer_type']
# Work on an explicit copy: adding the 'tier' column to a slice of `data`
# triggers pandas' SettingWithCopyWarning and may not behave as intended.
df = data[cols].copy()

# Engagement bands, checked in order; np.select keeps the first match.
conditions = [
    (df['engagement'] <= 0.25),
    (df['engagement'] > 0.25) & (df['engagement'] <= 0.5),
    (df['engagement'] > 0.5) & (df['engagement'] <= 0.75),
    (df['engagement'] > 0.75) & (df['engagement'] <= 1)
    ]

# create a list of the values we want to assign for each condition
values = ['very low','low','avg','high']

# create a new column and use np.select to assign values to it using our lists as arguments.
# A string default is required: the implicit default is the integer 0, which
# has no common dtype with the string choices (TypeError on recent NumPy).
# Rows matching no band (engagement > 1 or missing) are labelled 'unknown'.
df['tier'] = np.select(conditions, values, default='unknown')

# display updated DataFrame
df.head()

### Viewer_type

In [None]:
# One bar chart per viewer_type: number of rows falling in each engagement tier.
grouped = df.groupby('viewer_type')
for key, item in grouped:
    # `item` is the sub-frame for this key, i.e. what get_group(key) returns.
    tiers = item['tier'].tolist()
    tier_counts = np.array(
        [tiers.count(name) for name in ('very low', 'low', 'avg', 'high')]
    )
    plt.bar(x=values, height=tier_counts, width=0.1, label=str(key))
    plt.legend()
    plt.show()

## Country

In [None]:
# Grouped bar charts of the engagement-tier counts, five countries per figure.
grouped = df.groupby('country_id')
groups_per_figure = 5
bar_width = 0.1
# Numeric slot for each tier label; every country in a figure is shifted by
# one bar width so the bars sit side by side instead of hiding each other.
tier_positions = np.arange(len(values))
counter = 0
for key, item in grouped:
    # `item` is the sub-frame for this key, so no get_group lookup is needed.
    x = item['tier'].tolist()
    Y = np.array([x.count('very low'), x.count('low'), x.count('avg'), x.count('high')])
    plt.bar(x=tier_positions + counter * bar_width, height=Y, width=bar_width, label=str(key))
    counter += 1
    if counter == groups_per_figure:
        # Centre the tier labels under the cluster of bars.
        plt.xticks(tier_positions + (counter - 1) * bar_width / 2, values)
        plt.legend()
        plt.show()
        counter = 0

# Flush the last, partially filled figure (fewer than five countries left).
if counter:
    plt.xticks(tier_positions + (counter - 1) * bar_width / 2, values)
    plt.legend()
    plt.show()
    counter = 0

## City 

In [None]:
# Grouped bar charts of the engagement-tier counts, five cities per figure.
grouped = df.groupby('city_id')
groups_per_figure = 5
bar_width = 0.1
# Numeric slot for each tier label; every city in a figure is shifted by
# one bar width so the bars sit side by side instead of hiding each other.
tier_positions = np.arange(len(values))
counter = 0
for key, item in grouped:
    # `item` is the sub-frame for this key, so no get_group lookup is needed.
    x = item['tier'].tolist()
    Y = np.array([x.count('very low'), x.count('low'), x.count('avg'), x.count('high')])
    plt.bar(x=tier_positions + counter * bar_width, height=Y, width=bar_width, label=str(key))
    counter += 1
    if counter == groups_per_figure:
        # Centre the tier labels under the cluster of bars.
        plt.xticks(tier_positions + (counter - 1) * bar_width / 2, values)
        plt.legend()
        plt.show()
        counter = 0

# Flush the last, partially filled figure (fewer than five cities left).
if counter:
    plt.xticks(tier_positions + (counter - 1) * bar_width / 2, values)
    plt.legend()
    plt.show()
    counter = 0

## Countries/Cities that follow different distributions per customer (Question 5)

## Countries

In [None]:
from itertools import combinations

# For each customer, compare the engagement samples of every pair of its
# countries with a two-sample Kolmogorov-Smirnov test.
cols = ['customer_id','country_id','engagement']
df = data[cols]

customer_group = df.groupby('customer_id')
for key, custom in customer_group:
    # Split this customer's engagement by country once, instead of
    # re-filtering the frame inside the nested loops.
    engagement_by_country = list(custom.groupby('country_id', sort=False)['engagement'])
    # combinations() visits each unordered pair exactly once, so a country is
    # never tested against itself and (a, b) / (b, a) are not both reported.
    for (country1, c1_eng), (country2, c2_eng) in combinations(engagement_by_country, 2):
        # NOTE(review): statistic == 1 only flags samples that do not overlap
        # at all; confirm whether a p-value threshold was intended instead.
        if ks_2samp(c1_eng,c2_eng).statistic == 1:
            print("For customer " + str(key) + " and for countries" + str((country1,country2)) + " the distributions of engagement are different")

## City

In [None]:
from itertools import combinations

# For each customer, compare the engagement samples of every pair of its
# cities with a two-sample Kolmogorov-Smirnov test and collect the pairs
# that are flagged as different.
cols = ['customer_id','city_id','engagement']
df = data[cols]

customer_group = df.groupby('customer_id')
lst = []
for key, custom in tqdm(customer_group):
    # Split this customer's engagement by city once, instead of
    # re-filtering the frame inside the nested loops.
    engagement_by_city = list(custom.groupby('city_id', sort=False)['engagement'])
    # combinations() visits each unordered pair exactly once, so a city is
    # never tested against itself and (a, b) / (b, a) are not both stored.
    for (city1, c1_eng), (city2, c2_eng) in combinations(engagement_by_city, 2):
        # NOTE(review): statistic == 1 only flags samples that do not overlap
        # at all; confirm whether a p-value threshold was intended instead.
        if ks_2samp(c1_eng,c2_eng).statistic == 1:
            lst.append((key,city1,city2))

# Show at most the first ten results; slicing avoids an IndexError when
# fewer than ten pairs were found.
for entry in lst[:10]:
    print(entry)

del lst

## Correlation between original data (Question 6)

In [None]:
# Pairwise correlation of the original columns, drawn as an annotated heatmap.
cols = [
    'timestamp', 'customer_id', 'viewer_id', 'city_id', 'country_id',
    'viewer_type', 'qoe', 'engagement', 'event_id',
]
df = data[cols]
figure_size = (11.7, 8.27)
sns.set(rc={'figure.figsize': figure_size})
correlation_matrix = df.corr()
sns.heatmap(data=correlation_matrix, annot=True)

## Correlation between engagement and various factors (Question 7)

### Correlation between Number of viewers / Engagement

In [None]:
## Number of viewers
cols = ['timestamp','event_id','viewer_id','engagement']
# Work on an explicit copy so the new column is not written into a slice of `data`.
df = data[cols].copy()

# Distinct viewers per event, broadcast back onto every row of that event.
# A single grouped pass replaces one full scan of the frame per event.
df['Number_Of_Viewers'] = df.groupby('event_id')['viewer_id'].transform('nunique')

df.corr()

### Correlation between Day of the event / Engagement

In [None]:
# Correlation between engagement and the weekday number of the row's timestamp.
cols = ['engagement', 'day_of_week']
df = data.loc[:, cols]
df.corr()

### Correlation between Duration of the event / Engagement

In [None]:
## Event Duration
cols = ['event_id','timestamp','engagement']
# Work on an explicit copy so the new column is not written into a slice of `data`.
df = data[cols].copy()

# Duration of an event = its latest timestamp minus its earliest one, broadcast
# back onto every row of that event. A single grouped pass replaces one full
# scan (plus a sort) of the frame per event.
event_timestamps = df.groupby('event_id')['timestamp']
df['Event_Duration'] = event_timestamps.transform('max') - event_timestamps.transform('min')

df.corr()

### Correlation between Countries / Engagement

In [None]:
# Correlation between engagement and country_id.
# NOTE(review): country_id looks like an identifier; confirm that a linear
# correlation on its numeric value is meaningful.
cols = ['engagement', 'country_id']
data.loc[:, cols].corr()

### Correlation between Retention / Engagement

In [None]:
def retention(series, interval=30000):
    """Return the total time spanned by a run of consecutive samples.

    Every sample after the first contributes one ``interval``, so the result
    is ``interval * (len(series) - 1)``; an empty or single-sample input
    yields 0.

    Args:
        series: Sized collection of samples (for example the pandas Series
            handed over by ``groupby(...).agg``). Only its length is used.
        interval: Gap assumed between two consecutive samples. Defaults to
            30000 (presumably milliseconds, matching the ``unit='ms'``
            timestamps - confirm against the data source).

    Returns:
        The accumulated retention, in the same unit as ``interval``.
    """
    # max(..., 0) keeps the empty case at 0 instead of going negative.
    return interval * max(len(series) - 1, 0)

In [None]:
## Viewer Retention
cols = ['event_id', 'viewer_id', 'timestamp', 'engagement']
temp = data[cols]

# Order rows by time, then collapse each (event, viewer, engagement) group
# into one row whose timestamp column is replaced by retention(...).
chronological = temp.sort_values(by='timestamp')
per_viewer = chronological.groupby(['event_id', 'viewer_id', 'engagement'])
aggregated = per_viewer.agg({'timestamp': retention})
df = aggregated.reset_index().rename(columns={'timestamp': 'retention'})
df.corr()