In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import os
from pathlib import Path 
from datetime import datetime, timedelta, date
import pycountry_convert as pc

In [2]:
viewership_df = pd.read_csv('ViewershipDataEdit.csv')
viewership_df

Unnamed: 0,Date,Feed_Name,Country,Device Type,Session Count,Total Viewership Seconds,Unique Viewers
0,2020-08-18,Feed_002,France,Mobile Phone,2,127,2
1,2020-08-18,Feed_002,Austria,Smart TV,15,128,15
2,2020-08-18,Feed_002,Bulgaria,Smart TV,1,120,1
3,2020-08-18,Feed_002,France,Smart TV,208,52920,191
4,2020-08-18,Feed_002,Germany,Smart TV,142,11885,134
...,...,...,...,...,...,...,...
675595,2022-10-25,Feed_038,United Kingdom,Mobile Phone,1,68,1
675596,2022-10-25,Feed_038,Germany,Smart TV,2,68,2
675597,2022-10-25,Feed_038,India,Smart TV,16,156,1
675598,2022-10-25,Feed_038,Netherlands,Smart TV,1,66,1


In [3]:
# Convert date column to date format
viewership_df['Date'] = pd.to_datetime(viewership_df['Date'], dayfirst=True, errors='raise')
viewership_df.head()

Unnamed: 0,Date,Feed_Name,Country,Device Type,Session Count,Total Viewership Seconds,Unique Viewers
0,2020-08-18,Feed_002,France,Mobile Phone,2,127,2
1,2020-08-18,Feed_002,Austria,Smart TV,15,128,15
2,2020-08-18,Feed_002,Bulgaria,Smart TV,1,120,1
3,2020-08-18,Feed_002,France,Smart TV,208,52920,191
4,2020-08-18,Feed_002,Germany,Smart TV,142,11885,134


In [4]:
viewership_df.rename(columns={"Country": "country" , "Date" : "date"} , inplace = True)


In [5]:
# Changing the Country Name so it doesnt interfere with the function
viewership_df['country'] = viewership_df['country'].replace("Kosovo", "Albania")
viewership_df['country'] = viewership_df['country'].replace("Runion", "Réunion")
viewership_df['country'] = viewership_df['country'].replace("U.S. Virgin Islands", "Others")
viewership_df['country'] = viewership_df['country'].replace("Western Sahara", "Morocco")
viewership_df['country'] = viewership_df['country'].replace("Vatican City", "Italy")
viewership_df['country'] = viewership_df['country'].replace("St Vincent and Grenadines", "Saint Vincent and the Grenadines")
viewership_df['country'] = viewership_df['country'].replace("Timor-Leste", "Others")
viewership_df['country'] = viewership_df['country'].replace("St Kitts and Nevis", "the Federation of Saint Christopher and Nevis")
viewership_df['country'] = viewership_df['country'].replace("So Tom and Prncipe", "Others")
viewership_df['country'] = viewership_df['country'].replace("Sint Maarten", "Saint Martin")
viewership_df['country'] = viewership_df['country'].replace("Saint Barthlemy", "Others")
viewership_df['country'] = viewership_df['country'].replace("Bonaire Sint Eustatius and Saba", "Others")
viewership_df['country'] = viewership_df['country'].replace("Congo Republic", "Democratic Republic of the Congo")
viewership_df['country'] = viewership_df['country'].replace("Curaao", "Curaçao")

In [6]:
viewership_df.loc[viewership_df['country'] == 'Albania']

Unnamed: 0,date,Feed_Name,country,Device Type,Session Count,Total Viewership Seconds,Unique Viewers
52,2020-08-19,Feed_002,Albania,Smart TV,20,901,15
114,2020-08-19,Feed_004,Albania,Smart TV,31,840,21
174,2020-08-19,Feed_005,Albania,Smart TV,32,549,21
202,2020-08-19,Feed_005,Albania,Smart TV,3,68,2
240,2020-08-20,Feed_002,Albania,Smart TV,18,10083,14
...,...,...,...,...,...,...,...
674748,2022-10-25,Feed_010,Albania,Smart TV,23,139,20
675086,2022-10-25,Feed_011,Albania,Others,2,60,1
675115,2022-10-25,Feed_011,Albania,Smart TV,21,2000,17
675443,2022-10-25,Feed_012,Albania,Others,3,185,1


In [7]:
# Bucketing countries into regions
# Conversion Function
def convert(row):
    try:
        country_code = pc.country_name_to_country_alpha2(row.country, cn_name_format = "default")
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return continent_code
    except:
        print("Country not found")

In [8]:
viewership_df['region'] = viewership_df.apply(convert, axis=1)
viewership_df

Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country not found
Country no

Unnamed: 0,date,Feed_Name,country,Device Type,Session Count,Total Viewership Seconds,Unique Viewers,region
0,2020-08-18,Feed_002,France,Mobile Phone,2,127,2,EU
1,2020-08-18,Feed_002,Austria,Smart TV,15,128,15,EU
2,2020-08-18,Feed_002,Bulgaria,Smart TV,1,120,1,EU
3,2020-08-18,Feed_002,France,Smart TV,208,52920,191,EU
4,2020-08-18,Feed_002,Germany,Smart TV,142,11885,134,EU
...,...,...,...,...,...,...,...,...
675595,2022-10-25,Feed_038,United Kingdom,Mobile Phone,1,68,1,EU
675596,2022-10-25,Feed_038,Germany,Smart TV,2,68,2,EU
675597,2022-10-25,Feed_038,India,Smart TV,16,156,1,AS
675598,2022-10-25,Feed_038,Netherlands,Smart TV,1,66,1,EU


In [10]:
# mapping continent codes to continent names 
continent_names = { 'NA' : 'North America',
                    'AS' : 'Asia',
                    'EU' : 'Europe',
                    'SA' : 'South America',
                    'AF' : 'Africa',
                    'OC' : 'Oceania'}

viewership_df['region'] = viewership_df['region'].map(continent_names)
viewership_df

Unnamed: 0,date,Feed_Name,country,Device Type,Session Count,Total Viewership Seconds,Unique Viewers,region
0,2020-08-18,Feed_002,France,Mobile Phone,2,127,2,Europe
1,2020-08-18,Feed_002,Austria,Smart TV,15,128,15,Europe
2,2020-08-18,Feed_002,Bulgaria,Smart TV,1,120,1,Europe
3,2020-08-18,Feed_002,France,Smart TV,208,52920,191,Europe
4,2020-08-18,Feed_002,Germany,Smart TV,142,11885,134,Europe
...,...,...,...,...,...,...,...,...
675595,2022-10-25,Feed_038,United Kingdom,Mobile Phone,1,68,1,Europe
675596,2022-10-25,Feed_038,Germany,Smart TV,2,68,2,Europe
675597,2022-10-25,Feed_038,India,Smart TV,16,156,1,Asia
675598,2022-10-25,Feed_038,Netherlands,Smart TV,1,66,1,Europe


In [12]:
#Identifying bad Data
viewership_df[viewership_df['country'].isnull()]

Unnamed: 0,date,Feed_Name,country,Device Type,Session Count,Total Viewership Seconds,Unique Viewers,region
239,2020-08-20,Feed_002,,Smart TV,1,125,1,
362,2020-08-20,Feed_005,,Smart TV,1,61,1,
1676,2020-08-27,Feed_002,,Smart TV,1,180,1,
1745,2020-08-27,Feed_004,,Smart TV,1,189,1,
1813,2020-08-27,Feed_005,,Smart TV,1,64,1,
...,...,...,...,...,...,...,...,...
487076,2022-05-25,Feed_043,,Roku,1,61,1,
487535,2022-05-26,Feed_041,,Roku,1,127,1,
488020,2022-05-26,Feed_042,,Roku,2,122,1,
490392,2022-05-28,Feed_014,,Mobile Phone,1,186,1,


In [13]:
#Dropping rows with no country
viewership_copy = viewership_df.dropna(subset = ['country'])
#verifying that all empty country rows are not included
viewership_copy[viewership_copy['country'].isnull()]

Unnamed: 0,date,Feed_Name,country,Device Type,Session Count,Total Viewership Seconds,Unique Viewers,region


In [14]:
#Identifying missing values form the Session Count column
viewership_copy.loc[viewership_copy['Session Count'] == 0]
#Deselecting bad rows
viewership_clean = viewership_copy[viewership_copy['Session Count'].notna()]

In [15]:
filepath = Path('Data/VW.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
viewership_clean.to_csv(filepath, index=False)