### THIS FILE CONTAINS CONFIDENTIAL DATA; ONLY RAW CODE HAS BEEN UPLOADED. INDIVIDUAL CELL OUTPUTS HAVE BEEN OMITTED. THE ANONYMIZATION KEY AND RAW VIEWERSHIP DATA WILL NOT BE LOADED TO THE DATABASE AND ARE UNAVAILABLE. THIS CODE DOCUMENTS THE CLEANING OF THE VIEWERSHIP DATA.

Merging multiple CSVs into one

In [None]:
### Data Processing Into DataFrames
# Import dependencies
import pandas as pd
from datetime import datetime, timedelta, date
from pathlib import Path


In [None]:
# Merging Minute level aggregation into one table

# Date range of the daily exports to merge, written as "YYYY-MM-DD" strings
start_date = "2021-02-18"
end_date = "2022-11-06"

# Parse both bounds into datetime.date objects in a single pass
start_date_dt, end_date_dt = (
    datetime.strptime(day_text, '%Y-%m-%d').date()
    for day_text in (start_date, end_date)
)

In [None]:
# define a function that will read all the csv files and combine them into one dataframe.

def minute_data_aggregation_condenser(start, end, data_dir="Resources"):
    """Merge the daily minute-level session aggregation CSVs into one DataFrame.

    One file per calendar day is looked up at
    ``<data_dir>/minutelevelsessionaggregations-qwest-<YYYY-MM-DD>.csv``.
    Every day from ``start`` through ``end`` (both inclusive) is attempted
    exactly once; days whose file is missing or empty are skipped.

    Parameters
    ----------
    start : datetime.date
        First day to load.
    end : datetime.date
        Last day to load (inclusive).
    data_dir : str or pathlib.Path, default "Resources"
        Directory that holds the daily CSV exports.

    Returns
    -------
    pandas.DataFrame
        The loaded files concatenated in date order. Each file keeps its own
        row index, so index labels repeat across days.

    Raises
    ------
    FileNotFoundError
        If no file in the requested range could be loaded.
    """
    data_dir = Path(data_dir)
    daily_frames = []

    current_date = start
    while current_date <= end:
        csv_path = data_dir / f"minutelevelsessionaggregations-qwest-{current_date}.csv"
        try:
            daily_frames.append(pd.read_csv(csv_path))
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Only "no data for this day" is tolerated; any other failure
            # (e.g. a parse error) propagates instead of being hidden.
            pass
        # Advance exactly once per iteration. Advancing again after a failed
        # read would silently skip the day that follows every missing file.
        current_date += timedelta(days=1)

    if not daily_frames:
        raise FileNotFoundError(
            f"No minute level session aggregation files found in {data_dir} "
            f"between {start} and {end}"
        )

    # Concatenate once at the end; growing a frame inside the loop re-copies
    # all previously loaded rows on every iteration.
    combined_df = pd.concat(daily_frames)
    # The function only builds the frame in memory, so the message no longer
    # claims that a CSV export took place.
    print(f"Data Merge Complete, {len(daily_frames)} files have been merged into a dataframe")
    return combined_df

# Build the merged frame for the configured date range, then preview its first rows
combined_df = minute_data_aggregation_condenser(start=start_date_dt, end=end_date_dt)
combined_df.head()

In [None]:
# Row count of the merged frame (first entry of its (rows, columns) shape)
print(combined_df.shape[0])

In [None]:
# Check columns in df
# Displays the column labels of the merged frame so that unexpected extra
# columns can be spotted.
combined_df.columns

In [None]:
# An extra column named
# 'channel;time;content_id;country;total_sessions;total_session_duration_seconds'
# indicates an error: a file whose fields were separated by semicolons was read
# with the comma separator.
# Print the column count explicitly. A notebook cell only displays its last
# expression, so a bare len(...) on an earlier line would be evaluated and discarded.
print(len(combined_df.columns))
# Count the distinct values held in the malformed column
combined_df['channel;time;content_id;country;total_sessions;total_session_duration_seconds'].value_counts()

In [None]:
# Repair the export that was semicolon separated rather than comma separated.
# Read with the default comma separator, each of its lines ended up in a single
# column whose label is the whole semicolon-joined header.
malformed_column = 'channel;time;content_id;country;total_sessions;total_session_duration_seconds'

# Only run the repair while the malformed column is still present. This keeps the
# cell re-runnable: once the column has been dropped, a second run would otherwise
# raise KeyError on the lookups below.
if malformed_column in combined_df.columns:
    # remove all rows containing bad data (rows that carry a value in the malformed column)
    combined_df = combined_df[combined_df[malformed_column].isnull()]

    # Column was semicolon separated rather than comma separated on 2022-06-26
    semicolon_data = pd.read_csv("Resources/minutelevelsessionaggregations-qwest-2022-06-26.csv", sep=';')
    semicolon_df = pd.DataFrame(semicolon_data)

    # combine with complete dataframe, then remove the malformed column, which is
    # empty for every remaining row at this point
    combined_df = pd.concat([combined_df, semicolon_df]).drop(columns=[malformed_column])

combined_df.head()

In [None]:
# Row count after the repair, to compare against the count printed before it
print(combined_df.shape[0])

Anonymizing Key and Channel Data

In [None]:
# Load the anonymization key table
anon_key = pd.read_csv("Resources/Anonymization Keys.csv")
anon_key_df = pd.DataFrame(anon_key)

# Operator names paired with their 'anonymization key' values
anon_key_op_df = anon_key_df.loc[:, ['Operator', 'anonymization key']]
# Channel names paired with their 'Anonymization key (Genre)' values;
# rows missing either value are dropped
anon_key_chan_df = anon_key_df.loc[:, ['Channel', 'Anonymization key (Genre)']].dropna()

In [None]:
# Match each channel string with an operator and store that operator's anonymization key.
# The lookup is evaluated once per distinct channel value and then mapped back onto the
# rows, instead of re-scanning the whole operator list for every single row.
def string_parser_OPS(string):
    """Return the anonymization key of the first operator found in ``string``.

    Parameters
    ----------
    string : pandas.Series
        Series of channel strings (a one-element row when the function is used
        with ``DataFrame.apply(axis=1)``).

    Returns
    -------
    object or None
        The 'anonymization key' value of the first operator, in key-table order,
        whose lower-cased name is found in any element of ``string``; ``None``
        when no operator matches.

    Raises
    ------
    ValueError
        From ``.item()`` when the matching operator name occurs more than once
        in the key table.

    Notes
    -----
    ``str.contains`` interprets the lower-cased operator name as a regular
    expression and matches case-sensitively; the channel text itself is not
    lower-cased.
    """
    for ops in anon_key_op_df['Operator']:
        if string.str.contains(ops.lower()).any():
            return anon_key_op_df.loc[anon_key_op_df['Operator'] == ops, 'anonymization key'].item()
    return None

# Resolve every distinct, non-null channel value once through the matcher above.
operator_by_channel = {
    channel: string_parser_OPS(pd.Series([channel]))
    for channel in combined_df['channel'].dropna().unique()
}
# Rows whose channel is null are absent from the lookup and therefore stay missing (NaN).
combined_df["Operator"] = combined_df['channel'].map(operator_by_channel)

In [None]:
# Match each channel string with a known channel name and store that channel's
# 'Anonymization key (Genre)' value.
# The lookup is evaluated once per distinct channel value and then mapped back onto the
# rows, instead of re-scanning the whole channel list for every single row.
def string_parser_CHAN(string):
    """Return the anonymization key of the first known channel found in ``string``.

    Parameters
    ----------
    string : pandas.Series
        Series of channel strings (a one-element row when the function is used
        with ``DataFrame.apply(axis=1)``).

    Returns
    -------
    object or None
        The 'Anonymization key (Genre)' value of the first channel, in key-table
        order, whose lower-cased name is found in any element of ``string``;
        ``None`` when no channel matches.

    Raises
    ------
    ValueError
        From ``.item()`` when the matching channel name occurs more than once
        in the key table.

    Notes
    -----
    ``str.contains`` interprets the lower-cased channel name as a regular
    expression and matches case-sensitively; the input text itself is not
    lower-cased.
    """
    for ops in anon_key_chan_df['Channel']:
        if string.str.contains(ops.lower()).any():
            return anon_key_chan_df.loc[anon_key_chan_df['Channel'] == ops, 'Anonymization key (Genre)'].item()
    return None

# Resolve every distinct, non-null channel value once through the matcher above.
channel_key_by_channel = {
    channel: string_parser_CHAN(pd.Series([channel]))
    for channel in combined_df['channel'].dropna().unique()
}
# Rows whose channel is null are absent from the lookup and therefore stay missing (NaN).
combined_df["Channel"] = combined_df['channel'].map(channel_key_by_channel)

Anonymizing Program Code and obtaining Genre 

In [None]:
# Load the media library export that the viewership rows are later joined to
content_data = pd.read_csv('Resources/Media Library.csv')
content_df = pd.DataFrame(content_data)

# Add PRO_ prefix to ID to get Program ID
# NOTE(review): the string concatenation assumes the ID column was read as text — confirm
content_df['PRO_CONTENT_ID'] = 'PRO_' + content_df['🎦  ID']
# Order the rows by descending index label (for the default index this reverses the file
# order). Original intent: largest IDs first, so that PRO_3000 is found as PRO_3000
# instead of PRO_3.
# NOTE(review): this presumes the file is stored in ascending ID order — confirm. The later
# merge on PRO_CONTENT_ID is an exact-key join, which does not depend on row order.
content_df = content_df.sort_index(ascending=False)
content_df.head()

In [None]:
# Extract the program identifier from each raw content_id.
# Every pattern has exactly one capture group, so joining them with '|' makes
# str.extract return one column per pattern (columns 0-5).
regex_list = [r'(PRO_*\d*)_[A-Z]',r'(PRO_*\d*\w*)', r'pro(\d{1,4})', r'pro_(\d*\w*)',r'(^\d{1,4})[a-z]',r'pr\d*[a-z]*(\d*)']
captured_ids = combined_df.content_id.str.extract('|'.join(regex_list))

# Groups made only of optional parts, such as (\d*), can match the empty string.
# Prefixing that would yield a bare 'PRO_', which is neither null nor a real ID, so the
# row would be missing from the problematic-content_id report. Treat it as "no match".
captured_ids = captured_ids.mask(captured_ids == '')

# The first two patterns capture the PRO_ prefix themselves; the remaining ones capture
# only the part after it, so add the PRO_ prefix to those columns.
for column in captured_ids.columns[2:]:
    captured_ids[column] = 'PRO_' + captured_ids[column]

# Merge all columns: take the first non-null capture, scanning left to right.
filtered_ids = captured_ids[0]
for column in captured_ids.columns[1:]:
    filtered_ids = filtered_ids.fillna(captured_ids[column])

# Keep the result as a one-column frame labelled 0
regex_filtered_content_id = filtered_ids.to_frame()

In [None]:
# add column to the dataframe
# regex_filtered_content_id is a one-column frame extracted from combined_df.content_id,
# so it carries the same row index as combined_df; its values become the new
# 'filtered_content_id' column.
combined_df['filtered_content_id'] = regex_filtered_content_id

In [None]:
# Gather the raw content_id values for which no program ID could be extracted
# (filtered_content_id is null) and reduce them to their distinct values.
unparsed_rows = combined_df['filtered_content_id'].isna()
missed_content_ids = combined_df.loc[unparsed_rows, ['content_id']]
missing_content_id_df = pd.DataFrame(missed_content_ids['content_id'].unique())
missing_content_id_df.head()

In [None]:
# Merge and combined (VLOOKUP): attach the media library columns to every viewership
# row whose extracted program ID equals a library PRO_CONTENT_ID (inner join, so rows
# without a library match are dropped).
# pandas matches null join keys with each other. Rows that have no extracted program ID
# are removed first so they cannot be joined to library rows that lack an ID.
rows_with_program_id = combined_df[combined_df['filtered_content_id'].notna()]
combined_genre_df = rows_with_program_id.merge(content_df, left_on='filtered_content_id', right_on='PRO_CONTENT_ID')
combined_genre_df.head()
# 69.65% Match Rate (figure recorded by the original author)

In [None]:
# Keep only the columns needed in the export and give them their final names.
# Keys are the columns to keep, in output order; values are the exported labels.
final_column_names = {
    'time': 'Time',
    'country': 'Country',
    'total_sessions': 'Total_Sessions',
    'total_session_duration_seconds': 'Total_Session_Duration_Seconds',
    'Operator': 'Operator',
    'Channel': 'Channel',
    'filtered_content_id': 'Cleaned_Content_ID',
    '🎯  TAG Music Styles (from 🎥 Films)': 'Genre',
}
trimmed_clean_merged_minute_aggregation = (
    combined_genre_df[list(final_column_names)]
    .rename(columns=final_column_names)
)
trimmed_clean_merged_minute_aggregation.head()

Exporting final cleaned CSV and missing content_ids for further data cleaning if necessary

In [None]:
# Write both result frames to CSV: the cleaned, merged data (file name stamped with
# today's date) and the content_id values that could not be parsed.
exports = [
    (trimmed_clean_merged_minute_aggregation, Path(f'Resources/cleaned_merged_minute_aggregate_data-{date.today()}.csv')),
    (missing_content_id_df, Path(f'Resources/problematic_content_id.csv')),
]
for frame, filepath in exports:
    frame.to_csv(filepath)