In [1]:
### Data Processing Into DataFrames

# Import dependencies
import pandas as pd
from datetime import datetime, timedelta, date
from pathlib import Path


In [2]:
# Merge the minute-level aggregation files into a single table.

# Date range of the input files, written as "YYYY-MM-DD" strings.
start_date, end_date = "2021-02-18", "2022-11-06"

# Turn both string bounds into datetime.date objects.
start_date_dt, end_date_dt = (
    datetime.strptime(day, '%Y-%m-%d').date() for day in (start_date, end_date)
)

In [3]:
# define a function that will read all the csv files and combine them into one dataframe.

def minute_data_aggregation_condenser(start, end, data_dir="Resources"):
    """Read the daily minute-level aggregation CSVs for a date range and stack them.

    One file per calendar day is looked up at
    ``<data_dir>/minutelevelsessionaggregations-qwest-<YYYY-MM-DD>.csv``.
    Days for which no file exists are skipped.

    Args:
        start: First day to load (``datetime.date``), inclusive.
        end: Last day to load (``datetime.date``), inclusive.
        data_dir: Directory that holds the daily CSV files.

    Returns:
        A pandas DataFrame containing the rows of every file found, in date
        order. Each file keeps its own row index (the result is not re-indexed).

    Raises:
        FileNotFoundError: If no file exists for any day between ``start`` and ``end``.
    """
    frames = []
    missing_days = 0
    current_date = start

    while current_date <= end:
        # str(date) is "YYYY-MM-DD", which is the date format used in the file names.
        file_path = Path(data_dir) / f"minutelevelsessionaggregations-qwest-{current_date}.csv"
        try:
            frames.append(pd.read_csv(file_path))
        except FileNotFoundError:
            # Only a missing file is tolerated; parse errors and other failures
            # propagate so that bad input is not silently dropped.
            missing_days += 1
        # Advance exactly once per iteration, whether or not the read succeeded,
        # so that no day is skipped after a gap.
        current_date += timedelta(days=1)

    if not frames:
        raise FileNotFoundError(
            f"No minutelevelsessionaggregations-qwest files found in {data_dir} between {start} and {end}"
        )

    # Concatenate once at the end instead of growing the dataframe inside the loop.
    combined_df = pd.concat(frames)
    print(
        f"Data Merge Complete, {len(frames)} files have been merged into a dataframe "
        f"({missing_days} days in the range had no file)"
    )
    return combined_df

# Build the merged dataframe for the configured date range, then preview its first rows.
combined_df = minute_data_aggregation_condenser(start_date_dt, end_date_dt)
combined_df.head()

Data Merge Complete, 545 files have been merged into a dataframe and exported as merged_aggregate_data2022-11-09.csv


Unnamed: 0,channel,time,content_id,country,total_sessions,total_session_duration_seconds,channel;time;content_id;country;total_sessions;total_session_duration_seconds
0,qwestAAAA-qwestclassic-huawei,2021-02-16 00:57:00,PRO_1301#02-DIEENTFUEHRUNG,Finland,1.0,20.0,
1,qwestAAAA-qwestclassic-huawei,2021-02-16 01:22:00,PRO_1326#01-WALDBUEHNE,Morocco,1.0,13.0,
2,qwestAAAA-qwestclassic-huawei,2021-02-16 05:03:00,PRO_1355-297579-IA223008_MATSUEV_CONCERTGEBOUW,Germany,1.0,2.0,
3,qwestAAAA-qwestclassic-huawei,2021-02-16 14:03:00,PRO_1293-305379-5452_Georges_Bizet_Carmen_3053...,Italy,1.0,29.0,
4,qwestAAAA-qwestclassic-huawei,2021-02-16 15:28:00,PRO_1351#02-SHEHERAZADE,Morocco,1.0,44.0,


In [4]:
# Print the number of rows in the merged dataframe.
print(combined_df.shape[0])

52436896


In [5]:
# Show the column labels of the merged dataframe.
combined_df.columns


Index(['channel', 'time', 'content_id', 'country', 'total_sessions',
       'total_session_duration_seconds',
       'channel;time;content_id;country;total_sessions;total_session_duration_seconds'],
      dtype='object')

In [6]:
# An extra column named 'channel;time;content_id;country;total_sessions;total_session_duration_seconds'
# (the whole header joined by semicolons) indicates that some input was parsed with the wrong delimiter.
# Inspect the values that ended up in that column.
combined_df['channel;time;content_id;country;total_sessions;total_session_duration_seconds'].value_counts()

amg00447-qwest-qwestclassic-zeasn;26/06/2022 06:29;pro2412enpadalcinavoqwest;India;1;57                              1
qwestAAAA-qwestmix-uk-samsungtv;26/06/2022 04:08;pro2424aorevereseriesmatteomyderwykuhdcleanstereo;Greece;1;180      1
qwestAAAA-qwestmix-uk-samsungtv;26/06/2022 04:07;pro2424aorevereseriesmatteomyderwykuhdcleanstereo;Germany;13;623    1
qwestAAAA-qwestmix-uk-samsungtv;26/06/2022 04:07;pro2424aorevereseriesmatteomyderwykuhdcleanstereo;Spain;18;914      1
qwestAAAA-qwestmix-uk-samsungtv;26/06/2022 04:07;pro2424aorevereseriesmatteomyderwykuhdcleanstereo;Italy;13;551      1
                                                                                                                    ..
qwestAAAA-qwest-mix-lg;26/06/2022 16:34;pro2422aomindagainstzalmhavenv1;Austria;2;106                                1
qwestAAAA-qwest-mix-lg;26/06/2022 16:34;pro2422aomindagainstzalmhavenv1;Brazil;36;1768                               1
qwestAAAA-qwest-mix-lg;26/06/2022 16:34;pro2422a

In [7]:
# Name of the spurious column created when a semicolon-separated file was read as comma-separated.
bad_column = 'channel;time;content_id;country;total_sessions;total_session_duration_seconds'

# Only repair while the spurious column is still present, so re-running this cell
# neither fails on the dropped column nor appends the re-read file a second time.
if bad_column in combined_df.columns:
    # Remove all rows containing bad data: mis-parsed rows have a value in the
    # spurious column, correctly parsed rows are null there.
    combined_df = combined_df[combined_df[bad_column].isna()]

    # The file was semicolon separated rather than comma separated on 2022-06-26.
    semicolon_data = pd.read_csv("Resources/minutelevelsessionaggregations-qwest-2022-06-26.csv", sep=';')

    # That file stores `time` day-first (e.g. "26/06/2022 06:29"), while the other rows
    # use "YYYY-MM-DD HH:MM:SS" text. Rewrite it in that format so the merged column is
    # consistent. The explicit format makes to_datetime raise on any row that deviates.
    semicolon_df = semicolon_data.assign(
        time=pd.to_datetime(semicolon_data['time'], format='%d/%m/%Y %H:%M').dt.strftime('%Y-%m-%d %H:%M:%S')
    )

    # Combine with the complete dataframe.
    combined_df = pd.concat([combined_df, semicolon_df])

    # Remove the spurious column, which now holds only nulls.
    combined_df = combined_df.drop(columns=[bad_column])

combined_df.head()



Unnamed: 0,channel,time,content_id,country,total_sessions,total_session_duration_seconds
0,qwestAAAA-qwestclassic-huawei,2021-02-16 00:57:00,PRO_1301#02-DIEENTFUEHRUNG,Finland,1.0,20.0
1,qwestAAAA-qwestclassic-huawei,2021-02-16 01:22:00,PRO_1326#01-WALDBUEHNE,Morocco,1.0,13.0
2,qwestAAAA-qwestclassic-huawei,2021-02-16 05:03:00,PRO_1355-297579-IA223008_MATSUEV_CONCERTGEBOUW,Germany,1.0,2.0
3,qwestAAAA-qwestclassic-huawei,2021-02-16 14:03:00,PRO_1293-305379-5452_Georges_Bizet_Carmen_3053...,Italy,1.0,29.0
4,qwestAAAA-qwestclassic-huawei,2021-02-16 15:28:00,PRO_1351#02-SHEHERAZADE,Morocco,1.0,44.0


In [8]:
# Print the row count again so it can be compared with the count printed before the clean-up.
print(combined_df.shape[0])

52436896


In [9]:
# Write combined_df to a CSV under Resources/, with today's date in the file name.
filepath = Path(f'Resources/merged_aggregate_data-{date.today()}.csv')
# to_csv is called with its defaults, so the dataframe index is written as the first column.
combined_df.to_csv(filepath)