# Competitor Analysis Dashboard
## Data Source Audit / Sandbox

In [1]:
import pandas as pd

# Import DataFrame
df = pd.read_csv("./data_source.csv")
# Add learning time (mins.)
df['LT_mins'] = df['visits'] * df['average_visit_duration'] / 60.0
df

Unnamed: 0,group_site,KA_initiative,site_url,site_name,endpoint_category,date,average_visit_duration,visits,LT_mins
0,100Marks,TP,100marks.in,100Marks,mobile-web,2015-06-01,173.083726,2.552191e+04,7.362378e+04
1,100Marks,TP,100marks.in,100Marks,mobile-web,2015-07-01,0.000000,0.000000e+00,0.000000e+00
2,100Marks,TP,100marks.in,100Marks,mobile-web,2015-08-01,16.891649,3.720373e+03,1.047387e+03
3,100Marks,TP,100marks.in,100Marks,mobile-web,2015-09-01,0.000000,0.000000e+00,0.000000e+00
4,100Marks,TP,100marks.in,100Marks,mobile-web,2015-10-01,0.000000,0.000000e+00,0.000000e+00
5,100Marks,TP,100marks.in,100Marks,mobile-web,2015-11-01,7.071567,9.431491e+02,1.111590e+02
6,100Marks,TP,100marks.in,100Marks,mobile-web,2015-12-01,5.398753,1.060558e+03,9.542817e+01
7,100Marks,TP,100marks.in,100Marks,mobile-web,2016-01-01,0.000000,0.000000e+00,0.000000e+00
8,100Marks,TP,100marks.in,100Marks,mobile-web,2016-02-01,0.000000,0.000000e+00,0.000000e+00
9,100Marks,TP,100marks.in,100Marks,mobile-web,2016-03-01,0.000000,0.000000e+00,0.000000e+00


## Normalizing LT (monthly) by selected website

In [2]:
# Create indexed version of data
indexed_df = df.set_index(['group_site',  'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date'])

# Fix 'lexsort' issue
indexed_df = indexed_df.sort_index()

# Index of normalizer basis
index_stem = ['Quizlet', 'C&CL', 'Quizlet', 'quizlet.com']

# Write a helper function to then apply to each rows 
def normalizer(df, x, index_stem, endpoint_category, date, column):
    normalizer_data = df.loc[tuple(index_stem + [endpoint_category] + [date]),['LT_mins']]
    return x / normalizer_data

# Execute apply
df['norm_LT'] = df.apply(lambda row: normalizer(indexed_df, row['LT_mins'], index_stem, row['endpoint_category'], row['date'], 'LT_mins'), axis=1)

# Check to see if normalizing site == 1.0 for 'norm_LT'
df[df['group_site'].isin(['Quizlet'])]

Unnamed: 0,group_site,KA_initiative,site_url,site_name,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT
5475,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-06-01,302.196480,2.050892e+07,1.032954e+08,1.0
5476,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-07-01,300.070502,1.399823e+07,7.000757e+07,1.0
5477,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-08-01,243.061530,1.891870e+07,7.664015e+07,1.0
5478,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-09-01,274.511694,4.247930e+07,1.943511e+08,1.0
5479,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-10-01,294.973233,4.474326e+07,2.199677e+08,1.0
5480,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-11-01,312.301235,4.292575e+07,2.234294e+08,1.0
5481,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-12-01,323.523261,3.183612e+07,1.716621e+08,1.0
5482,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-01-01,295.192330,4.296252e+07,2.113701e+08,1.0
5483,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-02-01,303.273254,4.925770e+07,2.489757e+08,1.0
5484,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-03-01,304.493741,5.218161e+07,2.648162e+08,1.0


In [3]:
df.shape

(7425, 10)

## TTM Calculations
See below for 'proof of concept' which does calculation for only one site.

_Check datum_: **Quizlet,...,mobile-web, 2017-05-01**

In [4]:
# Drop dates from index keys; returns to column
indexed_df2 = indexed_df.reset_index(level=-1, inplace=False)

In [5]:
# Proof of concept
mini_df = indexed_df2.loc[('Quizlet', 'C&CL', 'Quizlet', 'quizlet.com',),:]
mini_df

Unnamed: 0_level_0,date,average_visit_duration,visits,LT_mins
endpoint_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mobile-web,2015-06-01,302.196480,2.050892e+07,1.032954e+08
mobile-web,2015-07-01,300.070502,1.399823e+07,7.000757e+07
mobile-web,2015-08-01,243.061530,1.891870e+07,7.664015e+07
mobile-web,2015-09-01,274.511694,4.247930e+07,1.943511e+08
mobile-web,2015-10-01,294.973233,4.474326e+07,2.199677e+08
mobile-web,2015-11-01,312.301235,4.292575e+07,2.234294e+08
mobile-web,2015-12-01,323.523261,3.183612e+07,1.716621e+08
mobile-web,2016-01-01,295.192330,4.296252e+07,2.113701e+08
mobile-web,2016-02-01,303.273254,4.925770e+07,2.489757e+08
mobile-web,2016-03-01,304.493741,5.218161e+07,2.648162e+08


In [6]:
# Proof of concept (cont.)
test = mini_df.loc['mobile-web'].rolling(window=12, min_periods=12).sum()
test.loc['mobile-web'][test.loc['mobile-web']['date'] == '2017-05-01']

Unnamed: 0_level_0,date,average_visit_duration,visits,LT_mins
endpoint_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mobile-web,2017-05-01,2692.547347,474056600.0,1801160000.0


### TTM (Sum): Method 0

In [7]:
# Calculates rolling averages based on TTM...but runs on.
# Fix by NaN-ing the first 11 entries? Will have problems for missing dates!
TTMsum_df = indexed_df.rolling(window=12, min_periods=12).sum()
TTMsum_df = TTMsum_df.sort_index()
TTMsum_df.loc[('Quizlet', 'C&CL', 'Quizlet', 'quizlet.com', 'mobile-web','2017-05-01'),:]

average_visit_duration    2.692547e+03
visits                    4.740566e+08
LT_mins                   1.801160e+09
Name: (Quizlet, C&CL, Quizlet, quizlet.com, mobile-web, 2017-05-01), dtype: float64

In [8]:
# Checks the TTM average function (should be same as cell above)
indexed_df.loc[('Quizlet',slice(None),slice(None),slice(None),'mobile-web',slice('2016-06-01','2017-05-01')),].sum()

average_visit_duration    2.692547e+03
visits                    4.740566e+08
LT_mins                   1.801160e+09
dtype: float64

### TTM (Sum): Method 1
Extending the 'proof of concept' for entire set of data.

In [9]:
# Drop 'endpoint_category' to column; get keys to iterate through
indexed_df = indexed_df2.reset_index(level=-1, inplace=False)
keys = list(set(indexed_df.index.values))

# Turn off SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# Do TTMsum calculations
for key in keys:
    site_info = indexed_df.loc[key,:]
    site_info.set_index(['endpoint_category'], inplace=True)
    endpoint_categories = list(set(site_info.index.values))
    for endpoint_category in endpoint_categories:
        site_info.loc[endpoint_category] = site_info.loc[endpoint_category].rolling(window=12, min_periods=12).sum()
        
TTMsum_df = indexed_df.reset_index()
TTMsum_df['average_visit_duration_mins'] = TTMsum_df['LT_mins'] / TTMsum_df['visits']
TTMsum_df.set_index(['date'], inplace=True)
TTMsum_df[(TTMsum_df['group_site'] == 'Quizlet') & (TTMsum_df['endpoint_category'] == 'mobile-web')]

Unnamed: 0_level_0,group_site,KA_initiative,site_name,site_url,endpoint_category,average_visit_duration,visits,LT_mins,average_visit_duration_mins
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-06-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-07-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-08-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-09-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-10-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-11-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-12-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-01-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-02-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-03-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,


In [10]:
# Checking to see that the first 11 months are devoid of data
x = TTMsum_df.sort_index().loc[:'2016-04-01',['visits', 'average_visit_duration', 'LT_mins']]
x.isnull().all()

visits                    True
average_visit_duration    True
LT_mins                   True
dtype: bool

In [11]:
# Checking to see comprehensiveness of records (by date)
date_check_df = df.groupby(['group_site', 'KA_initiative', 'site_name', 'site_url', 'endpoint_category']).count()
date_check_df['date'].describe()

count    297.0
mean      25.0
std        0.0
min       25.0
25%       25.0
50%       25.0
75%       25.0
max       25.0
Name: date, dtype: float64

Generalized function which takes an unindexed dataframe and outputs an unindexed dataframe with added columns for TTM calcuations (float).

In [12]:
def TTMer(df, func):

    indexed_df = df.set_index(['group_site',  'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date'])

    # Save original indexed_df to join with calculated dataframe later
    og_indexed_df = indexed_df.copy()
    
    # Drop 'endpoint_category' to column; get keys to iterate through    
    indexed_df = indexed_df.reset_index(level=(-1,-2), inplace=False)
    indexed_df.sort_index(inplace=True)
    keys = list(set(indexed_df.index.values))

    # Turn off pandas' SettingWithCopyWarning
    pd.options.mode.chained_assignment = None

    # Do TTM calculations
    for key in keys:
        site_info = indexed_df.loc[key,:]
        site_info.set_index(['endpoint_category'], inplace=True)
        endpoint_categories = list(set(site_info.index.values))
        for endpoint_category in endpoint_categories:
            site_info.loc[endpoint_category] = func(site_info.loc[endpoint_category].rolling(window=12, min_periods=12))

    # Set indices for join
    indexed_df.reset_index(inplace=True)
    indexed_df.set_index(['group_site',  'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date'], inplace=True)

    print('Successfully wrote TTM data!')
    return indexed_df

x_df = TTMer(df, pd.core.window.Rolling.sum)
x_df.loc['Quizlet']

Successfully wrote TTM data!


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,average_visit_duration,visits,LT_mins,norm_LT
KA_initiative,site_name,site_url,endpoint_category,date,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C&CL,Quizlet,quizlet.com,mobile-web,2015-06-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2015-07-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2015-08-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2015-09-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2015-10-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2015-11-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2015-12-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2016-01-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2016-02-01,,,,
C&CL,Quizlet,quizlet.com,mobile-web,2016-03-01,,,,


Using the above function, we can do string of joins to make the final dataframe!

In [13]:
def TTMdf_joiner(df, *args):

    indexed_df = df.set_index(['group_site',  'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date'])
    
    # Generate 'TTMdf's; add to list
    TTMdf_dict = {}
    for func in args:
        TTMdf_dict[func.__name__] = TTMer(df, func)
    
    # Do joins into final dataframe
    for k, v in TTMdf_dict.items():
        indexed_df = indexed_df.join(v, how='left', rsuffix='_TTM_'+k)
        
    return indexed_df.reset_index()

monthly_and_TTM_table = TTMdf_joiner(df, pd.core.window.Rolling.sum, pd.core.window.Rolling.mean)
monthly_and_TTM_table[monthly_and_TTM_table['group_site'] == 'Quizlet']

Successfully wrote TTM data!
Successfully wrote TTM data!


Unnamed: 0,group_site,KA_initiative,site_name,site_url,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT,average_visit_duration_TTM_sum,visits_TTM_sum,LT_mins_TTM_sum,norm_LT_TTM_sum,average_visit_duration_TTM_mean,visits_TTM_mean,LT_mins_TTM_mean,norm_LT_TTM_mean
5475,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2015-06-01,302.196480,2.050892e+07,1.032954e+08,1.0,,,,,,,,
5476,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2015-07-01,300.070502,1.399823e+07,7.000757e+07,1.0,,,,,,,,
5477,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2015-08-01,243.061530,1.891870e+07,7.664015e+07,1.0,,,,,,,,
5478,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2015-09-01,274.511694,4.247930e+07,1.943511e+08,1.0,,,,,,,,
5479,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2015-10-01,294.973233,4.474326e+07,2.199677e+08,1.0,,,,,,,,
5480,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2015-11-01,312.301235,4.292575e+07,2.234294e+08,1.0,,,,,,,,
5481,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2015-12-01,323.523261,3.183612e+07,1.716621e+08,1.0,,,,,,,,
5482,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2016-01-01,295.192330,4.296252e+07,2.113701e+08,1.0,,,,,,,,
5483,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2016-02-01,303.273254,4.925770e+07,2.489757e+08,1.0,,,,,,,,
5484,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,2016-03-01,304.493741,5.218161e+07,2.648162e+08,1.0,,,,,,,,


## % Y/Y Growth (TTM, Monthly)
Generalized function which takes an unindexed dataframe and outputs an unindexed dataframe while adding columns for YoY calcuations (float).

In [14]:
# Dependency for doing date math
import datetime
from dateutil.relativedelta import relativedelta

def yoyer(df):
    # Index the dataframe so only endpoints are column values
    indexed_df = df.set_index(['group_site',  'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date'])
    indexed_df.sortlevel(inplace=True)

    #Helper function to perform % y/y calculation
    def yoy_calculator(df, row, col):
        key = list(row.name)
        curr_row_date = datetime.datetime.strptime(key.pop(), '%Y-%m-%d').date()
        base_date = curr_row_date + relativedelta(years=-1)
        base_key = tuple(key + [str(base_date)])
        if base_key in df.index:
            year_0 = df.loc[base_key, col]
            year_1 = df.loc[tuple(key+[str(curr_row_date)]), col]
            if year_0 != 0.0:
                try:
                    pct_chg = (year_1/year_0) - 1
                    return pct_chg
                except:
                    return
            else:
                return
        else:
            return

    # Execute apply of 'yoy_calculator' by row
    for col in indexed_df.columns:    
        indexed_df[col+'_pct_yoy'] = indexed_df.apply(lambda x: yoy_calculator(indexed_df, x, col), axis=1)

    # Return dataframe (unindexed)
    print('Successfully wrote YOY columns!')
    return indexed_df.reset_index()

%time final = yoyer(monthly_and_TTM_table)
final.set_index(['group_site',  'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date'], inplace=True)
final.loc[('Quizlet', 'C&CL', 'Quizlet', 'quizlet.com', 'mobile-web',),('visits_TTM_sum','visits_TTM_sum_pct_yoy')]

Successfully wrote YOY columns!
CPU times: user 29.2 s, sys: 70.6 ms, total: 29.3 s
Wall time: 29.3 s


Unnamed: 0_level_0,visits_TTM_sum,visits_TTM_sum_pct_yoy
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-06-01,,
2015-07-01,,
2015-08-01,,
2015-09-01,,
2015-10-01,,
2015-11-01,,
2015-12-01,,
2016-01-01,,
2016-02-01,,
2016-03-01,,


# Checking 'Incremental' Updates
This involved checking the outfiles for inconsistencies, when the 'incremental' update should not return new time series information.

In [15]:
# Files to compare
og_out = pd.read_csv('./outfiles/2017-07-24_14:00.csv')
new_out = pd.read_csv('./outfiles/2017-07-24_14:40.csv')

merged = og_out.merge(new_out, indicator=True, how='outer')

# Records only in the newer outfile
merged[merged['_merge'] == 'right_only']

Unnamed: 0,group_site,KA_initiative,site_name,site_url,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT,...,norm_LT_pct_yoy,average_visit_duration_TTM_sum_pct_yoy,visits_TTM_sum_pct_yoy,LT_mins_TTM_sum_pct_yoy,norm_LT_TTM_sum_pct_yoy,average_visit_duration_TTM_mean_pct_yoy,visits_TTM_mean_pct_yoy,LT_mins_TTM_mean_pct_yoy,norm_LT_TTM_mean_pct_yoy,_merge
225,Coursera,C&CL,Coursera,coursera.org,mobile-web,2015-06-01,217.960324,7.509706e+06,2.728030e+07,0.592697,...,,,,,,,,,,right_only
226,Coursera,C&CL,Coursera,coursera.org,mobile-web,2015-07-01,207.401516,7.009313e+06,2.422903e+07,0.655472,...,,,,,,,,,,right_only
227,Coursera,C&CL,Coursera,coursera.org,mobile-web,2015-08-01,152.464156,7.078848e+06,1.798784e+07,0.310437,...,,,,,,,,,,right_only
228,Coursera,C&CL,Coursera,coursera.org,mobile-web,2015-09-01,174.105831,7.888630e+06,2.289094e+07,0.266764,...,,,,,,,,,,right_only
229,Coursera,C&CL,Coursera,coursera.org,mobile-web,2015-10-01,173.981189,7.514806e+06,2.179058e+07,0.308401,...,,,,,,,,,,right_only
230,Coursera,C&CL,Coursera,coursera.org,mobile-web,2015-11-01,184.591990,6.115295e+06,1.881391e+07,0.237989,...,,,,,,,,,,right_only
231,Coursera,C&CL,Coursera,coursera.org,mobile-web,2015-12-01,180.696838,5.531967e+06,1.666015e+07,0.313644,...,,,,,,,,,,right_only
232,Coursera,C&CL,Coursera,coursera.org,mobile-web,2016-01-01,196.797209,7.565236e+06,2.481362e+07,0.390274,...,,,,,,,,,,right_only
233,Coursera,C&CL,Coursera,coursera.org,mobile-web,2016-02-01,197.271394,7.677557e+06,2.524271e+07,0.380799,...,,,,,,,,,,right_only
234,Coursera,C&CL,Coursera,coursera.org,mobile-web,2016-03-01,211.495668,8.019324e+06,2.826754e+07,0.351304,...,,,,,,,,,,right_only


In [16]:
# Checking to see if there were any records only in the original outfile (i.e., not in the new outfile)
set(merged['_merge'].values)

{'both', 'left_only', 'right_only'}

In [17]:
new_out.groupby(['site_url']).count()

Unnamed: 0_level_0,group_site,KA_initiative,site_name,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT,average_visit_duration_TTM_sum,...,LT_mins_pct_yoy,norm_LT_pct_yoy,average_visit_duration_TTM_sum_pct_yoy,visits_TTM_sum_pct_yoy,LT_mins_TTM_sum_pct_yoy,norm_LT_TTM_sum_pct_yoy,average_visit_duration_TTM_mean_pct_yoy,visits_TTM_mean_pct_yoy,LT_mins_TTM_mean_pct_yoy,norm_LT_TTM_mean_pct_yoy
site_url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
coursera.org,75,75,75,75,75,75,75,75,75,42,...,39,39,6,6,6,6,6,6,6,6
kahoot.com,75,75,75,75,75,75,75,75,75,42,...,39,39,6,6,6,6,6,6,6,6
kahoot.it,75,75,75,75,75,75,75,75,75,42,...,39,39,6,6,6,6,6,6,6,6
khanacademy.org,75,75,75,75,75,75,75,75,75,42,...,39,39,6,6,6,6,6,6,6,6
quizlet.com,75,75,75,75,75,75,75,75,75,42,...,39,39,6,6,6,6,6,6,6,6


# GroupBy (pre-transformation)
This is an experiment to see if doing GroupBy by 'group_site' works before doing dataframe-level transformations.

In [18]:
# Import DataFrame
df = pd.read_csv("./data_source.csv")
# Add learning time (mins.)
df['LT_mins'] = df['visits'] * df['average_visit_duration'] / 60.0

# GroupBy 'group_site'...
df = df.groupby(['group_site', 'KA_initiative', 'endpoint_category', 'date']).sum()
df['average_visit_duration'] = df['LT_mins'] / df['visits']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,average_visit_duration,visits,LT_mins
group_site,KA_initiative,endpoint_category,date,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100Marks,TP,mobile-web,2015-06-01,2.884729,2.552191e+04,7.362378e+04
100Marks,TP,mobile-web,2015-07-01,,0.000000e+00,0.000000e+00
100Marks,TP,mobile-web,2015-08-01,0.281527,3.720373e+03,1.047387e+03
100Marks,TP,mobile-web,2015-09-01,,0.000000e+00,0.000000e+00
100Marks,TP,mobile-web,2015-10-01,,0.000000e+00,0.000000e+00
100Marks,TP,mobile-web,2015-11-01,0.117859,9.431491e+02,1.111590e+02
100Marks,TP,mobile-web,2015-12-01,0.089979,1.060558e+03,9.542817e+01
100Marks,TP,mobile-web,2016-01-01,,0.000000e+00,0.000000e+00
100Marks,TP,mobile-web,2016-02-01,,0.000000e+00,0.000000e+00
100Marks,TP,mobile-web,2016-03-01,,0.000000e+00,0.000000e+00


## Normalizing LT (monthly) by selected 'group_site'

In [19]:
# Create indexed version of data
indexed_df = df #.set_index(['group_site', 'KA_initiative', 'endpoint_category', 'date'])

# Fix 'lexsort' issue
indexed_df = indexed_df.sort_index()

# De-index 'df'
df.reset_index(inplace=True)

# Index of normalizer basis
index_stem = ['Quizlet', 'C&CL']

# Write a helper function to then apply to each rows 
def normalizer(df, x, index_stem, endpoint_category, date, column):
    normalizer_data = df.loc[tuple(index_stem + [endpoint_category] + [date]),['LT_mins']]
    return x / normalizer_data

# Execute apply
df['norm_LT'] = df.apply(lambda row: normalizer(indexed_df, row['LT_mins'], index_stem, row['endpoint_category'], row['date'], 'LT_mins'), axis=1)

# Check to see if normalizing site == 1.0 for 'norm_LT'
df[df['group_site'].isin(['Quizlet'])]

Unnamed: 0,group_site,KA_initiative,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT
4275,Quizlet,C&CL,mobile-web,2015-06-01,5.036608,2.050892e+07,1.032954e+08,1.0
4276,Quizlet,C&CL,mobile-web,2015-07-01,5.001175,1.399823e+07,7.000757e+07,1.0
4277,Quizlet,C&CL,mobile-web,2015-08-01,4.051026,1.891870e+07,7.664015e+07,1.0
4278,Quizlet,C&CL,mobile-web,2015-09-01,4.575195,4.247930e+07,1.943511e+08,1.0
4279,Quizlet,C&CL,mobile-web,2015-10-01,4.916221,4.474326e+07,2.199677e+08,1.0
4280,Quizlet,C&CL,mobile-web,2015-11-01,5.205021,4.292575e+07,2.234294e+08,1.0
4281,Quizlet,C&CL,mobile-web,2015-12-01,5.392054,3.183612e+07,1.716621e+08,1.0
4282,Quizlet,C&CL,mobile-web,2016-01-01,4.919872,4.296252e+07,2.113701e+08,1.0
4283,Quizlet,C&CL,mobile-web,2016-02-01,5.054554,4.925770e+07,2.489757e+08,1.0
4284,Quizlet,C&CL,mobile-web,2016-03-01,5.074896,5.218161e+07,2.648162e+08,1.0


In [20]:
# Check to see if aggregations maintained after de-indexing
df[(df['group_site'].isin(['Brainly'])) & (df['endpoint_category'] == 'total-traffic-and-engagement')]

Unnamed: 0,group_site,KA_initiative,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT
625,Brainly,C&CL,total-traffic-and-engagement,2015-06-01,5.849557,36846800.0,215537400.0,0.656711
626,Brainly,C&CL,total-traffic-and-engagement,2015-07-01,4.853453,16898240.0,82014810.0,0.31849
627,Brainly,C&CL,total-traffic-and-engagement,2015-08-01,5.414961,38077510.0,206188200.0,0.789097
628,Brainly,C&CL,total-traffic-and-engagement,2015-09-01,6.036758,107683500.0,650059000.0,1.094986
629,Brainly,C&CL,total-traffic-and-engagement,2015-10-01,6.40882,108144400.0,693078000.0,0.979154
630,Brainly,C&CL,total-traffic-and-engagement,2015-11-01,6.643833,110094000.0,731446100.0,1.09014
631,Brainly,C&CL,total-traffic-and-engagement,2015-12-01,6.377212,80076460.0,510664500.0,0.919304
632,Brainly,C&CL,total-traffic-and-engagement,2016-01-01,6.078016,86556540.0,526092000.0,0.95499
633,Brainly,C&CL,total-traffic-and-engagement,2016-02-01,6.524236,79248140.0,517033600.0,0.768594
634,Brainly,C&CL,total-traffic-and-engagement,2016-03-01,6.598297,97423550.0,642829400.0,0.914632


In [21]:
df.shape

(6225, 8)

## TTM Calculations (by selected 'group_site')
Adjusted aggregation functions.

In [22]:
def TTMer(df, func):

    indexed_df = df.set_index(['group_site',  'KA_initiative', 'endpoint_category', 'date'])

    # Save original indexed_df to join with calculated dataframe later
    og_indexed_df = indexed_df.copy()
    
    # Drop 'endpoint_category' to column; get keys to iterate through    
    indexed_df = indexed_df.reset_index(level=(-1,-2), inplace=False)
    indexed_df.sort_index(inplace=True)
    keys = list(set(indexed_df.index.values))

    # Turn off pandas' SettingWithCopyWarning
    pd.options.mode.chained_assignment = None

    # Do TTM calculations
    for key in keys:
        site_info = indexed_df.loc[key,:]
        site_info.set_index(['endpoint_category'], inplace=True)
        endpoint_categories = list(set(site_info.index.values))
        for endpoint_category in endpoint_categories:
            site_info.loc[endpoint_category] = func(site_info.loc[endpoint_category].rolling(window=12, min_periods=12))

    # Set indices for join
    indexed_df.reset_index(inplace=True)
    indexed_df.set_index(['group_site',  'KA_initiative', 'endpoint_category', 'date'], inplace=True)

    print('Successfully wrote TTM data!')
    return indexed_df

x_df = TTMer(df, pd.core.window.Rolling.sum)
x_df.loc['Brainly']

Successfully wrote TTM data!


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,average_visit_duration,visits,LT_mins,norm_LT
KA_initiative,endpoint_category,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C&CL,mobile-web,2015-06-01,,,,
C&CL,mobile-web,2015-07-01,,,,
C&CL,mobile-web,2015-08-01,,,,
C&CL,mobile-web,2015-09-01,,,,
C&CL,mobile-web,2015-10-01,,,,
C&CL,mobile-web,2015-11-01,,,,
C&CL,mobile-web,2015-12-01,,,,
C&CL,mobile-web,2016-01-01,,,,
C&CL,mobile-web,2016-02-01,,,,
C&CL,mobile-web,2016-03-01,,,,


In [23]:
def TTMdf_joiner(df, *args):

    indexed_df = df.set_index(['group_site',  'KA_initiative', 'endpoint_category', 'date'])
    
    # Generate 'TTMdf's; add to list
    TTMdf_dict = {}
    for func in args:
        TTMdf_dict[func.__name__] = TTMer(df, func)
    
    # Do joins into final dataframe
    for k, v in TTMdf_dict.items():
        indexed_df = indexed_df.join(v, how='left', rsuffix='_TTM_'+k)
        
    return indexed_df.reset_index()

monthly_and_TTM_table = TTMdf_joiner(df, pd.core.window.Rolling.sum)#, pd.core.window.Rolling.mean)
monthly_and_TTM_table[monthly_and_TTM_table['group_site'] == 'Brainly']

Successfully wrote TTM data!


Unnamed: 0,group_site,KA_initiative,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT,average_visit_duration_TTM_sum,visits_TTM_sum,LT_mins_TTM_sum,norm_LT_TTM_sum
600,Brainly,C&CL,mobile-web,2015-06-01,5.194398,1.745161e+07,9.065061e+07,0.877586,,,,
601,Brainly,C&CL,mobile-web,2015-07-01,4.497938,8.549472e+06,3.845499e+07,0.549298,,,,
602,Brainly,C&CL,mobile-web,2015-08-01,4.969602,2.373032e+07,1.179303e+08,1.538753,,,,
603,Brainly,C&CL,mobile-web,2015-09-01,4.987698,5.111538e+07,2.549481e+08,1.311791,,,,
604,Brainly,C&CL,mobile-web,2015-10-01,5.283996,4.938077e+07,2.609277e+08,1.186209,,,,
605,Brainly,C&CL,mobile-web,2015-11-01,5.551565,5.141593e+07,2.854388e+08,1.277535,,,,
606,Brainly,C&CL,mobile-web,2015-12-01,5.159008,3.590940e+07,1.852569e+08,1.079195,,,,
607,Brainly,C&CL,mobile-web,2016-01-01,5.277785,4.832333e+07,2.550401e+08,1.206605,,,,
608,Brainly,C&CL,mobile-web,2016-02-01,5.553420,3.960167e+07,2.199247e+08,0.883318,,,,
609,Brainly,C&CL,mobile-web,2016-03-01,5.726527,5.377788e+07,3.079605e+08,1.162922,,,,


In [24]:
monthly_and_TTM_table.shape

(6225, 12)

## % Y/Y Growth (by selected 'group_site')
Adjusted aggregate functions.

In [25]:
def yoyer(df):
    # Index the dataframe so only endpoints are column values
    indexed_df = df.set_index(['group_site', 'KA_initiative', 'endpoint_category', 'date'])
    indexed_df.sortlevel(inplace=True)

    #Helper function to perform % y/y calculation
    def yoy_calculator(df, row, col):
        key = list(row.name)
        curr_row_date = datetime.datetime.strptime(key.pop(), '%Y-%m-%d').date()
        base_date = curr_row_date + relativedelta(years=-1)
        base_key = tuple(key + [str(base_date)])
        if base_key in df.index:
            year_0 = df.loc[base_key, col]
            year_1 = df.loc[tuple(key+[str(curr_row_date)]), col]
            if year_0 != 0.0:
                try:
                    pct_chg = (year_1/year_0) - 1
                    return pct_chg
                except:
                    return
            else:
                return
        else:
            return

    # Execute apply of 'yoy_calculator' by row
    for col in indexed_df.columns:    
        indexed_df[col+'_pct_yoy'] = indexed_df.apply(lambda x: yoy_calculator(indexed_df, x, col), axis=1)

    # Return dataframe (unindexed)
    print('Successfully wrote YOY columns!')
    return indexed_df.reset_index()

%time final = yoyer(monthly_and_TTM_table)
final.set_index(['group_site',  'KA_initiative', 'endpoint_category', 'date'], inplace=True)
final.loc[('Brainly', 'C&CL', 'mobile-web',),('visits_TTM_sum','visits_TTM_sum_pct_yoy')]

Successfully wrote YOY columns!
CPU times: user 15.8 s, sys: 32.4 ms, total: 15.8 s
Wall time: 15.8 s


Unnamed: 0_level_0,visits_TTM_sum,visits_TTM_sum_pct_yoy
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-06-01,,
2015-07-01,,
2015-08-01,,
2015-09-01,,
2015-10-01,,
2015-11-01,,
2015-12-01,,
2016-01-01,,
2016-02-01,,
2016-03-01,,


## % M/M Growth (by selected 'group_site')
Adjusted 'YoYer' function for month-on-month growth (interesting for TTM trend lines).

In [28]:
def momer(df):
    # Index the dataframe so only endpoints are column values
    indexed_df = df.set_index(['group_site', 'KA_initiative', 'endpoint_category', 'date'])
    indexed_df.sortlevel(inplace=True)

    #Helper function to perform % m/m calculation
    def mom_calculator(df, row, col):
        key = list(row.name)
        curr_row_date = datetime.datetime.strptime(key.pop(), '%Y-%m-%d').date()
        base_date = curr_row_date + relativedelta(months=-1)
        base_key = tuple(key + [str(base_date)])
        if base_key in df.index:
            month_0 = df.loc[base_key, col]
            month_1 = df.loc[tuple(key+[str(curr_row_date)]), col]
            if month_0 != 0.0:
                try:
                    pct_chg = (month_1/month_0) - 1
                    return pct_chg
                except:
                    return
            else:
                return
        else:
            return

    # Execute apply of 'yoy_calculator' by row
    for col in indexed_df.columns:    
        indexed_df[col+'_pct_mom'] = indexed_df.apply(lambda x: mom_calculator(indexed_df, x, col), axis=1)

    # Return dataframe (unindexed)
    print('Successfully wrote MOM columns!')
    return indexed_df.reset_index()

%time final = momer(monthly_and_TTM_table)
final.set_index(['group_site',  'KA_initiative', 'endpoint_category', 'date'], inplace=True)
final.loc[('Brainly', 'C&CL', 'mobile-web',),('visits_TTM_sum','visits_TTM_sum_pct_mom')]

Successfully wrote MOM columns!
CPU times: user 23.8 s, sys: 27 ms, total: 23.9 s
Wall time: 23.9 s


Unnamed: 0_level_0,visits_TTM_sum,visits_TTM_sum_pct_mom
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-06-01,,
2015-07-01,,
2015-08-01,,
2015-09-01,,
2015-10-01,,
2015-11-01,,
2015-12-01,,
2016-01-01,,
2016-02-01,,
2016-03-01,,
