# Competitor Analysis Dashboard
## Data Source Audit / Sandbox

In [289]:
import pandas as pd

# Load the raw data into a DataFrame
df = pd.read_csv("./data_source.csv")

# Add learning time (mins.): visits x average visit duration, divided by 60
df['LT_mins'] = df['visits'].mul(df['average_visit_duration']).div(60.0)

# Examine rows that contain at least one missing value
nan_rows = df[df.isnull().any(axis=1)]
nan_rows

Unnamed: 0,group_site,KA_initiative,site_url,site_name,endpoint_category,date,average_visit_duration,visits,LT_mins
26,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2015-07-01,,0.0,
28,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2015-09-01,,0.0,
29,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2015-10-01,,0.0,
32,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2016-01-01,,0.0,
33,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2016-02-01,,0.0,
34,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2016-03-01,,0.0,
35,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2016-04-01,,0.0,
36,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2016-05-01,,0.0,
42,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2016-11-01,,0.0,
43,100Marks,TP,100marks.in,100Marks,total-traffic-and-engagement,2016-12-01,,0.0,


## Normalizing LT (monthly) by selected website

In [290]:
# Show the rows belonging to the site used as the normalization basis
norm_df = df.loc[df["group_site"].eq("Quizlet")]
norm_df

Unnamed: 0,group_site,KA_initiative,site_url,site_name,endpoint_category,date,average_visit_duration,visits,LT_mins
5475,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-06-01,302.196480,2.050892e+07,1.032954e+08
5476,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-07-01,300.070502,1.399823e+07,7.000757e+07
5477,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-08-01,243.061530,1.891870e+07,7.664015e+07
5478,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-09-01,274.511694,4.247930e+07,1.943511e+08
5479,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-10-01,294.973233,4.474326e+07,2.199677e+08
5480,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-11-01,312.301235,4.292575e+07,2.234294e+08
5481,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-12-01,323.523261,3.183612e+07,1.716621e+08
5482,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-01-01,295.192330,4.296252e+07,2.113701e+08
5483,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-02-01,303.273254,4.925770e+07,2.489757e+08
5484,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-03-01,304.493741,5.218161e+07,2.648162e+08


In [291]:
# Indexed version of the data; the index is sorted straight away to avoid the 'lexsort' issue
indexed_df = df.set_index(
    ['group_site', 'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date']
).sort_index()

# Leading index values that identify the normalizer basis
index_stem = ['Quizlet', 'C&CL', 'Quizlet', 'quizlet.com']

# Helper function, applied to each row further down
def normalizer(df, x, index_stem, endpoint_category, date, column):
    """Divide ``x`` by the normalizer basis value looked up in ``df``.

    The basis row is addressed with the key
    ``tuple(index_stem) + (endpoint_category, date)`` and the basis value is
    read from ``column`` of that row.

    Parameters
    ----------
    df : DataFrame
        Frame to look the basis value up in; its row index must accept the
        key described above.
    x : scalar
        Value to normalize.
    index_stem : sequence
        Leading index values that identify the normalizer basis.
    endpoint_category, date
        Remaining index values, appended to ``index_stem`` in that order.
    column : str
        Name of the column holding the basis value. (Previously this argument
        was ignored and 'LT_mins' was always used.)

    Returns
    -------
    The result of ``x / df.loc[key, [column]]``. When the key addresses a
    single row this is a one-element Series labelled ``column``.
    """
    key = tuple(list(index_stem) + [endpoint_category, date])
    # The column is selected as a one-item list so the result keeps the same
    # shape the helper has always returned.
    normalizer_data = df.loc[key, [column]]
    return x / normalizer_data

# Normalize every row's LT_mins against the basis row with the same endpoint category and date
df['norm_LT'] = df.apply(
    lambda row: normalizer(
        df=indexed_df,
        x=row['LT_mins'],
        index_stem=index_stem,
        endpoint_category=row['endpoint_category'],
        date=row['date'],
        column='LT_mins',
    ),
    axis=1,
)

# Sanity check: 'norm_LT' should be 1.0 for the normalizing site itself
df.loc[df['group_site'].isin(['Quizlet'])]

Unnamed: 0,group_site,KA_initiative,site_url,site_name,endpoint_category,date,average_visit_duration,visits,LT_mins,norm_LT
5475,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-06-01,302.196480,2.050892e+07,1.032954e+08,1.0
5476,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-07-01,300.070502,1.399823e+07,7.000757e+07,1.0
5477,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-08-01,243.061530,1.891870e+07,7.664015e+07,1.0
5478,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-09-01,274.511694,4.247930e+07,1.943511e+08,1.0
5479,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-10-01,294.973233,4.474326e+07,2.199677e+08,1.0
5480,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-11-01,312.301235,4.292575e+07,2.234294e+08,1.0
5481,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2015-12-01,323.523261,3.183612e+07,1.716621e+08,1.0
5482,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-01-01,295.192330,4.296252e+07,2.113701e+08,1.0
5483,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-02-01,303.273254,4.925770e+07,2.489757e+08,1.0
5484,Quizlet,C&CL,quizlet.com,Quizlet,mobile-web,2016-03-01,304.493741,5.218161e+07,2.648162e+08,1.0


## TTM Calculations
See below for a 'proof of concept' that performs the calculation for a single site only.

_Check datum_: **Quizlet,...,mobile-web, 2017-05-01**

In [292]:
# Move the last index level (the dates) out of the index and back into a column
indexed_df2 = indexed_df.reset_index(level=-1)

In [293]:
# Proof of concept: select the rows of a single site
quizlet_key = ('Quizlet', 'C&CL', 'Quizlet', 'quizlet.com')
mini_df = indexed_df2.loc[quizlet_key, :]
mini_df

Unnamed: 0_level_0,date,average_visit_duration,visits,LT_mins
endpoint_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mobile-web,2015-06-01,302.196480,2.050892e+07,1.032954e+08
mobile-web,2015-07-01,300.070502,1.399823e+07,7.000757e+07
mobile-web,2015-08-01,243.061530,1.891870e+07,7.664015e+07
mobile-web,2015-09-01,274.511694,4.247930e+07,1.943511e+08
mobile-web,2015-10-01,294.973233,4.474326e+07,2.199677e+08
mobile-web,2015-11-01,312.301235,4.292575e+07,2.234294e+08
mobile-web,2015-12-01,323.523261,3.183612e+07,1.716621e+08
mobile-web,2016-01-01,295.192330,4.296252e+07,2.113701e+08
mobile-web,2016-02-01,303.273254,4.925770e+07,2.489757e+08
mobile-web,2016-03-01,304.493741,5.218161e+07,2.648162e+08


In [294]:
# Proof of concept (cont.): 12-row rolling sum over the 'mobile-web' rows,
# then look up the check datum
test = mini_df.loc['mobile-web'].rolling(window=12, min_periods=12).sum()
mobile_ttm = test.loc['mobile-web']
mobile_ttm[mobile_ttm['date'] == '2017-05-01']

Unnamed: 0_level_0,date,average_visit_duration,visits,LT_mins
endpoint_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mobile-web,2017-05-01,2692.547347,474056600.0,1801160000.0


### TTM (Sum): Method 0

In [295]:
# Calculates rolling TTM sums over the whole frame in one go...but the window runs on.
# Fix by NaN-ing the first 11 entries? Will have problems for missing dates!
TTMsum_df = indexed_df.rolling(window=12, min_periods=12).sum().sort_index()
check_key = ('Quizlet', 'C&CL', 'Quizlet', 'quizlet.com', 'mobile-web', '2017-05-01')
TTMsum_df.loc[check_key, :]

average_visit_duration    2.692547e+03
visits                    4.740566e+08
LT_mins                   1.801160e+09
Name: (Quizlet, C&CL, Quizlet, quizlet.com, mobile-web, 2017-05-01), dtype: float64

In [296]:
# Cross-check of the TTM sum: add up 2016-06-01 through 2017-05-01 directly
# (should be the same as the cell above)
indexed_df.loc[
    pd.IndexSlice['Quizlet', :, :, :, 'mobile-web', '2016-06-01':'2017-05-01'],
    :,
].sum()

average_visit_duration    2.692547e+03
visits                    4.740566e+08
LT_mins                   1.801160e+09
dtype: float64

### TTM (Sum): Method 1
Extending the 'proof of concept' to the entire data set.

In [297]:
# Drop 'endpoint_category' to column. The calculation below no longer iterates over
# `keys` or writes into `indexed_df3` (so `indexed_df3` keeps the raw monthly values);
# both names are still defined in case other cells refer to them.
indexed_df3 = indexed_df2.reset_index(level=-1, inplace=False)
keys = list(set(indexed_df3.index.values))

# Turn off SettingWithCopyWarning (setting left unchanged; nothing in this cell relies on it)
pd.options.mode.chained_assignment = None

# Do TTMsum calculations: a 12-row rolling sum computed separately for every
# (site, endpoint_category) group, so a window never spans two groups.
# The earlier per-key loop assigned into a `.loc` slice of `indexed_df3` and only
# produced results if that slice happened to be a view; groupby/transform does not
# depend on view-vs-copy behaviour.
ttm_group_cols = ['group_site', 'KA_initiative', 'site_name', 'site_url', 'endpoint_category']
ttm_value_cols = ['average_visit_duration', 'visits', 'LT_mins']

TTMsum_df = indexed_df3.reset_index()
TTMsum_df[ttm_value_cols] = (
    TTMsum_df
    .groupby(ttm_group_cols, sort=False)[ttm_value_cols]
    .transform(lambda values: values.rolling(window=12, min_periods=12).sum())
)
TTMsum_df['average_visit_duration_mins'] = TTMsum_df['LT_mins'] / TTMsum_df['visits']
TTMsum_df = TTMsum_df.set_index(['date'])

# Build the mask from TTMsum_df itself; the previous mask used TTMavg_df, which is
# not defined until a later cell.
TTMsum_df[(TTMsum_df['group_site'] == 'Quizlet') & (TTMsum_df['endpoint_category'] == 'mobile-web')]

Unnamed: 0_level_0,group_site,KA_initiative,site_name,site_url,endpoint_category,average_visit_duration,visits,LT_mins,average_visit_duration_mins
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-06-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-07-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-08-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-09-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-10-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-11-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-12-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-01-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-02-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-03-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,


In [298]:
# The first 11 months (everything up to 2016-04-01) should contain no TTM sums
x = TTMsum_df.sort_index()
x = x.loc[:'2016-04-01', ['visits', 'average_visit_duration', 'LT_mins']]
x.isnull().all()

visits                    True
average_visit_duration    True
LT_mins                   True
dtype: bool

In [299]:
# Count the records per site / endpoint category to check every group covers the same number of dates
date_check_df = df.groupby(
    ['group_site', 'KA_initiative', 'site_name', 'site_url', 'endpoint_category']
).count()
date_check_df['date'].describe()

count    297.0
mean      25.0
std        0.0
min       25.0
25%       25.0
50%       25.0
75%       25.0
max       25.0
Name: date, dtype: float64

### TTM (Average): Method 1

In [300]:
# Drop 'endpoint_category' to column. The calculation below no longer iterates over
# `keys` or writes into `indexed_df3` (so `indexed_df3` keeps the raw monthly values);
# both names are still defined in case other cells refer to them.
indexed_df3 = indexed_df2.reset_index(level=-1, inplace=False)
keys = list(set(indexed_df3.index.values))

# Turn off SettingWithCopyWarning (setting left unchanged; nothing in this cell relies on it)
pd.options.mode.chained_assignment = None

# Do TTMavg calculations: a 12-row rolling mean computed separately for every
# (site, endpoint_category) group, so a window never spans two groups.
# The earlier per-key loop assigned into a `.loc` slice of `indexed_df3` and only
# produced results if that slice happened to be a view; groupby/transform does not
# depend on view-vs-copy behaviour.
ttm_group_cols = ['group_site', 'KA_initiative', 'site_name', 'site_url', 'endpoint_category']
ttm_value_cols = ['average_visit_duration', 'visits', 'LT_mins']

TTMavg_df = indexed_df3.reset_index()
TTMavg_df[ttm_value_cols] = (
    TTMavg_df
    .groupby(ttm_group_cols, sort=False)[ttm_value_cols]
    .transform(lambda values: values.rolling(window=12, min_periods=12).mean())
)
TTMavg_df['average_visit_duration_mins'] = TTMavg_df['LT_mins'] / TTMavg_df['visits']
TTMavg_df = TTMavg_df.set_index(['date'])
TTMavg_df[(TTMavg_df['group_site'] == 'Quizlet') & (TTMavg_df['endpoint_category'] == 'mobile-web')]

Unnamed: 0_level_0,group_site,KA_initiative,site_name,site_url,endpoint_category,average_visit_duration,visits,LT_mins,average_visit_duration_mins
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-06-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-07-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-08-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-09-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-10-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-11-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2015-12-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-01-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-02-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,
2016-03-01,Quizlet,C&CL,Quizlet,quizlet.com,mobile-web,,,,


In [301]:
# The first 11 months (everything up to 2016-04-01) should contain no TTM averages
y = TTMavg_df.sort_index()
y = y.loc[:'2016-04-01', ['visits', 'average_visit_duration', 'LT_mins']]
y.isnull().all()

visits                    True
average_visit_duration    True
LT_mins                   True
dtype: bool

## % Y/Y Growth (TTM, One Month)

In [302]:
# Parse the 'date' strings into datetime values (overwrites the column on df)
df['date'] = pd.to_datetime(df['date'])

# Rebuild the indexed dataframe from the updated df
indexed_df = df.set_index(
    ['group_site', 'KA_initiative', 'site_name', 'site_url', 'endpoint_category', 'date']
)
indexed_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,average_visit_duration,visits,LT_mins,norm_LT
group_site,KA_initiative,site_name,site_url,endpoint_category,date,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100Marks,TP,100Marks,100marks.in,mobile-web,2015-06-01,173.083726,2.552191e+04,7.362378e+04,7.127497e-04
100Marks,TP,100Marks,100marks.in,mobile-web,2015-07-01,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
100Marks,TP,100Marks,100marks.in,mobile-web,2015-08-01,16.891649,3.720373e+03,1.047387e+03,1.366630e-05
100Marks,TP,100Marks,100marks.in,mobile-web,2015-09-01,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
100Marks,TP,100Marks,100marks.in,mobile-web,2015-10-01,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
100Marks,TP,100Marks,100marks.in,mobile-web,2015-11-01,7.071567,9.431491e+02,1.111590e+02,4.975130e-07
100Marks,TP,100Marks,100marks.in,mobile-web,2015-12-01,5.398753,1.060558e+03,9.542817e+01,5.559072e-07
100Marks,TP,100Marks,100marks.in,mobile-web,2016-01-01,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
100Marks,TP,100Marks,100marks.in,mobile-web,2016-02-01,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
100Marks,TP,100Marks,100marks.in,mobile-web,2016-03-01,0.000000,0.000000e+00,0.000000e+00,0.000000e+00
