### Packages and Data

In [89]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import sklearn
from sklearn.metrics import confusion_matrix, classification_report
import json
import re

In [42]:
rev = pd.read_csv("InterviewData_Cost.csv")
cost = pd.read_csv("InterviewData_Rev.csv")
df = pd.read_csv("InterviewData_Activity.csv")

### Question 1

In [73]:
merged = cost.merge(rev, how = "outer", on = ["date", "source_id"])

In [20]:
merged.head()

Unnamed: 0,date,source_id,revenue,cost
0,8/1/14,PA0368,5717.0,
1,1/31/14,PA0277,1380.0,
2,6/9/14,PA0745,7535.0,588.0
3,9/1/14,PA0751,2868.0,3736.0
4,3/12/14,PA0859,10757.0,1391.0


### Question 2

In [74]:
no_rev = merged[(merged["cost"].notnull() & merged["revenue"].isnull())]

In [75]:
no_rev.head()

Unnamed: 0,date,source_id,revenue,cost
1968,4/15/14,PA0169,,8511.0
2445,2/13/14,PA0293,,8242.0
2903,8/20/14,PA0467,,473.0
3118,9/30/14,PA0973,,5178.0
3425,1/23/14,PA0308,,7991.0


### Question 3

In [76]:
merged.groupby("source_id")["revenue"].sum().sort_values(ascending = False).head(4)

source_id
PA0527    1385747.0
PA0308    1338615.0
PA0352    1309685.0
PA0552    1283190.0
Name: revenue, dtype: float64

To visualize, I would use a sns.countplot and add percent to total. We could dig further into the dates by creating bins and adding dates to the hue parameter. 

### Question 4

In [53]:
dummy_genders = pd.get_dummies(df['gender'], prefix = 'gender')
dummy_metro = pd.get_dummies(df['metropolitan_area'], prefix = 'metro_area')
dummy_device = pd.get_dummies(df['device_type'], prefix = 'device')
cols_to_keep = ['active', 'age']
activity_data = df[cols_to_keep].join(dummy_genders.loc[:, 'gender_M':])
activity_data = activity_data.join(dummy_metro.loc[:, 'metro_area_Birmingham':])
activity_data = activity_data.join(dummy_device.loc[:, 'device_Mobile':])
activity_data = sm.add_constant(activity_data, prepend=False)
explanatory_cols = activity_data.columns[1:]
full_logit_model = sm.GLM(activity_data['active'], activity_data[explanatory_cols], family=sm.families.Binomial())
result = full_logit_model.fit()

In [54]:
result.summary(activity_data)

0,1,2,3
Dep. Variable:,active,No. Observations:,5420.0
Model:,GLM,Df Residuals:,5408.0
Model Family:,Binomial,Df Model:,11.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-3613.1
Date:,"Sat, 14 Nov 2020",Deviance:,7226.3
Time:,14:47:42,Pearson chi2:,5380.0
No. Iterations:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
age,0.0136,0.003,5.180,0.000,0.008,0.019
gender_M,-0.6103,0.083,-7.343,0.000,-0.773,-0.447
metro_area_Birmingham,-0.0547,0.095,-0.576,0.564,-0.241,0.131
metro_area_Charlotte,-1.8619,0.337,-5.529,0.000,-2.522,-1.202
metro_area_Detroit,-0.0792,0.115,-0.689,0.491,-0.304,0.146
metro_area_Houston,-0.4496,0.093,-4.850,0.000,-0.631,-0.268
metro_area_Mobile,-1.7244,0.259,-6.655,0.000,-2.232,-1.217
metro_area_Nashville,22.4506,1.35e+04,0.002,0.999,-2.64e+04,2.64e+04
metro_area_Tampa,0.1370,0.104,1.312,0.189,-0.068,0.342


In [82]:
predictions = result.predict(activity_data[explanatory_cols])
prediction_results = [ 1 if x < 0.5 else 0 for x in predictions]
print("Confusion Matrix")
print(metrics.confusion_matrix(activity_data["active"], prediction_results))
print(classification_report(activity_data["active"], prediction_results, digits = 3))

Confusion Matrix
[[1164 1626]
 [1521 1109]]
              precision    recall  f1-score   support

           0      0.434     0.417     0.425      2790
           1      0.405     0.422     0.413      2630

    accuracy                          0.419      5420
   macro avg      0.420     0.419     0.419      5420
weighted avg      0.420     0.419     0.419      5420



### Answer
---
Accuracy of model is ~42%

### Question 5

In [83]:
training_data = activity_data[1:4000]
test_data = activity_data[4001:].copy()
training_logit_model = sm.GLM(training_data['active'], training_data[explanatory_cols], family=sm.families.Binomial())

training_result = training_logit_model.fit()

In [85]:
training_result.summary()

0,1,2,3
Dep. Variable:,active,No. Observations:,3999.0
Model:,GLM,Df Residuals:,3987.0
Model Family:,Binomial,Df Model:,11.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2554.6
Date:,"Sat, 14 Nov 2020",Deviance:,5109.3
Time:,16:44:06,Pearson chi2:,3970.0
No. Iterations:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
age,0.0071,0.003,2.227,0.026,0.001,0.013
gender_M,-0.5802,0.097,-5.981,0.000,-0.770,-0.390
metro_area_Birmingham,-0.1187,0.115,-1.037,0.300,-0.343,0.106
metro_area_Charlotte,-1.7834,0.382,-4.663,0.000,-2.533,-1.034
metro_area_Detroit,-0.1390,0.138,-1.007,0.314,-0.409,0.132
metro_area_Houston,-0.4865,0.112,-4.358,0.000,-0.705,-0.268
metro_area_Mobile,-1.7606,0.284,-6.202,0.000,-2.317,-1.204
metro_area_Nashville,21.8608,1.33e+04,0.002,0.999,-2.6e+04,2.6e+04
metro_area_Tampa,0.1892,0.127,1.484,0.138,-0.061,0.439


In [84]:
predictions2 = training_result.predict(test_data[explanatory_cols])
prediction_results2 = [ 1 if x < 0.5 else 0 for x in predictions2]
print("Confusion Matrix")
print(metrics.confusion_matrix(test_data["active"], prediction_results2))
print(classification_report(test_data["active"], prediction_results2, digits = 3))

Confusion Matrix
[[1112  178]
 [ 121    8]]
              precision    recall  f1-score   support

           0      0.902     0.862     0.881      1290
           1      0.043     0.062     0.051       129

    accuracy                          0.789      1419
   macro avg      0.472     0.462     0.466      1419
weighted avg      0.824     0.789     0.806      1419



### Answer
---
It seems as if the test data was heavily skewed towards being Non-Active(see confusion matrices), therefor heavily skewing the model created with the training data(since so many Non-Active values were omitted). 

### Question 6

In [265]:
data = pd.read_csv("InterviewData_Parsing.csv")

In [266]:
def parse(element):
    element = element.split(";")
    element[0] = element[0].replace("value", "")
    for i in range(4):
        element[i] = re.sub('[\W_]+', '', element[i])
    return element
data["data_to_parse"] = data["data_to_parse"].apply(parse)
for i in range(4):
    data[str(i)] = data["data_to_parse"][0][i]

In [271]:
data["data_to_parse"] = data["data_to_parse"].apply(parse)
for i in range(4):
    data[str(i)] = data["data_to_parse"][0][i]
data

Unnamed: 0,userid,data_to_parse,0,1,2,3
0,54f3ad9a29ada,"[N, U, A7, W]",N,U,A7,W
1,54f69f2de6aec,"[N, U, I6, W]",N,U,A7,W
2,54f650f004474,"[Y, U, A7, W]",N,U,A7,W
3,54f52e8872227,"[N, U, I1, W]",N,U,A7,W
4,54f64d3075b72,"[Y, U, A7, W]",N,U,A7,W
...,...,...,...,...,...,...
948,54f5eb32d1a5b,"[N, U, A1, W]",N,U,A7,W
949,54f34bd1a812a,"[N, C, A2, L]",N,U,A7,W
950,54f34aa1e1f00,"[Y, U, A1, W]",N,U,A7,W
951,54f47d97846bc,"[N, U, I4, L]",N,U,A7,W


### Additional Question (A)

> Goal: To compare effectiveness between two product trees on our website. 

> Recommended Test: Conduct an A/B test between the two versions for 4 weeks. 

> Metrics and Evaluation: Newer version is successful if we see >10% increase in revenue with an alpha of .05 or greater than 95% confidence.

> Considerations: If product/website changes are frequent, and we don't already have a conversion rate optimization tool, I would recommend trialing/investing in one. If we don't have the time or resources, I can calculate the frequency tables, find the difference in conversion rates, and calculate the P-value. 