In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 25 14:52:28 2018

@author: Freddie Zhang
"""

import pandas as pd
import statsmodels.api as sm
import seaborn as sns

# Load the raw Facebook ad export.
df = pd.read_csv('Travel Pony Facebook.csv')

# 1. First, remove rows that have 0 amount spent.
# .copy() makes df_clean an independent frame, so code that modifies it
# afterwards does not trigger pandas' SettingWithCopyWarning.
df_clean = df[df['Amount Spent (USD)'] != 0].copy()

# Next, graph a histogram of Amount Spent using Seaborn.
# histplot replaces the deprecated distplot; kde=True and stat='density'
# reproduce distplot's density-normalised histogram with a KDE overlay.
sns.set_style('darkgrid')
sns.histplot(df_clean['Amount Spent (USD)'], kde=True, stat='density')

In [None]:
# 2. Build a multiple regression
# where the outcome variable is Amount Spent and the predictor variables are:
# Campaign Name,Reach,Frequency,Impressions,Clicks,Unique Clicks,Page Likes,Page Engagement,
# Post Engagement,Post Likes,Post Comments,Post Shares,Photo Views,Website Clicks

# Normalise the campaign names on a standalone Series so that no assignment is
# made into a frame that may itself be a slice of df.
campaign = df_clean['Campaign Name'].astype('str')

# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently skip the cleaning.
# Raw strings avoid invalid-escape warnings for \s and \d.
campaign = campaign.str.replace(r'[^a-zA-Z0-9:\s+]', '', regex=True)  # remove _ and other symbols
campaign = campaign.str.replace(r'\d+', '', regex=True)  # remove numbers

# Collapse name variants into one category each. Order matters: a name that
# matches 'travelpony' is relabelled first and is then no longer tested
# against 'post'. Whitespace is intentionally not stripped, so a category
# such as 'Ad ' keeps its trailing space.
campaign.loc[campaign.str.contains('travelpony', case=False)] = 'TravelPony'
campaign.loc[campaign.str.contains('post', case=False)] = 'Post'

# One indicator column per campaign category; dtype=float keeps the columns
# numeric (pandas >= 2.0 would otherwise emit bool columns, which produce an
# object-dtype design matrix when mixed with the numeric predictors).
campaign_name = pd.get_dummies(campaign, dtype=float)
df_clean = pd.concat(
    [df_clean.assign(**{'Campaign Name': campaign}), campaign_name],
    axis=1,
)

# Outcome (dependent variable): amount spent per row.
DV = df_clean['Amount Spent (USD)']

# Predictors (independent variables): the campaign indicator columns followed
# by the numeric ad metrics.
IV = df_clean[[
    # campaign indicator columns ('Ad ' really does end with a space)
    'Ad ',
    'Cyber Weekend',
    'Nanigans',
    'Post',
    'SimpleMultiFillCampaign',
    'TravelPony',
    # numeric ad metrics
    'Reach',
    'Frequency',
    'Impressions',
    'Clicks',
    'Unique Clicks',
    'Page Likes',
    'Page Engagement',
    'Post Engagement',
    'Post Likes',
    'Post Comments',
    'Post Shares',
    'Photo Views',
    'Website Clicks',
]]

# NOTE(review): no constant column is added to IV; presumably the campaign
# indicators cover every row and together play the role of the intercept --
# confirm.
model = sm.OLS(DV, IV).fit()
predictions = model.predict(IV)  # in-sample fitted values from the model

# Print explicitly: a bare `model.summary()` in the middle of a cell/script is
# evaluated and thrown away without being shown.
print(model.summary())

# What are the three predictors with the highest predictive value (rank them, please).
# Only predictors with p-value <= 0.05 are kept.
# NOTE(review): the ranking uses the signed, unstandardised coefficient, so it
# depends on each predictor's scale -- confirm this is the intended measure.

# Coefficients and p-values side by side, indexed by predictor name.
ModelSummary = pd.concat([model.params, model.pvalues], axis=1)
ModelSummary.columns = ['Coef', 'Pvalue']

# Keep the significant predictors, largest coefficient first.
ModelSummary = ModelSummary[ModelSummary['Pvalue'] <= 0.05].sort_values('Coef', ascending=False)

# Rank 1 = largest coefficient.
ModelSummary['Rank'] = ModelSummary['Coef'].rank(ascending=False).astype(int)
ModelSummary

# I removed variables with p-value > 0.05 since they are not significant. Therefore, the
# three predictors with the highest predictive values are Page Likes (24.450740),
# Post Engagement (18.024142), and Post Shares (4.933423).