# 'Applied' Exercise

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# value = 6 corresponds to refusal to answer (6 appears nowhere else in the
# data), so it is read as missing together with the -999 sentinel.
cable = pd.read_csv(
    './Data/training.csv',
    na_values=(-999, 6),
)

# Adjust the Feature Set

In [None]:
def CleanCableData(df):
    """Return a cleaned copy of a cable-survey frame.

    Steps, in order:
      1. remove the columns listed in ``drop`` (those that are present),
      2. shift ``value`` down by 3 (per the original note this normalises
         the scale to -2..+2),
      3. discard every row that still contains a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain a ``value`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input is left untouched; the previous version
        wrote the shifted ``value`` column back into the caller's frame.
    """
    drop = ['YES', 'ID', 'age', 'class', 'tele_have', 'ab', 'c1', 'c2', 'd', 'de']

    # Explicit copy so the assignment below never reaches the caller's data.
    cleaned = df[[col for col in df.columns if col not in drop]].copy()
    cleaned['value'] = cleaned['value'] - 3  # Normalize (-2 to +2)

    return cleaned.dropna()

In [None]:
# Replace the raw training frame with its cleaned version.
cable = CleanCableData(cable)
# No explicit constant column is added to the training features (kept disabled):
# cable['constant'] = [1 for i in range(len(cable))]

In [None]:
# Target as a one-column frame; features are every column except 'buy'.
y = cable['buy'].to_frame()
X = cable.loc[:, [name for name in cable.columns if name != 'buy']]

# Model Estimation & Display

In [None]:
# Unfitted classifier built with the library's default settings.
# NOTE(review): scikit-learn's defaults are understood to include an L2 penalty
# and a fitted intercept — confirm for the installed version, since both affect
# how coef_ should be interpreted further down.
Model = LogisticRegression()

In [None]:
# y is a one-column frame at this point; flatten it so the estimator receives
# the 1-D target it expects instead of an (n, 1) column vector.
Model.fit(X, np.ravel(y))

In [None]:
# Map each feature name to its fitted coefficient (first row of coef_).
coefficients = dict(zip(X.columns, Model.coef_[0]))

In [None]:
#coefficients

In [None]:
#sse = np.sum((Model.predict(X_arr) - y_arr) ** 2, axis=0) / float(X_arr.shape[0] - X_arr.shape[1])

#sse = sse.reshape(sse.shape[0], 1)

#se = np.diagonal(sse)

In [None]:
# Rebinds `coefficients` (a name -> value dict in the earlier cell) to the raw
# 1-D coefficient array, in the column order of X.
coefficients = Model.coef_[0]

In [None]:
# Standard errors of the fitted coefficients.
#
# np.cov(X.T) is the sample covariance of the *features*; its diagonal gives
# feature variances, not the sampling variance of the estimates. For a logistic
# model the asymptotic covariance of the estimates is the inverse Fisher
# information (D' W D)^-1, with W = diag(p * (1 - p)) and D the design matrix.
# NOTE(review): this ignores any penalty applied during fitting, so treat the
# result as an approximation if the model is regularised.
fitted_p = Model.predict_proba(X)[:, 1]
design_matrix = np.column_stack([np.ones(len(X)), np.asarray(X, dtype=float)])  # intercept first
observation_weights = fitted_p * (1.0 - fitted_p)
fisher_information = np.dot(design_matrix.T, design_matrix * observation_weights[:, np.newaxis])

# Invert jointly with the intercept, then keep only the slope block so the
# matrix lines up with Model.coef_[0] / X.columns.
var_cov_matrix = np.linalg.inv(fisher_information)[1:, 1:]

se = np.sqrt(np.diag(var_cov_matrix))

In [None]:
# Inspect the per-feature values held in `se`.
se

In [None]:
def tstat(estimate, se):
    """Return ``estimate`` divided by ``se`` (element-wise for arrays)."""
    ratio = estimate / se
    return ratio

In [None]:
# One t-statistic per coefficient. Model.coef_ is 2-D with a single row, so
# mapping over it together with `se` paired the whole row with se[0] only and
# divided every coefficient by the first standard error. Divide element-wise.
tstats = tstat(Model.coef_[0], se)

In [None]:
# Diagonal of var_cov_matrix, i.e. the values whose square roots form `se`.
np.diag(var_cov_matrix)

In [None]:
# One row per feature: estimate, standard error and t-statistic.
summary = pd.DataFrame(
    data={
        'Coefficients': coefficients,
        'SE': se,
        't-stat': tstats,
    },
)

In [None]:
# Show the summary labelled by feature name. The result is only displayed, not
# assigned, so `summary` itself keeps its existing index.
summary.set_index(X.columns)

In [None]:
# Simulating the model across the training set

# Second column of predict_proba: the probability assigned to Model.classes_[1].
predictions = list(Model.predict_proba(X)[:, 1])

# The predictions are deliberately kept out of X so that X keeps exactly the
# columns the model was fitted on.

# predicted avg p, actual avg p for training set
# (select 'buy' by label: y.mean()[0] was a positional lookup on a Series
# indexed by column name)
sum(predictions) / len(predictions), y['buy'].mean()

In [None]:
# Showing a plot of the predictions in no particular order -> observe higher density in the 'no buy' space

fig = plt.figure(figsize=(40, 30))
ax1 = fig.add_subplot(111)

# The fitted probabilities live in `predictions`; X has no 'predictions'
# column (its assignment is commented out), so X['predictions'] raised KeyError.
ax1.scatter(list(X.index), list(y["buy"]), label='actual buy')
ax1.scatter(list(X.index), list(predictions), label='predicted probability')
ax1.set_xlabel('observation index')
ax1.set_ylabel('buy / predicted probability')
ax1.legend()

plt.show()

In [None]:
# Holdout sample, read with the same missing-value codes as the training file.
cable_holdout = pd.read_csv(
    './Data/holdout.csv',
    na_values=(-999, 6),
)

In [None]:
cable_holdout = CleanCableData(cable_holdout)
# No 'constant' column is added: the training features were built without one
# (that line is commented out there), so an extra column here would give the
# holdout a different feature set from the one the model was fitted on.

In [None]:
# Holdout target as a one-column frame; features are every column except 'buy'.
y_2 = cable_holdout['buy'].to_frame()
X_2 = cable_holdout.loc[:, [name for name in cable_holdout.columns if name != 'buy']]

In [None]:
# Second column of predict_proba: the probability assigned to Model.classes_[1].
predictions_2 = list(Model.predict_proba(X_2)[:, 1])

# predicted avg p, actual avg p for holdout
# (select 'buy' by label: y_2.mean()[0] was a positional lookup on a Series
# indexed by column name)
sum(predictions_2) / len(predictions_2), y_2['buy'].mean()

In [None]:
# Total log-likelihood of the training labels. log_loss averages over
# observations unless told otherwise; normalize=False returns the sum, which is
# the quantity the hand-written log-likelihood below is compared against.
LLH = -log_loss(y, predictions, normalize=False)

In [None]:
def log_likelihood(X, targets, coefs, intercept=0.0):
    """Summed Bernoulli log-likelihood of a logistic model.

    Computes ``sum(t * s - log(1 + exp(s)))`` where ``s = X . coefs + intercept``.

    Parameters
    ----------
    X : array-like, shape (n, k)
        Feature matrix.
    targets : array-like with n entries
        0/1 outcomes; a 1-D sequence or an (n, 1) column are both accepted.
    coefs : array-like, shape (k,) or (k, 1)
        Coefficient vector.
    intercept : float, optional
        Constant added to every score. Defaults to 0.0, which reproduces the
        original three-argument behaviour.

    Returns
    -------
    float
        The log-likelihood summed over all observations.

    Raises
    ------
    ValueError
        If the number of targets differs from the number of scores.

    Notes
    -----
    A later cell in this notebook defines another function with the same name
    but a ``(y, y_pred)`` signature; whichever ran last is the one in scope.
    """
    # Flatten both operands: multiplying a 1-D target by an (n, 1) score column
    # would otherwise broadcast to an (n, n) matrix and inflate the sum.
    scores = np.ravel(np.dot(X, coefs)) + intercept  # y hat (linear predictor)
    labels = np.ravel(targets)
    if labels.shape != scores.shape:
        raise ValueError('targets and X must describe the same number of observations')

    # logaddexp(0, s) == log(1 + exp(s)) but does not overflow for large s.
    ll = np.sum(labels * scores - np.logaddexp(0, scores))

    return ll

In [None]:
# Score with the fitted intercept as well as the slopes: prepend a column of
# ones to the features and the intercept to the coefficient column. Using
# Model.coef_ alone leaves the intercept out of every score.
X_with_const = np.column_stack([np.ones(len(X)), np.asarray(X, dtype=float)])
params_with_const = np.concatenate([np.ravel(Model.intercept_), Model.coef_[0]]).reshape(-1, 1)
ll = log_likelihood(X_with_const, y, params_with_const)

In [None]:
# Side by side: the log_loss-based value and the hand-computed one.
LLH, ll

In [None]:
# Shape check for the features, the target and the coefficient array.
X.shape, y.shape, Model.coef_.shape

In [None]:
def log_likelihood(y, y_pred, eps=1e-15):
    """Summed Bernoulli log-likelihood of outcomes under predicted probabilities.

    Computes ``sum(y * log(p) + (1 - y) * log(1 - p))``.

    Parameters
    ----------
    y : array-like
        Observed outcomes (0/1). Read by position, so a pandas Series with a
        non-default index works; the previous ``y[obs]`` lookup was label-based
        and failed on such a Series.
    y_pred : array-like
        Predicted probability of the outcome being 1, same length as ``y``.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` so that a prediction of
        exactly 0 or 1 gives a finite value instead of ``-inf`` or ``nan``.

    Returns
    -------
    float
        The summed log-likelihood (0.0 for empty input).

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` differ in length.

    Notes
    -----
    This shares its name with the earlier ``log_likelihood(X, targets, coefs)``
    in this notebook and replaces it once this cell has run.
    """
    labels = np.asarray(y, dtype=float).ravel()
    probs = np.asarray(y_pred, dtype=float).ravel()
    if labels.shape != probs.shape:
        raise ValueError('y and y_pred must have the same length')

    probs = np.clip(probs, eps, 1 - eps)
    per_observation = labels * np.log(probs) + (1 - labels) * np.log(1 - probs)

    return float(np.sum(per_observation))


In [None]:
# Rebind y from a one-column frame to a plain list of the 'buy' values, so it
# can be indexed by position 0..n-1.
# NOTE: this cell cannot be run twice in a row — once y is a list, y['buy'] fails.
y = list(y['buy'])

In [None]:
# Log-likelihood of the training outcomes under the fitted probabilities.
log_likelihood(y, predictions)

# Elasticity Simulation

In [None]:
# Work on a copy so the simulation never touches the cleaned training frame.
simulated_cable = cable.copy()

y = simulated_cable['buy']
# Explicit copy: columns of X are assigned to later in the notebook, and
# assigning into a frame that is still a selection of another frame triggers
# pandas' chained-assignment warning.
X = simulated_cable[[col for col in simulated_cable.columns if col != 'buy']].copy()

In [None]:
# Price points to simulate: the integers 8 through 15.
desired_values = range(8, 16, 1)
desired_values

In [None]:
MARKET_SIZE = 1000000
outcomes = {}  # price -> simulated aggregate demand

# Previous price / demand, reset on every run of this cell. The first
# iteration used to be detected by catching NameError, which only works on a
# fresh kernel: on a re-run the stale values from the last price of the
# previous run produced a bogus first elasticity.
last_n = None
last_value = None

for n in desired_values:

    # Score a copy of X with every price set to n. X itself keeps the observed
    # prices, which later cells still read through X['price'].
    scenario = X.assign(price=n)
    probas = Model.predict_proba(scenario)[:, 1]
    average_proba = sum(probas) / len(probas)
    aggregate_demand = MARKET_SIZE * average_proba

    if last_value is None:
        # Nothing to compare the first price point against.
        elasticity = 'n/a'
    else:
        change_in_demand_as_percent = (aggregate_demand - last_value) / last_value
        change_in_price_as_percent = (n - last_n) / last_n
        elasticity = change_in_demand_as_percent / change_in_price_as_percent

    outcomes[n] = aggregate_demand
    last_n = n
    last_value = aggregate_demand

    print('Price: ' + str(n) + ', aggregate demand: ' + str(aggregate_demand) + ', elasticity: ' + str(elasticity))  

In [None]:
def DemandElasticity(D1, D2, P1, P2):
    """Return the relative change in demand divided by the relative change in price.

    Both changes are measured from the first point: ``(D2 - D1) / D1`` for
    demand and ``(P2 - P1) / P1`` for price.
    """
    relative_demand_change = (D2 - D1) / D1
    relative_price_change = (P2 - P1) / P1
    return relative_demand_change / relative_price_change

In [None]:
# Spot check with hard-coded inputs.
# NOTE(review): the two demand figures are presumably the aggregate demands
# printed by the simulation at prices 8 and 9 — verify against that output.
DemandElasticity(353872, 338742, 8, 9)

In [None]:
import itertools

In [None]:
def getSimulationRange(x, dx):
    """Return an evenly spaced grid covering the range of ``x``.

    Accepts a series and a step: returns an ordered array that starts at the
    minimum of ``x`` and advances in steps of ``dx`` until the maximum is
    reached. When the span is not a whole number of steps, the final point is
    the first grid point past the maximum.

    Parameters
    ----------
    x : iterable of numbers
        Values whose minimum and maximum bound the grid; must be non-empty.
    dx : number
        Step between consecutive grid points; must be positive.

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If ``dx`` is not positive (or, from ``min``/``max``, if ``x`` is empty).
    """
    if dx <= 0:
        raise ValueError('dx must be positive')

    min_x = min(x)
    max_x = max(x)

    # np.arange with a fractional step derives its length from a floating-point
    # division, so it can gain or lose the last element. Count the steps
    # explicitly, rounding away representation noise before taking the ceiling.
    n_steps = int(np.ceil(np.round((max_x - min_x) / dx, 9)))

    r = min_x + dx * np.arange(n_steps + 1)

    return r

In [None]:
simulation = ['price']  # variables we have strong priors about
priors = np.array([])  # k x r?  this times a gradient should return a vector of booleans

# Columns we want to simulate over.
sims = X[list(simulation)]

# Every other column describes an input vector; keep each distinct vector once.
problem_space = (
    X[[col for col in X.columns if col not in sims.columns]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Peek at the first row of problem_space.
problem_space.iloc[0:1]

In [None]:
# Scratch exploration: only the first row of problem_space is visited, because
# the loop breaks at the end of its first pass.
for index, row in problem_space.iterrows():
    
    # cartesian product seems right
    row = np.array(row)
    # Iterator pairing this one row with every value in sims['price']; it is
    # built but not consumed here.
    m = itertools.product([row], sims['price'])
    # Model.predict_proba(m)
    tan = row  # keeps the first row's values available after the loop
    
    break
    
# maybe: [gradient_f(Model.predict_proba(M)) for M in itertools.product(?)]

In [None]:
# Grid of candidate prices over the range of sims['price'], in steps of 0.1.
price_sim = getSimulationRange(x=sims['price'], dx=.1)
price_sim

In [None]:
# Materialise the (index, row) pairs of problem_space.
ROWS = list(problem_space.iterrows())

In [None]:
# Pair the first input vector with the whole price grid (a single tuple).
I = list(itertools.product([ROWS[0][1]], [price_sim]))

In [None]:
simmy = pd.DataFrame({'price': I[0][1]})
# Explicit copy: a column is added to pspace below, and assigning into an iloc
# selection of problem_space triggers pandas' chained-assignment warning.
pspace = problem_space.iloc[0:1].copy()
# Constant helper key shared by both frames.
simmy['tmp'] = 1
pspace['tmp'] = 1

In [None]:
# Join the price grid to the single input vector. With no explicit key the
# merge uses the columns the two frames share — presumably only the constant
# 'tmp' helper, which pairs every price with that vector; confirm no other
# column name overlaps.
simmy.merge(pspace)

In [None]:
# Nested product demo: every (number, letter) pair combined with every word.
it = itertools.product(
    itertools.product([1, 2, 3], ['a', 'b', 'c']),
    ['bingo', 'boingo'],
)

In [None]:
# Product of two one-row arrays: iterating a 2-D array yields its rows, so the
# result is a single pair of rows.
Hey = list(itertools.product(np.array([[5, 6]]), np.array([[1, 2]])))

In [None]:
def CartesianProduct(x, y):
    """Return the coordinate grids of ``x`` against ``y`` stacked in one array.

    The body previously referred to the undefined names ``m1`` and ``m2``
    instead of its own parameters, so every call raised NameError.

    Parameters
    ----------
    x, y : 1-D array-like

    Returns
    -------
    numpy.ndarray, shape (2, len(y), len(x))
        ``product[0][i][j] == x[j]`` and ``product[1][i][j] == y[i]``, so each
        position (i, j) holds one (x, y) combination.
    """
    product = np.array(np.meshgrid(x, y))

    return product