# 'Applied' Exercise

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
cable = pd.read_csv('./Data/training.csv', na_values=(-999, 6)) # value = 6 corresponds to refusal to answer, 6 nowhere else in data

# Adjust the Feature Set

In [4]:
def CleanCableData(df):
    
    # omitting irrelevant/redundant columns and singular dummies (age=1, class=poor(d and e))
    
    drop = ['YES', 'ID', 'age', 'class', 'tele_have', 'd', 'de'] 
    df['value'] = [(i - 3) for i in df['value']] # Normalize (-2 to +2)
    df = df[[col for col in df.columns if col not in drop]]
    df = df.dropna()
    
    return df

In [5]:
cable = CleanCableData(cable)

In [6]:
y = pd.DataFrame(cable['buy'])
X = cable[[col for col in cable.columns if col != 'buy']]

feature_names = X.columns
X = np.array(X)

# cable['constant'] = [1 for i in range(len(cable))]

In [7]:
# add a constant column

X_c = np.c_[np.ones(X.shape[0]), X]
feature_names = feature_names.insert(0, 'constant')

# Model Estimation & Display

In [8]:
# Extend the sklearn LogisticRegression class to return Standard Errors and T-statistics

class ExtLogisticRegression(LogisticRegression):
    
    def __init__(self):
        
        LogisticRegression.__init__(self)
            
    def tstat(self, estimate, se): return estimate / se
    
    def getStats(self, X, feature_names): # passing X again after fit() is ugly, but unavoidable without serious class amendment.
        
        '''Courtesy of: 
        # https://stats.stackexchange.com/questions/89484/how-to-compute-the-standard-errors-of-a-logistic-regressions-coefficients
        # The covariance matrix as given by: (X'VX)^-1

        # X: (n x k), X': (k x n)
        # V: (n x n)

        # -> X'VX: (k x k)'''

        n = len(X)
        predictions = np.matrix(self.predict_proba(X))

        # Initiate matrix of 0's, fill diagonal with each predicted observation's variance
        V = np.matrix(np.zeros(shape = (n, n)))

        p_no = predictions[:,0] # array of all p(no buy)
        p_yes = (predictions[:,1]).A1 # flattened array of all p(buy)

        np.fill_diagonal(V, np.multiply(p_no, p_yes)) # n X n 

        # Covariance matrix
        cov = np.linalg.inv(X.T * V * X)

        # Standard errors
        se = np.sqrt(np.diag(cov))
        
        # Check with E-Views output -> ~ok
        
        tstats = pd.Series(map(self.tstat, self.coef_,se))[0]
        
        summary = pd.DataFrame({'Coefficients': list(self.coef_[0]), 'SE': list(se), 't-stat': list(tstats)})
        summary = summary.set_index(feature_names)
        
        self.summary = summary

In [9]:
Model = ExtLogisticRegression()

In [10]:
Model.fit(X_c, y)

  y = column_or_1d(y, warn=True)


ExtLogisticRegression()

In [11]:
Model.getStats(X_c, feature_names)

In [12]:
# Please see  for an Eviews comparison; though there may exist some minute differences in
# calculation (esp. for the constant term), we think our estimates are sufficiently to Eviews close to justify moving forward.

Model.summary

Unnamed: 0,Coefficients,SE,t-stat
constant,0.357277,0.300927,1.187253
age2,0.271506,0.113775,0.902231
age3,0.335084,0.115239,1.113506
age4,0.34248,0.114885,1.138083
age5,0.005747,0.132889,0.019098
age6,0.006027,0.129059,0.020028
ab,-2.624517,0.11474,-8.721436
c1,-2.19501,0.082904,-7.294155
c2,-0.828344,0.057754,-2.752639
children,0.276937,0.063324,0.920279


# Applied-Like Feature Selection

# Model Comments:

In [None]:
# Comments...

In [None]:
# Simulating the model across the training set

predictions = Model.predict_proba(X)
predictions = [p[1] for p in predictions]

#X['predictions'] = list(predictions)

sum(predictions) / len(predictions), y.mean()[0] # predicted avg p, actual avg p for training set

In [None]:
# Showing a plot of the predictions in no particular order -> observe higher denisty in the 'no buy' space

fig = plt.figure(figsize=(40, 30))
ax1 = fig.add_subplot(111)

ax1.scatter(list(X.index), list(y["buy"]))
ax1.scatter(list(X.index), list(X['predictions']))

plt.show()

In [None]:
cable_holdout = pd.read_csv('./Data/holdout.csv', na_values=(-999, 6))

In [None]:
cable_holdout = CleanCableData(cable_holdout)
cable_holdout['constant'] = [1 for i in range(len(cable_holdout))]

In [None]:
y_2 = pd.DataFrame(cable_holdout['buy'])
X_2 = cable_holdout[[col for col in cable_holdout.columns if col != 'buy']]

In [None]:
predictions_2 = [p[1] for p in Model.predict_proba(X_2)]

sum(predictions_2) / len(predictions_2), y_2.mean()[0] # predicted avg p, actual avg p for holdout

# Elasticity Simulation

In [None]:
simulated_cable = cable.copy()

y = simulated_cable['buy']
X = simulated_cable[[col for col in simulated_cable.columns if col != 'buy']]

In [None]:
desired_values = range(8, 16)
desired_values

In [None]:
MARKET_SIZE = 1000000
outcomes = {}
for n in desired_values:
    
    X['price'] = [n for itme in X['price']]
    probas = [p[1] for p in Model.predict_proba(X)]
    average_proba = sum(probas) / len(probas)
    aggregate_demand = MARKET_SIZE * average_proba
    
    try:
        
        change_in_demand_as_percent = (aggregate_demand - last_value) / last_value
        change_in_price_as_percent = (n - last_n) / last_n
        elasticity = change_in_demand_as_percent / change_in_price_as_percent
        
    except(NameError):
        
        elasticity = 'n/a'
    
    last_n = n
    last_value = aggregate_demand
    
    print('Price: ' + str(n) + ', aggregate demand: ' + str(aggregate_demand) + ', elasticity: ' + str(elasticity))  

In [None]:
def DemandElasticity(D1, D2, P1, P2):
    
    return ((D2 - D1) / D1) / ((P2 - P1) / P1)

In [None]:
def getSimulationRange(x, dx):
    
    # Accepts a series and a step: returns an ordered list ranging from the minimum to the maximum of the list
    # in the series, seperated by steps
    
    min_x = min(x)
    max_x = max(x)
    
    r = np.arange(min_x, max_x + dx, dx)
    
    return r