In [1]:
from profk import Dataset

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import itertools

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
normal = {'value': lambda i: int(i) -3}

In [4]:
cable = pd.read_csv('./Data/training.csv', na_values=(-999, 6), converters=normal)

In [5]:
Cable = Dataset(cable)

In [6]:
features_2 = ['c', 'age2', 'age3', 'age4', 'age5', 'age6', 'ab', 'c1', 'c2', 'children', 'council', 'no_dish',
       'n_videos', 'price', 'price_mc', 'rent', 'sat_have', 'tv-satis', 'value', 'vcr_have', 'bbc1', 'bbc2', 'itv', 'ch4']

target_name = ['buy']

In [7]:
Model = LogisticRegression()
X, y = Cable.getXy(features_2, target_name)
Model.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
def getSimulationRange(x, dx):
    
    # Accepts a series and a step: returns an ordered list ranging from the minimum to the maximum of the list
    # in the series, seperated by steps
    
    min_x = min(x)
    max_x = max(x)
    
    r = np.arange(min_x, max_x + dx, dx)
    
    return r

In [9]:
def GenerateInputs(row, simulations):

    X = [np.insert(row, 16, i) for i in simulations]
    
    return X

In [10]:
X = Cable.data[features_2]

# For a single prior:

In [11]:
# Get simulated dataset as a method of Dataset?

prior = ['price'] # variable we have a strong prior about

# prior = np.array([]) # k x r?  this times a gradient should return a vector of booleans (True=prior match, False=violation)

simulation_space = X[prior] # extract features we wish to simulate

problem_space = X[[col for col in X.columns if col not in prior]] # all input vectors in sample (we don't have priors about)
problem_space = problem_space.dropna()
problem_space = problem_space.drop_duplicates().reset_index(drop=True) # unique input vectors in sample

In [12]:
sim_price = getSimulationRange(cable['price'], .1)

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Allow simulated price to run over every input vector in our dataset

price_df = pd.DataFrame({'price': sim_price})
price_df['key'] = price_df.index
partials = []

for row in problem_space.iterrows():
    
    predictions = []
    
    for price in sim_price:
        
        x = np.array(row[1])
        x = np.insert(x, 13, price)
        predictions.append(Model.predict_proba(x)[0][1])
        
    dy_dx = [(y1-y0) for y1, y0 in zip(predictions[1:], predictions[:-1])]
    partials.append(dy_dx)
    
# This is very ugly and inefficient; there must be a way to create the cartesian product of dataframes without crafting keys 
# each time; maybe do all this in numpy under the mask of the Dataset class.  Also, this should be handled by generators;
# no need to hold everything in memory.

In [15]:
violations = [any(n > 0 for n in partials[0]) for n in partials]

In [16]:
problem_space['violations'] = violations

In [17]:
problem_space[problem_space['violations'] == True] # Everywhere dp(buy)/d(price) was observed positive

# -> nowhere

Unnamed: 0,c,age2,age3,age4,age5,age6,ab,c1,c2,children,...,rent,sat_have,tv-satis,value,vcr_have,bbc1,bbc2,itv,ch4,violations


In [18]:
# To explain estimate (not necessarily wrong) instance, find the x vector in the problem space that matches value you wish 
# to predict over and see if you had a prior violation in the relevant domain

# For multiple priors:

In [19]:
# Get simulated dataset as a method of dataset?

prior_set = ['price', 'price_mc'] # variables we have strong priors about

# priors = np.array([]) # k x r?  this times a gradient should return a vector of booleans (True=prior match, False=violation)

simulation_space = X[prior_set] # extract features we wish to simulate

problem_space = X[[col for col in X.columns if col not in prior_set]] # all input vectors in sample (we don't have priors about)
problem_space = problem_space.drop_duplicates().reset_index(drop=True) # unique input vectors in sample

# I think this simulation has factorial time level of complexity; this isn't a big deal I don't think.  If we have a large number
# of critical priors, why don't we just make a structural model?

In [20]:
simulations = [getSimulationRange(simulation_space[p], .1) for p in prior_set]

In [21]:
sim_price = getSimulationRange(cable['price'], .1)
sim_price_mc = getSimulationRange(cable['price_mc'], .1)
sim_value = getSimulationRange(cable['value'], .25)

In [22]:
k = np.array([i for i in itertools.product(sim_price, sim_price_mc)]) # Adjust dx if this is taking too long
k = pd.DataFrame({prior: k[:, p] for p, prior in enumerate(prior_set)})

In [23]:
len(sim_price) * len(sim_price_mc), len(k) # number of combinations is correct

# so our final serachable space will be len(problem space) x len(k) rows long (this will be computationally implausible
# for feature sets with continuous variables we don't have priors about)

(2911, 2911)

In [24]:
# There are len(problem space) feature combinations  we need to run our combinations over.  The searchable space should be
# len(k) X len(problem_space) long

In [25]:
search = [i for i in itertools.product(np.array(problem_space), np.array(k))]

In [26]:
len(problem_space) * len(k) ,len(search) # ok

(2747984, 2747984)

In [27]:
search[0]

(array([ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0., nan,  2.,  1.,
        nan,  0., -1., -2.,  0.,  1.,  0.,  0.,  0.]), array([8., 8.]))

In [28]:
# we can only find d(prediction)/d(prior_feature) once we specify a *single* variable to check the space of.

In [29]:
# Couldn't finish in time; some ugly combinatorial math to get through