In [45]:
import sys, os
import numpy as np
sys.path.append('./overlap-code')

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import patches
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, datasets
from sklearn.neighbors import KernelDensity
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, f1_score, balanced_accuracy_score, precision_recall_curve, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from overrule.overrule import OverRule2Stage, OverRule
from overrule.baselines import knn, marginal, propscore, svm
from overrule.support import SVMSupportEstimator, SupportEstimator
from overrule.overlap import SupportOverlapEstimator
from overrule.ruleset import BCSRulesetEstimator, RulesetEstimator


In [54]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Load the raw survey extract, then build the design matrix used downstream.
df = pd.read_csv('./data/fp_injectables_data.csv')

# Multiplying by 1 turns boolean columns into 0/1 integers (no-op for ints).
y = df['outcome'] * 1
a = df['treatment'] * 1
# NOTE(review): assumes 'outcome' and 'treatment' are the last two columns
# of the CSV — confirm against the data file.
X = df[df.columns[:-2]]
# axis='rows' applies the lambda to each column: impute by column median.
X = X.apply(lambda x: x.fillna(x.median()), axis='rows')

# Maps each variable name ('var_name') to an encoding code ('encoding').
encoding = pd.read_csv('./data/encoding.csv')

# Select and encode ordinal features (code 'O')
v = encoding[encoding['encoding'] == 'O']['var_name'].values
enc = OrdinalEncoder()
ord_data = enc.fit_transform(X[v])
ord_features = v

# Select the discrete features (code 'N'); used as-is, no encoding
v = encoding[encoding['encoding'] == 'N']['var_name'].values
dis_data = X[v].values
dis_features = v

# Select and one-hot encode nominal features (code 'L')
v = encoding[encoding['encoding'] == 'L']['var_name'].values
j = X[v].astype(int)
enc = OneHotEncoder(categories='auto')
nom_data = enc.fit_transform(j)
# `get_feature_names` was removed in scikit-learn 1.2; prefer the
# replacement when it exists and fall back on older installations.
if hasattr(enc, 'get_feature_names_out'):
    nom_features = enc.get_feature_names_out(v)
else:
    nom_features = enc.get_feature_names(v)


# Combine all the features. Column order is ordinal, nominal (one-hot),
# discrete — the name vector must follow the same order.
X_arr = np.concatenate((ord_data, nom_data.toarray(), dis_data), axis=1)
features_names = np.concatenate((ord_features, nom_features, dis_features))
assert X_arr.shape[1] == len(features_names), 'feature names do not match columns'

print(X_arr.shape)
X_df = pd.DataFrame(X_arr, columns=features_names)

(5649, 317)


In [55]:
# ---- Experiment configuration ----
SEED = 0

CNF = False
VERBOSE = True

# `_s` values are passed to the RS_s ruleset estimator and `_o` values to
# RS_o where those estimators are constructed.
ALPHA_s = ALPHA_o = 0.95
N_REF_MULT_s = 1
N_REF_MULT_o = 0

# Regularisation settings passed to the ruleset estimators as lambda0/lambda1.
LAMBDA0 = 0.9
LAMBDA1 = 1e-3

D = K = 20  # D: maximum extra rules per beam search iteration; K: maximum results returned during beam search
B = 300     # Width of beam search

w_eps = 1e-8

# Seed NumPy's global RNG.
np.random.seed(SEED)

In [49]:
# Errors encountered:
# If I specify the categorical columns and don't one-hot encode them, the fit method runs out of resources.
# SolverError: Solver 'ECOS' failed. Try another solver or solve with verbose=True for more information. Try recentering the problem data around 0 and rescaling to reduce the dynamic range.

In [60]:
# axis=1 samples *columns*: keep a random 30% of the features and every row.
# random_state pins the chosen columns so the subset does not depend on how
# much of the global NumPy RNG earlier cells have consumed.
X_df_sample = X_df.sample(frac=0.3, axis=1, random_state=SEED)
# Row labels of X_df_sample are used as positions into `a`; since no rows
# were dropped this selects every treatment value.
a_sample = a.iloc[X_df_sample.index]

In [61]:
# Display (n_rows, n_columns) of the column-sampled frame.
X_df_sample.shape

(5649, 95)

In [62]:
# Fit the two-stage OverRule model on the sampled feature subset.
f_cols = X_df_sample.columns

# Base learner: L2-regularised logistic regression, isotonic-calibrated
# with 5-fold CV.
base_estimator = LogisticRegression(solver='liblinear', max_iter=2000, C=0.005)
# The estimator is passed positionally: the keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` afterwards (the old keyword was
# removed in 1.4), while the first positional slot is the same in both.
learner = CalibratedClassifierCV(base_estimator, cv=5, method='isotonic')

# NOTE: `O` is rebound to an index array by a later cell in this notebook.
O = propscore.PropensityOverlapEstimator(estimator=learner)

# Two ruleset estimators sharing lambda0/lambda1/B/CNF; they differ in alpha
# and in the reference-sample multiplier (`_s` vs `_o` constants).
RS_s = BCSRulesetEstimator(n_ref_multiplier=N_REF_MULT_s, alpha=ALPHA_s, lambda0=LAMBDA0, lambda1=LAMBDA1, B=B, CNF=CNF)
RS_o = BCSRulesetEstimator(n_ref_multiplier=N_REF_MULT_o, alpha=ALPHA_o, lambda0=LAMBDA0, lambda1=LAMBDA1, B=B, CNF=CNF)

# presumably (overlap estimator, overlap rules, support rules) — verify
# against the OverRule2Stage signature.
M = OverRule2Stage(O, RS_o, RS_s)
M.fit(X_df_sample, a_sample)

rules = M.rules(as_str=True)

In [63]:
import time

# Compute every metric once so the console report and the log file are
# guaranteed to show the same numbers (and the model is not evaluated twice).
n_ref_samples = RS_s.refSamples.shape[0]
data_coverage = RS_s.predict(X_df_sample).mean()
requested_alpha = RS_s.M.alpha
ref_coverage = RS_s.predict(RS_s.refSamples).mean()
complexity = RS_s.complexity()
auc_vs_base = M.score_vs_base(X_df_sample)

print('Number of reference samples: {}'.format(n_ref_samples))
print('Coverage of data points: %.3f, Requested >= %.3f' % (data_coverage, requested_alpha))
print('Coverage of reference points: %.3f' % ref_coverage)
print('Rules: {}'.format(rules))
print('Complexity: {}'.format(complexity))
print('AUC between rules and base estimator: {}'.format(auc_vs_base))

# Append this run to the running log. The context manager closes the file
# even if a write fails; UTF-8 is forced because the rule strings contain
# non-ASCII logic symbols that some platform default encodings cannot write.
with open('metrics.txt', 'a', encoding='utf-8') as outfile:
    outfile.write('Time: {}\n'.format(time.time()))
    outfile.write('Params: N_REF_MULT_s {}, N_REF_MULT_o {}, ALPHA_s {}, ALPHA_o {}, LAMBDA0 {}, LAMBDA1 {}, B {}, CNF {}\n'.
                  format(N_REF_MULT_s, N_REF_MULT_o, ALPHA_s, ALPHA_o, LAMBDA0, LAMBDA1, B, CNF))
    outfile.write('Number of reference samples: {}\n'.format(n_ref_samples))
    outfile.write('Coverage of data points (TPR): %.3f, Requested >= %.3f\n' % (data_coverage, requested_alpha))
    outfile.write('Coverage of reference points: (FPR) %.3f\n' % ref_coverage)
    outfile.write('Rules: {}\n'.format(rules))
    outfile.write('Complexity: {}\n'.format(complexity))
    outfile.write('AUC between rules and base estimator: {}\n'.format(auc_vs_base))

Number of reference samples: 536655
Coverage of data points: 0.993, Requested >= 0.950
Coverage of reference points: 0.004
Rules: ('  (not v532_4 ∧ not v626_6 ∧ not v128_14 ∧ not v127_96 ∧ not v161_95 ∧ not v116_42 ∧ [v206 <= 2.000] ∧ not v312_7)', '  ([v138 <= 1.000])\n∨ ([v133 <= 10.000])\n∨ (v605_5)')
Complexity: (1, 8)
AUC between rules and base estimator: 0.7018562335617978


57

In [65]:
# Single-call OverRule estimator. Note the name differs from `RS_s` only by
# the case of the last letter.
RS_S = OverRule(
    alpha_s=0.1,
    alpha_r=0.05,
    n_ref_multiplier=1.,
    ruleset_kwargs={'lambda0': 0.9, 'lambda1': 0, 'B': 300},
)

In [67]:
# Positional indices of samples whose treatment value is positive (O),
# zero (N) or negative (U).
# NOTE: this rebinds `O`, which an earlier cell bound to the propensity
# overlap estimator.
treatment_values = np.asarray(a_sample)
O = np.nonzero(treatment_values > 0)[0]
N = np.nonzero(treatment_values == 0)[0]
U = np.nonzero(treatment_values < 0)[0]


# Group sizes.
nO, nN, nU = len(O), len(N), len(U)

In [70]:
# Display the counts of positive, zero and negative treatment values.
nO, nN, nU

(1408, 4241, 0)

In [66]:
# NOTE(review): in the recorded run this call raised a bare AssertionError
# (no message). The failing assertion is inside the overrule package and is
# not visible here — TODO: find the failing check and fix the inputs or
# constructor arguments before sharing this notebook.
RS_S.fit(X_df_sample, a_sample)



AssertionError: 