In [1]:
import json
import yaml
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### **Extract Data**

In [2]:
file_name = 'Titanic-Dataset.csv'

In [3]:
dataset = pd.read_csv(file_name)

In [4]:
dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
dataset.shape

(891, 12)

In [6]:
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
dataset.set_index('PassengerId', inplace=True)
dataset.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
dataset.drop(columns=['Name', 'Ticket', 'Cabin', 'Age'], inplace=True) #dropping unneeded variables (age because NaNs)

In [9]:
from sklearn.preprocessing import OrdinalEncoder

#Extract categorical columns from the dataframe
#Here we extract the columns with object datatype as they are the categorical columns
categorical_columns = dataset.select_dtypes(include=['object']).columns.tolist()

#Initialize OrdinalEncoder
ord_enc = OrdinalEncoder()

# Apply ordinal encoding to the categorical columns
ord_encoded = ord_enc.fit_transform(dataset[categorical_columns])

dataset[categorical_columns] = ord_encoded

In [10]:
dataset.dropna(inplace=True)

In [11]:
dataset.shape

(889, 7)

In [12]:
target_col = dataset.columns[0]
target_col

'Survived'

In [13]:
train_size = 0.8
train_points = int(train_size*len(dataset))
train_df = dataset.iloc[:train_points]
test_df = dataset.iloc[train_points:]

In [14]:
train_points = int(train_size*len(train_df)) #further split train dataset to train and calibration
cal_df = train_df.iloc[train_points:]
train_df = train_df.iloc[:train_points]

In [15]:
df_dict = {'train': train_df, 'calibration': cal_df, 'test': test_df}
for key, value in df_dict.items():
    print(f'{key} dataframe shape: {value.shape}')

train dataframe shape: (568, 7)
calibration dataframe shape: (143, 7)
test dataframe shape: (178, 7)


In [16]:
X_train = train_df.iloc[:, 1:]
y_train = train_df.loc[:, target_col]
X_cal = cal_df.iloc[:, 1:]
y_cal = cal_df.loc[:, target_col]
X_test = test_df.iloc[:, 1:]
y_test = test_df.loc[:, target_col]

In [17]:
X_train

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,1.0,1,0,7.2500,2.0
2,1,0.0,1,0,71.2833,0.0
3,3,0.0,0,0,7.9250,2.0
4,1,0.0,1,0,53.1000,2.0
5,3,1.0,0,0,8.0500,2.0
...,...,...,...,...,...,...
565,3,0.0,0,0,8.0500,2.0
566,3,1.0,2,0,24.1500,2.0
567,3,1.0,0,0,7.8958,2.0
568,3,0.0,0,4,21.0750,2.0


### **Base model fitting (prior to fit mauq)**

In [18]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [19]:
y_train

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
565    0
566    0
567    0
568    0
569    0
Name: Survived, Length: 568, dtype: int64

In [20]:
clf.score(X_test, y_test)

0.8146067415730337

In [21]:
y_pred_cal_smx = clf.predict_proba(X_cal)
y_pred_test_smx = clf.predict_proba(X_test)
y_pred_cal_label = clf.predict(X_cal)
y_pred_test_label = clf.predict(X_test)
y_pred_cal_smx

array([[0.88575428, 0.11424572],
       [0.7909665 , 0.2090335 ],
       [0.18871182, 0.81128818],
       [0.65208283, 0.34791717],
       [0.3195544 , 0.6804456 ],
       [0.88577638, 0.11422362],
       [0.88650242, 0.11349758],
       [0.21908685, 0.78091315],
       [0.15466889, 0.84533111],
       [0.33123361, 0.66876639],
       [0.88576227, 0.11423773],
       [0.24582578, 0.75417422],
       [0.10437475, 0.89562525],
       [0.79381123, 0.20618877],
       [0.56095555, 0.43904445],
       [0.8389658 , 0.1610342 ],
       [0.10481806, 0.89518194],
       [0.79179535, 0.20820465],
       [0.60340291, 0.39659709],
       [0.88577638, 0.11422362],
       [0.88577638, 0.11422362],
       [0.88567193, 0.11432807],
       [0.11185012, 0.88814988],
       [0.88568605, 0.11431395],
       [0.27215734, 0.72784266],
       [0.83104129, 0.16895871],
       [0.90000659, 0.09999341],
       [0.22292823, 0.77707177],
       [0.88486457, 0.11513543],
       [0.83874147, 0.16125853],
       [0.

In [22]:
#data input calibration set
input = pd.concat([X_cal, pd.DataFrame(y_cal)], axis=1)
input['y_pred_prob'] = y_pred_cal_smx.tolist()
# input['y_pred_label'] = y_pred_cal_label
input

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Survived,y_pred_prob
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
570,3,1.0,0,0,7.8542,2.0,1,"[0.8857542796216131, 0.11424572037838686]"
571,2,1.0,0,0,10.5000,2.0,1,"[0.7909664966973815, 0.2090335033026185]"
572,1,0.0,2,0,51.4792,2.0,1,"[0.18871181522808167, 0.8112881847719183]"
573,1,1.0,0,0,26.3875,2.0,1,"[0.652082834229065, 0.34791716577093507]"
574,3,0.0,0,0,7.7500,1.0,1,"[0.31955440192257967, 0.6804455980774203]"
...,...,...,...,...,...,...,...,...
708,1,1.0,0,0,26.2875,2.0,1,"[0.652057523057296, 0.3479424769427039]"
709,1,0.0,0,0,151.5500,2.0,1,"[0.13744183874626748, 0.8625581612537325]"
710,3,1.0,1,1,15.2458,0.0,1,"[0.856788858593684, 0.14321114140631594]"
711,1,0.0,0,0,49.5042,0.0,1,"[0.08714202439502572, 0.9128579756049743]"


In [23]:
#data input test set (optional)
input_test = pd.concat([X_test, pd.DataFrame(y_test)], axis=1)
input_test['y_pred_prob'] = y_pred_test_smx.tolist()
# input_test['y_pred_label'] = y_pred_test_label.tolist() 
input_test

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Survived,y_pred_prob
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
713,1,1.0,1,0,52.0000,2.0,1,"[0.7113096444622864, 0.2886903555377136]"
714,3,1.0,0,0,9.4833,2.0,0,"[0.8859380699703069, 0.11406193002969314]"
715,2,1.0,0,0,13.0000,2.0,0,"[0.7914272703832793, 0.20857272961672066]"
716,3,1.0,0,0,7.6500,2.0,0,"[0.8857312241844413, 0.11426877581555873]"
717,1,0.0,0,0,227.5250,0.0,1,"[0.10429077383409846, 0.8957092261659015]"
...,...,...,...,...,...,...,...,...
887,2,1.0,0,0,13.0000,2.0,0,"[0.7914272703832793, 0.20857272961672066]"
888,1,0.0,0,0,30.0000,2.0,1,"[0.12214115086618038, 0.8778588491338196]"
889,3,0.0,1,2,23.4500,2.0,0,"[0.37239141711679713, 0.6276085828832029]"
890,1,1.0,0,0,30.0000,0.0,1,"[0.558171651813739, 0.4418283481862611]"


In [24]:
#data input train set (optional)
#train_df

In [25]:
y_pred_cal_smx[np.arange(143), input[target_col].values]

array([0.11424572, 0.2090335 , 0.81128818, 0.34791717, 0.6804456 ,
       0.88577638, 0.88650242, 0.78091315, 0.84533111, 0.33123361,
       0.11423773, 0.75417422, 0.89562525, 0.79381123, 0.56095555,
       0.8389658 , 0.89518194, 0.79179535, 0.39659709, 0.88577638,
       0.88577638, 0.88567193, 0.88814988, 0.88568605, 0.27215734,
       0.83104129, 0.90000659, 0.77707177, 0.88486457, 0.83874147,
       0.37548425, 0.70669285, 0.88575898, 0.65612463, 0.88577638,
       0.44277778, 0.90901321, 0.88575898, 0.34687699, 0.83486635,
       0.86230502, 0.29837091, 0.88566346, 0.62297681, 0.8639774 ,
       0.88577638, 0.76779369, 0.8990234 , 0.4249985 , 0.7039102 ,
       0.7909665 , 0.87010985, 0.28856341, 0.14314327, 0.88575428,
       0.8866819 , 0.65358308, 0.75649875, 0.872005  , 0.88575898,
       0.86397521, 0.34700338, 0.88566393, 0.44169078, 0.64537441,
       0.49323452, 0.78091315, 0.88576227, 0.81447618, 0.2514952 ,
       0.90906395, 0.88575428, 0.91108505, 0.49323452, 0.10886

In [26]:
input['y_pred_prob'].values[0][1]
# [input.index, input[target_col].values]

0.11424572037838686

In [27]:
# [x[y] for x in input['y_pred_prob'].values for y in input[target_col].values]
cal_scores = 1 - np.array([input['y_pred_prob'].values[x][input[target_col].values[x]] for x in np.arange(input.shape[0])])
cal_scores

array([0.88575428, 0.7909665 , 0.18871182, 0.65208283, 0.3195544 ,
       0.11422362, 0.11349758, 0.21908685, 0.15466889, 0.66876639,
       0.88576227, 0.24582578, 0.10437475, 0.20618877, 0.43904445,
       0.1610342 , 0.10481806, 0.20820465, 0.60340291, 0.11422362,
       0.11422362, 0.11432807, 0.11185012, 0.11431395, 0.72784266,
       0.16895871, 0.09999341, 0.22292823, 0.11513543, 0.16125853,
       0.62451575, 0.29330715, 0.11424102, 0.34387537, 0.11422362,
       0.55722222, 0.09098679, 0.11424102, 0.65312301, 0.16513365,
       0.13769498, 0.70162909, 0.11433654, 0.37702319, 0.1360226 ,
       0.11422362, 0.23220631, 0.1009766 , 0.5750015 , 0.2960898 ,
       0.2090335 , 0.12989015, 0.71143659, 0.85685673, 0.11424572,
       0.1133181 , 0.34641692, 0.24350125, 0.127995  , 0.11424102,
       0.13602479, 0.65299662, 0.11433607, 0.55830922, 0.35462559,
       0.50676548, 0.21908685, 0.11423773, 0.18552382, 0.7485048 ,
       0.09093605, 0.11424572, 0.08891495, 0.50676548, 0.89113

In [28]:
q_hat = np.percentile(cal_scores, q=90)
q_hat

0.7122503362764854

In [29]:
values = input_test['y_pred_prob'].iloc[0]

In [30]:
bools = input_test['y_pred_prob'].iloc[0]>=(1-q_hat)

In [31]:
for i in zip(values, bools):
    print(i)

(0.7113096444622864, True)
(0.2886903555377136, True)


In [32]:
result = [value for value, boolean in zip(values, bools) if boolean]
result

[0.7113096444622864, 0.2886903555377136]

In [33]:
for i, j in input_test['y_pred_prob'].items():
    print(input_test['y_pred_prob'][i])

[0.7113096444622864, 0.2886903555377136]
[0.8859380699703069, 0.11406193002969314]
[0.7914272703832793, 0.20857272961672066]
[0.8857312241844413, 0.11426877581555873]
[0.10429077383409846, 0.8957092261659015]
[0.21861003751401653, 0.7813899624859835]
[0.8649903185616774, 0.1350096814383226]
[0.8857453379350458, 0.11425466206495417]
[0.20381401660522147, 0.7961859833947785]
[0.9082262321173683, 0.09177376788263165]
[0.7914272703832793, 0.20857272961672066]
[0.7914272703832793, 0.20857272961672066]
[0.7115615845372569, 0.2884384154627431]
[0.8858455019090667, 0.11415449809093331]
[0.3711804411959213, 0.6288195588040787]
[0.3195513696140212, 0.6804486303859788]
[0.8310412858838117, 0.1689587141161883]
[0.42277124277011124, 0.5772287572298888]
[0.145542318766674, 0.854457681233326]
[0.8404785888114745, 0.1595214111885255]
[0.7890230735706586, 0.21097692642934138]
[0.7914272703832793, 0.20857272961672066]
[0.7914272703832793, 0.20857272961672066]
[0.8866819009986863, 0.11331809900131373]
[0

### **MAUQ Quantify Uncertainty API**

In [34]:
with open('url.yaml', 'r') as file:
    url_dict = yaml.safe_load(file)

In [35]:
user_value = 0.9
output_type = 'data'  # 'data', 'estimate'

In [36]:
api_json = {
    'data': input.values.tolist(),
    'test': input_test.values.tolist(),
    'problem_type': 'classification', 
    'confidence_level': user_value,
    'output_type': output_type
}

In [37]:
# URL to our MAUQ AWS service
protocol = url_dict['protocol']  # protocol not sybil_protocol
host = url_dict['host']
port = url_dict['port']
endpoint = 'quantify-uncertainty'

url = '%s://%s:%s/%s' % (protocol, host, str(port), endpoint)

In [38]:
%%time
response = requests.post(url, json=api_json)
print(response)
print()

<Response [200]>

CPU times: user 38.3 ms, sys: 3.21 ms, total: 41.5 ms
Wall time: 224 ms


In [39]:
uncertainty_json_out = response.json()
uncertainty_cols = list(input_test.columns) + ['prediction_set']
uncertainty_df = pd.DataFrame(uncertainty_json_out['output'], columns=uncertainty_cols)

In [40]:
uncertainty_df #TODO: display pred_set in original labels format, get rid of y_pred_prob

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Survived,y_pred_prob,prediction_set
0,1,1.0,1,0,52.0000,2.0,1,"[0.7113096444622864, 0.2886903555377136]","[0, 1]"
1,3,1.0,0,0,9.4833,2.0,0,"[0.8859380699703069, 0.11406193002969314]",[0]
2,2,1.0,0,0,13.0000,2.0,0,"[0.7914272703832793, 0.20857272961672066]",[0]
3,3,1.0,0,0,7.6500,2.0,0,"[0.8857312241844413, 0.11426877581555873]",[0]
4,1,0.0,0,0,227.5250,0.0,1,"[0.10429077383409846, 0.8957092261659015]",[1]
...,...,...,...,...,...,...,...,...,...
173,2,1.0,0,0,13.0000,2.0,0,"[0.7914272703832793, 0.20857272961672066]",[0]
174,1,0.0,0,0,30.0000,2.0,1,"[0.12214115086618038, 0.8778588491338196]",[1]
175,3,0.0,1,2,23.4500,2.0,0,"[0.37239141711679713, 0.6276085828832029]","[0, 1]"
176,1,1.0,0,0,30.0000,0.0,1,"[0.558171651813739, 0.4418283481862611]","[0, 1]"


In [41]:
uncertainty_df['prediction_set'].value_counts()

[0]       103
[0, 1]     38
[1]        37
Name: prediction_set, dtype: int64