In [1]:
import json
import yaml
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### **Extract Data**

In [2]:
# Path of the CSV file to load (relative paths resolve against the working directory).
file_name = 'drug200.csv'

In [3]:
# Read the CSV named by `file_name` into the DataFrame used by the rest of the notebook.
dataset = pd.read_csv(file_name)

In [4]:
# Count missing values per column.
dataset.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [5]:
# (rows, columns) of the loaded frame.
dataset.shape

(200, 6)

In [6]:
# Column labels of the loaded frame.
dataset.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [7]:
# Preview the first 20 rows of the raw data.
dataset.head(20)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
5,22,F,NORMAL,HIGH,8.607,drugX
6,49,F,NORMAL,HIGH,16.275,DrugY
7,41,M,LOW,HIGH,11.037,drugC
8,60,M,NORMAL,HIGH,15.171,DrugY
9,43,M,LOW,NORMAL,19.368,DrugY


In [8]:
# Number of rows per distinct value of the 'Drug' column.
dataset['Drug'].value_counts()

DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: Drug, dtype: int64

In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Columns still stored with `object` dtype are treated as the categorical ones.
# Because `dataset` is encoded in place below, those columns are numeric after
# the first run and this list comes back empty on a re-run. The guard keeps a
# re-run from (a) fitting an encoder on zero columns, which raises, and
# (b) overwriting `categorical_columns` / `ord_enc` with empty/unfitted values.
_object_columns = dataset.select_dtypes(include=['object']).columns.tolist()

if _object_columns:
    categorical_columns = _object_columns

    # A single encoder handles every categorical column; each column's
    # categories are replaced by float codes (0.0, 1.0, ...).
    ord_enc = OrdinalEncoder()
    ord_encoded = ord_enc.fit_transform(dataset[categorical_columns])

    # NOTE: this overwrites the string values in `dataset` itself; the fitted
    # `ord_enc` is what remembers the original category names.
    dataset[categorical_columns] = ord_encoded

In [10]:
# Same preview after encoding: the categorical columns now hold numeric codes.
dataset.head(20)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0.0,0.0,0.0,25.355,0.0
1,47,1.0,1.0,0.0,13.093,3.0
2,47,1.0,1.0,0.0,10.114,3.0
3,28,0.0,2.0,0.0,7.798,4.0
4,61,0.0,1.0,0.0,18.043,0.0
5,22,0.0,2.0,0.0,8.607,4.0
6,49,0.0,2.0,0.0,16.275,0.0
7,41,1.0,1.0,0.0,11.037,3.0
8,60,1.0,2.0,0.0,15.171,0.0
9,43,1.0,1.0,1.0,19.368,0.0


In [11]:
# Pairwise correlation between the (now numeric) columns.
dataset.corr()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
Age,1.0,0.102027,0.054212,-0.068234,-0.063119,0.041856
Sex,0.102027,1.0,-0.007814,-0.008811,-0.125008,0.018239
BP,0.054212,-0.007814,1.0,-0.137552,-0.149312,0.419397
Cholesterol,-0.068234,-0.008811,-0.137552,1.0,0.01,0.048415
Na_to_K,-0.063119,-0.125008,-0.149312,0.01,1.0,-0.689051
Drug,0.041856,0.018239,0.419397,0.048415,-0.689051,1.0


In [12]:
# The last column of the frame is used as the prediction target.
target_col = dataset.columns[-1]
target_col

'Drug'

In [13]:
# Positional split in file order (no shuffling): the leading `train_size`
# fraction of rows goes to training, the remainder to the test set.
train_size = 0.8
train_points = int(len(dataset) * train_size)
train_df, test_df = dataset.iloc[:train_points], dataset.iloc[train_points:]

In [14]:
# Further split the non-test rows into a training part and a calibration part.
#
# The non-test portion is rebuilt from `dataset` and the untouched `test_df`
# instead of slicing `train_df` and then overwriting it: slicing `train_df`
# in place would shrink the training set a little more on every re-run of
# this cell. Written this way, the cell gives the same frames however often
# it is executed.
n_trainval = len(dataset) - len(test_df)  # the test rows are the trailing block
trainval_df = dataset.iloc[:n_trainval]

train_points = int(train_size * len(trainval_df))  # rows kept for fitting the model
cal_df = trainval_df.iloc[train_points:]
train_df = trainval_df.iloc[:train_points]

In [15]:
# Report the size of each split.
df_dict = dict(train=train_df, calibration=cal_df, test=test_df)
for split_name, split_df in df_dict.items():
    print(f'{split_name} dataframe shape: {split_df.shape}')

train dataframe shape: (128, 6)
calibration dataframe shape: (32, 6)
test dataframe shape: (40, 6)


In [16]:
def _split_xy(frame):
    """Return (features, target): every column but the last, and the `target_col` column."""
    return frame.iloc[:, :-1], frame.loc[:, target_col]


X_train, y_train = _split_xy(train_df)
X_cal, y_cal = _split_xy(cal_df)
X_test, y_test = _split_xy(test_df)

### **Base model fitting (prior to fitting MAUQ)**

In [17]:
from sklearn.ensemble import ExtraTreesClassifier

# Base classifier: 10 trees, with a fixed random_state so repeated fits match.
# y_train holds the float codes produced by the encoding step; they are used
# directly as class labels.
clf = ExtraTreesClassifier(n_estimators=10, random_state=42)
clf.fit(X_train, y_train)

In [18]:
# Inspect the training labels (encoded as float codes).
y_train

0      0.0
1      3.0
2      3.0
3      4.0
4      0.0
      ... 
123    0.0
124    2.0
125    0.0
126    0.0
127    4.0
Name: Drug, Length: 128, dtype: float64

In [19]:
# Score of the base classifier on the held-out test split.
clf.score(X_test, y_test)

0.9

In [20]:
# Per-class probabilities and hard labels for the calibration and test splits.
y_pred_cal_smx, y_pred_test_smx = (clf.predict_proba(X) for X in (X_cal, X_test))
y_pred_cal_label, y_pred_test_label = (clf.predict(X) for X in (X_cal, X_test))
y_pred_cal_smx

array([[1. , 0. , 0. , 0. , 0. ],
       [0.1, 0. , 0. , 0. , 0.9],
       [1. , 0. , 0. , 0. , 0. ],
       [0.9, 0. , 0. , 0. , 0.1],
       [0.1, 0. , 0. , 0. , 0.9],
       [1. , 0. , 0. , 0. , 0. ],
       [0.8, 0.2, 0. , 0. , 0. ],
       [0. , 0. , 0.1, 0. , 0.9],
       [0.3, 0.2, 0.4, 0.1, 0. ],
       [0. , 1. , 0. , 0. , 0. ],
       [0. , 0.2, 0.7, 0. , 0.1],
       [0.2, 0. , 0. , 0. , 0.8],
       [0. , 0.7, 0.3, 0. , 0. ],
       [0.8, 0. , 0. , 0. , 0.2],
       [0. , 0.3, 0.6, 0. , 0.1],
       [0.7, 0. , 0.3, 0. , 0. ],
       [0.1, 0.6, 0.2, 0.1, 0. ],
       [0.1, 0. , 0. , 0. , 0.9],
       [0.2, 0. , 0.1, 0.1, 0.6],
       [0.2, 0.4, 0.4, 0. , 0. ],
       [0. , 0. , 0. , 0.2, 0.8],
       [0. , 0. , 0. , 0.9, 0.1],
       [0. , 0.6, 0.3, 0. , 0.1],
       [0.2, 0. , 0.7, 0.1, 0. ],
       [0. , 0. , 0. , 0. , 1. ],
       [0.2, 0. , 0. , 0. , 0.8],
       [0.3, 0. , 0. , 0. , 0.7],
       [0. , 0. , 0. , 1. , 0. ],
       [0. , 0.7, 0.2, 0. , 0.1],
       [0.9, 0

In [21]:
#data input calibration set
# Calibration payload: features, true label, and the list of per-class
# probabilities predicted for each row.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept because a later cell reads the frame under it.
input = pd.concat([X_cal, y_cal], axis=1).assign(y_pred_prob=y_pred_cal_smx.tolist())
# input['y_pred_label'] = y_pred_cal_label
input

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,y_pred_prob
128,47,1.0,1.0,1.0,33.542,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]"
129,32,0.0,2.0,0.0,7.477,4.0,"[0.1, 0.0, 0.0, 0.0, 0.9]"
130,70,0.0,2.0,0.0,20.489,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]"
131,52,1.0,1.0,1.0,32.922,0.0,"[0.9, 0.0, 0.0, 0.0, 0.1]"
132,49,1.0,1.0,1.0,13.598,4.0,"[0.1, 0.0, 0.0, 0.0, 0.9]"
133,24,1.0,2.0,0.0,25.786,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]"
134,42,0.0,0.0,0.0,21.036,0.0,"[0.8, 0.2, 0.0, 0.0, 0.0]"
135,74,1.0,1.0,1.0,11.939,4.0,"[0.0, 0.0, 0.1, 0.0, 0.9]"
136,55,0.0,0.0,0.0,10.977,2.0,"[0.3, 0.2, 0.4, 0.1, 0.0]"
137,35,0.0,0.0,0.0,12.894,1.0,"[0.0, 1.0, 0.0, 0.0, 0.0]"


In [22]:
#data input test set (optional)
# Test payload (optional): features, true label, and the list of per-class
# probabilities predicted for each row.
input_test = pd.concat([X_test, y_test], axis=1).assign(y_pred_prob=y_pred_test_smx.tolist())
# input_test['y_pred_label'] = y_pred_test_label.tolist()
input_test

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,y_pred_prob
160,30,0.0,2.0,0.0,10.443,4.0,"[0.2, 0.0, 0.0, 0.1, 0.7]"
161,57,0.0,0.0,1.0,9.945,2.0,"[0.0, 0.2, 0.8, 0.0, 0.0]"
162,43,1.0,2.0,1.0,12.859,4.0,"[0.4, 0.0, 0.0, 0.0, 0.6]"
163,21,0.0,0.0,1.0,28.632,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]"
164,16,1.0,0.0,1.0,19.007,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]"
165,38,1.0,1.0,0.0,18.295,0.0,"[0.8, 0.0, 0.0, 0.1, 0.1]"
166,58,0.0,1.0,0.0,26.645,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]"
167,57,0.0,2.0,0.0,14.216,4.0,"[0.5, 0.0, 0.0, 0.0, 0.5]"
168,51,0.0,1.0,1.0,23.003,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]"
169,20,0.0,0.0,0.0,11.262,1.0,"[0.0, 0.9, 0.0, 0.0, 0.1]"


In [23]:
#data input train set (optional)
#train_df

### **MAUQ Quantify Uncertainty API**

In [24]:
# Service connection settings are read from url.yaml rather than written into the notebook.
with open('url.yaml', 'r') as url_file:
    url_dict = yaml.safe_load(url_file)

In [25]:
# Sent to the service as 'confidence_level'.
user_value = 0.9
# Sent to the service as 'output_type'; the options noted here are 'data' and 'estimate'.
output_type = 'data'  # 'data', 'estimate'

In [26]:
# Request body: calibration rows, test rows, and the run settings.
# Each frame is sent as a list of row lists (column names are not included).
api_json = dict(
    data=input.values.tolist(),
    test=input_test.values.tolist(),
    problem_type='classification',
    confidence_level=user_value,
    output_type=output_type,
)

In [27]:
# URL to our MAUQ AWS service
# Assemble the MAUQ service URL from the settings loaded from url.yaml.
# The scheme comes from the 'protocol' key (not 'sybil_protocol').
protocol, host, port = (url_dict[key] for key in ('protocol', 'host', 'port'))
endpoint = 'quantify-uncertainty'

url = f'{protocol}://{host}:{port}/{endpoint}'

In [28]:
%%time
response = requests.post(url, json=api_json)
print(response)
print()

<Response [200]>

CPU times: user 38.4 ms, sys: 2.25 ms, total: 40.6 ms
Wall time: 130 ms


In [29]:
# Decode the JSON reply and tabulate its 'output' rows, labelling them with
# the test-frame columns followed by 'prediction_set'.
uncertainty_json_out = response.json()
uncertainty_cols = [*input_test.columns, 'prediction_set']
uncertainty_df = pd.DataFrame(data=uncertainty_json_out['output'], columns=uncertainty_cols)

In [30]:
# Raw service output, still using encoded labels.
# TODO: display prediction_set in the original label format and drop y_pred_prob.
uncertainty_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,y_pred_prob,prediction_set
0,30,0.0,2.0,0.0,10.443,4.0,"[0.2, 0.0, 0.0, 0.1, 0.7]",[4.0]
1,57,0.0,0.0,1.0,9.945,2.0,"[0.0, 0.2, 0.8, 0.0, 0.0]",[2.0]
2,43,1.0,2.0,1.0,12.859,4.0,"[0.4, 0.0, 0.0, 0.0, 0.6]",[4.0]
3,21,0.0,0.0,1.0,28.632,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]",[0.0]
4,16,1.0,0.0,1.0,19.007,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]",[0.0]
5,38,1.0,1.0,0.0,18.295,0.0,"[0.8, 0.0, 0.0, 0.1, 0.1]",[0.0]
6,58,0.0,1.0,0.0,26.645,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]",[0.0]
7,57,0.0,2.0,0.0,14.216,4.0,"[0.5, 0.0, 0.0, 0.0, 0.5]",[]
8,51,0.0,1.0,1.0,23.003,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0]",[0.0]
9,20,0.0,0.0,0.0,11.262,1.0,"[0.0, 0.9, 0.0, 0.0, 0.1]",[1.0]


In [31]:
# Build the name <-> code tables from the encoder fitted earlier in the
# notebook instead of typing them out by hand, so they cannot drift from the
# encoding that was actually applied to the 'Drug' column.
drug_categories = ord_enc.categories_[categorical_columns.index('Drug')]
label_map = {label: float(code) for code, label in enumerate(drug_categories)}
reverse_map = {v: k for k, v in label_map.items()}

# Translate codes back to drug names. Any value that is not a known code is
# left unchanged rather than dropped or turned into NaN; that includes names
# already translated by an earlier run, so the cell is safe to re-run.
uncertainty_df['Drug'] = uncertainty_df['Drug'].map(lambda code: reverse_map.get(code, code))
uncertainty_df['prediction_set'] = uncertainty_df['prediction_set'].apply(
    lambda codes: [reverse_map.get(code, code) for code in codes]
)

# errors='ignore' tolerates the column having been removed by an earlier run.
uncertainty_df = uncertainty_df.drop(columns=['y_pred_prob'], errors='ignore')

In [32]:
# Result with drug names restored and the y_pred_prob column removed.
uncertainty_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,prediction_set
0,30,0.0,2.0,0.0,10.443,drugX,[drugX]
1,57,0.0,0.0,1.0,9.945,drugB,[drugB]
2,43,1.0,2.0,1.0,12.859,drugX,[drugX]
3,21,0.0,0.0,1.0,28.632,DrugY,[DrugY]
4,16,1.0,0.0,1.0,19.007,DrugY,[DrugY]
5,38,1.0,1.0,0.0,18.295,DrugY,[DrugY]
6,58,0.0,1.0,0.0,26.645,DrugY,[DrugY]
7,57,0.0,2.0,0.0,14.216,drugX,[]
8,51,0.0,1.0,1.0,23.003,DrugY,[DrugY]
9,20,0.0,0.0,0.0,11.262,drugA,[drugA]


In [33]:
# How often each prediction set occurs (an empty tuple is an empty set).
# The sets are stored as lists, which are unhashable, and counting list
# objects directly can raise TypeError depending on the pandas version, so
# each one is converted to a tuple before counting.
uncertainty_df['prediction_set'].map(tuple).value_counts()

[DrugY]    21
[drugX]     7
[drugB]     3
[]          3
[drugA]     3
[drugC]     3
Name: prediction_set, dtype: int64