In [1]:
import json
import yaml
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### **Extract Data**

In [2]:
file_name = 'Housing.csv'

In [3]:
dataset = pd.read_csv(file_name)

In [4]:
dataset.shape

(545, 13)

In [5]:
dataset.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [6]:
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [7]:
from sklearn.preprocessing import OneHotEncoder
#Extract categorical columns from the dataframe
#Here we extract the columns with object datatype as they are the categorical columns
categorical_columns = dataset.select_dtypes(include=['object']).columns.tolist()

#Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Apply one-hot encoding to the categorical columns
one_hot_encoded = encoder.fit_transform(dataset[categorical_columns])

#Create a DataFrame with the one-hot encoded columns
#We use get_feature_names_out() to get the column names for the encoded data
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))

# Concatenate the one-hot encoded dataframe with the original dataframe
df_encoded = pd.concat([dataset, one_hot_df], axis=1)

# Drop the original categorical columns
dataset = df_encoded.drop(categorical_columns, axis=1)

In [8]:
dataset

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,2,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,12250000,8960,4,4,4,3,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,12250000,9960,3,2,2,2,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,12215000,7500,4,2,2,3,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,11410000,7420,4,1,2,2,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
541,1767150,2400,3,1,1,0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
542,1750000,3620,2,1,1,0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
543,1750000,2910,3,1,1,0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [9]:
# time_col = dataset.columns[0]
# target_col = dataset.columns[-1]
# dataset[time_col] = dataset[time_col].astype(str)
target_col = dataset.columns[0]

In [10]:
target_col

'price'

In [11]:
train_size = 0.8
train_points = int(train_size*len(dataset))
train_df = dataset.iloc[:train_points]
test_df = dataset.iloc[train_points:]

In [12]:
train_points = int(train_size*len(train_df)) #further split train dataset to train and calibration
cal_df = train_df.iloc[train_points:]
train_df = train_df.iloc[:train_points]

In [13]:
df_dict = {'train': train_df, 'calibration': cal_df, 'test': test_df}
for key, value in df_dict.items():
    print(f'{key} dataframe shape: {value.shape}')

train dataframe shape: (348, 21)
calibration dataframe shape: (88, 21)
test dataframe shape: (109, 21)


In [14]:
# figsize = (16, 8)
# cal_df.set_index(time_col)[target_col].plot(figsize=figsize);

### **Base model fitting (prior to fit mauq)**

In [15]:
from sklearn.ensemble import RandomForestRegressor
# train_X = train_df.iloc[:, :-1]
# train_y = train_df.iloc[:, -1]
# cal_X = cal_df.iloc[:, :-1]
# cal_y = cal_df.iloc[:, -1]
# test_X = test_df.iloc[:, :-1]
# test_y = test_df.iloc[:, -1]
train_X = train_df.iloc[:, 1:]
train_y = train_df.iloc[:, 0]
cal_X = cal_df.iloc[:, 1:]
cal_y = cal_df.iloc[:, 0]
test_X = test_df.iloc[:, 1:]
test_y = test_df.iloc[:, 0]
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(train_X, train_y)

In [16]:
y_pred_cal = regr.predict(cal_X)
y_pred_test = regr.predict(test_X)

In [17]:
#data input calibration set
input_data_cal = pd.concat([cal_X, pd.DataFrame(cal_y)], axis=1)
input_data_cal['price_pred'] = y_pred_cal
input_data_cal

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,...,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price,price_pred
348,3150,2,2,1,0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3815000,5.175234e+06
349,4820,3,1,2,0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3780000,4.861512e+06
350,3420,2,1,2,1,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3780000,4.983130e+06
351,3600,2,1,1,0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3780000,4.716080e+06
352,5830,2,1,1,2,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3780000,5.138846e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,3180,4,1,2,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,3290000,4.809854e+06
432,6060,3,1,1,0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3290000,5.730614e+06
433,3480,4,1,2,1,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3290000,4.724132e+06
434,3792,4,1,2,0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3290000,4.730335e+06


In [18]:
#data input test set (optional)
input_data_test = pd.concat([test_X, pd.DataFrame(test_y)], axis=1)
input_data_test['price_pred'] = y_pred_test
input_data_test

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,...,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price,price_pred
436,2145,3,1,2,0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,3290000,4.726225e+06
437,5880,3,1,1,1,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3290000,5.138846e+06
438,4500,2,1,1,0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3255000,4.814553e+06
439,3930,2,1,1,0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3255000,4.730335e+06
440,3640,4,1,2,0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3234000,4.719656e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,3000,2,1,1,2,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1820000,4.727707e+06
541,2400,3,1,1,0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1767150,4.716080e+06
542,3620,2,1,1,0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1750000,4.716080e+06
543,2910,3,1,1,0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1750000,4.722650e+06


In [19]:
#data input train set (optional)
input_data_train = pd.concat([train_X, pd.DataFrame(train_y)], axis=1)
input_data_train

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,...,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price
0,7420,4,2,3,2,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,13300000
1,8960,4,4,4,3,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,12250000
2,9960,3,2,2,2,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,12250000
3,7500,4,2,2,3,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,12215000
4,7420,4,1,2,2,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,11410000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,4080,2,1,1,0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3850000
344,3850,2,1,1,0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3850000
345,2015,3,1,2,0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3850000
346,2176,2,1,2,0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3850000


### **MAUQ Quantify Uncertainty API**

In [20]:
with open('url.yaml', 'r') as file:
    url_dict = yaml.safe_load(file)

In [21]:
user_value = 0.9
output_type = 'data'  # 'data', 'estimate'

In [22]:
api_json = {
    'data': input_data_cal.values.tolist(),
    'test': input_data_test.values.tolist(),
    'confidence_level': user_value,
    'output_type': output_type
}

In [23]:
# URL to our MAUQ AWS service
protocol = url_dict['protocol']  # protocol not sybil_protocol
host = url_dict['host']
port = url_dict['port']
endpoint = 'quantify-uncertainty'

url = '%s://%s:%s/%s' % (protocol, host, str(port), endpoint)

In [24]:
%%time
response = requests.post(url, json=api_json)
print(response)
print()

<Response [200]>

CPU times: user 32.4 ms, sys: 1.37 ms, total: 33.8 ms
Wall time: 112 ms


In [25]:
uncertainty_json_out = response.json()
# uncertainty_json_out

In [26]:
uncertainty_cols = list(input_data_test.columns) + ['lower_interval', 'upper_interval']
uncertainty_df = pd.DataFrame(uncertainty_json_out['output'], columns=uncertainty_cols)

In [27]:
uncertainty_df

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,basement_no,...,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished,price,price_pred,lower_interval,upper_interval
0,2145.0,3.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,3290000.0,4.726225e+06,2.748749e+06,6.703701e+06
1,5880.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,3290000.0,5.138846e+06,3.161371e+06,7.116322e+06
2,4500.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,3255000.0,4.814553e+06,2.837078e+06,6.792029e+06
3,3930.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,3255000.0,4.730335e+06,2.752860e+06,6.707811e+06
4,3640.0,4.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,3234000.0,4.719656e+06,2.742180e+06,6.697132e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,3000.0,2.0,1.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1820000.0,4.727707e+06,2.750232e+06,6.705183e+06
105,2400.0,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1767150.0,4.716080e+06,2.738605e+06,6.693556e+06
106,3620.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1750000.0,4.716080e+06,2.738605e+06,6.693556e+06
107,2910.0,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1750000.0,4.722650e+06,2.745174e+06,6.700125e+06


In [28]:
price_cols = ['price', 'price_pred', 'lower_interval', 'upper_interval']

# suppress scientific notation by setting float_format
pd.options.display.float_format = '{:.0f}'.format

# display the dataframe without scientific notation
uncertainty_df[price_cols]

Unnamed: 0,price,price_pred,lower_interval,upper_interval
0,3290000,4726225,2748749,6703701
1,3290000,5138846,3161371,7116322
2,3255000,4814553,2837078,6792029
3,3255000,4730335,2752860,6707811
4,3234000,4719656,2742180,6697132
...,...,...,...,...
104,1820000,4727707,2750232,6705183
105,1767150,4716080,2738605,6693556
106,1750000,4716080,2738605,6693556
107,1750000,4722650,2745174,6700125
