# Regression Example based on Paris House Price Prediction

In [1]:
# Step 1: Import all the required modules
import pureml
import pandas as pd
from pureml.decorators import load_data

In [None]:
# Log in to PureML through its command-line tool; the leading `!` hands the line to the system shell.
!pureml auth login

In [2]:
# Load the raw training data through PureML's `load_data` decorator.
# The decorator is referenced by its module path because the function below is
# also named `load_data`: with the bare imported name, re-running this cell would
# apply `@load_data()` to the already-decorated function and raise a TypeError.
@pureml.decorators.load_data()
def load_data():
    """Read the training CSV into a DataFrame.

    Returns:
        pandas.DataFrame: the contents of 'data/train.csv'.
    """
    df = pd.read_csv('data/train.csv') # change the path to your data location
    return df
data = load_data()
print(data)

          id  squareMeters  numberOfRooms  hasYard  hasPool  floors  cityCode  \
0          0         34291             24        1        0      47     35693   
1          1         95145             60        0        1      60     34773   
2          2         92661             45        1        1      62     45457   
3          3         97184             99        0        0      59     15113   
4          4         61752            100        0        0      57     64245   
...      ...           ...            ...      ...      ...     ...       ...   
22725  22725         55825             84        1        0      70     12031   
22726  22726         65870             88        1        0      49     23197   
22727  22727         93192             42        1        0      39      8539   
22728  22728         65797             86        1        0      89     23197   
22729  22729         82244             18        1        0      38     86728   

       cityPartRange  numPr

In [3]:
from sklearn.model_selection import train_test_split
from pureml.decorators import dataset

In [4]:
# Create a versioned train/test dataset and upload it to PureML.
@dataset(label='Regression:Example3',upload=True)
def create_data():
    """Split the loaded data into train and test sets.

    Returns:
        dict: keys "x_train", "x_test", "y_train" and "y_test", holding the
        feature frames and the 'price' target series produced by
        train_test_split.
    """
    df = load_data()
    features = ['squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'cityPartRange', 'cityCode', 'floors',
                'numPrevOwners', 'made', 'isNewBuilt',
                'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom']
    # Bind the target once and pass it to the split (previously it was
    # assigned to an unused local and then looked up a second time).
    target = df['price']
    # No test_size is given, so scikit-learn's default split ratio applies;
    # random_state pins the shuffle so the split is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(df[features], target, random_state=42)
    return {"x_train": x_train, "x_test": x_test, "y_train": y_train, "y_test": y_test}

In [5]:
# Run the dataset step; the returned dict of splits is shown as the cell output.
create_data()

{'x_train':        squareMeters  numberOfRooms  hasYard  hasPool  cityPartRange  cityCode  \
 5848           7477             42        0        1              5     26153   
 4021          40214             71        1        0              7     98833   
 20121         57718             64        1        1              6     51111   
 17311         20658              6        0        0              1     80216   
 22111         61104             50        1        1              4     50452   
 ...             ...            ...      ...      ...            ...       ...   
 11964         40760             87        0        0              2      3812   
 21575         20075             20        1        1              5     14628   
 5390          72772             15        0        0              9     38639   
 860           32060             56        0        1              8     68761   
 15795         45070            100        1        0              8     32535   
 
   

In [6]:
# Pull the registered dataset back from PureML and unpack its four splits.
df = pureml.dataset.fetch('Regression:Example3:v1')
x_test, y_test, x_train, y_train = (
    df[split] for split in ('x_test', 'y_test', 'x_train', 'y_train')
)

In [7]:
# Display the test-split features taken from the fetched dataset.
x_test

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,cityPartRange,cityCode,floors,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom
1087,50673,9,1,0,3,88441,67,3,2015,0,1,3836,9274,192,0,6
6377,75848,26,1,1,9,22796,17,2,2017,0,0,6974,771,431,0,7
4153,89637,48,0,0,4,19556,80,8,2006,1,0,5481,2516,112,1,4
19452,71824,10,0,0,7,58542,50,1,1997,0,1,4901,3635,313,1,0
10043,99886,46,0,1,4,34373,32,9,2003,1,1,4516,7789,241,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12371,58657,69,1,1,2,16152,89,7,2003,0,1,7712,9572,974,1,2
7725,82525,38,0,0,6,5079,88,7,2008,1,1,4674,3909,946,1,4
10714,56885,80,1,1,1,75459,80,6,2020,1,1,729,478,155,0,7
5453,45179,78,0,1,9,58110,41,8,2015,1,1,168,6521,612,0,4


In [8]:
# Display the test-split target values taken from the fetched dataset.
y_test

1087     5076210.5
6377     7591705.7
4153     8967748.6
19452    7190229.7
10043    9999687.3
           ...    
12371    5867038.3
7725     8258910.8
10714    5699848.1
5453     4519301.5
5230     1949922.4
Name: price, Length: 5683, dtype: float64

In [9]:
# Display the training-split features taken from the fetched dataset.
x_train

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,cityPartRange,cityCode,floors,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom
5848,7477,42,0,1,5,26153,77,7,2020,1,0,3233,4102,747,0,0
4021,40214,71,1,0,7,98833,69,8,2005,1,0,6957,3629,615,0,3
20121,57718,64,1,1,6,51111,9,1,1993,1,0,6434,5342,563,1,1
17311,20658,6,0,0,1,80216,48,2,1997,0,1,7165,1005,161,1,9
22111,61104,50,1,1,4,50452,78,4,1997,0,0,6834,467,133,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,40760,87,0,0,2,3812,34,9,2000,0,0,8787,422,307,0,4
21575,20075,20,1,1,5,14628,85,2,2010,0,0,5473,13,139,0,5
5390,72772,15,0,0,9,38639,26,3,2006,1,1,4304,8353,529,0,1
860,32060,56,0,1,8,68761,38,8,2007,1,0,4286,6867,805,1,8


In [10]:
# Display the training-split target values taken from the fetched dataset.
y_train

5848      752083.1
4021     4028553.5
20121    5781612.3
17311    2076936.7
22111    6124108.7
           ...    
11964    4080383.1
21575    2010227.0
5390     7280658.6
860      3211741.7
15795    4510977.7
Name: price, Length: 17047, dtype: float64

In [11]:
from pureml.decorators import model

In [12]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

In [13]:
# Registering the model.
# Fetch the dataset splits that train_model reads as module-level names.
df = pureml.dataset.fetch('Regression:Example3:v1')
x_test, y_test = df['x_test'], df['y_test']
y_train, x_train = df['y_train'], df['x_train']
@model(label='Regression_example_1_model:development2')
def train_model():
    """Fit an XGBoost regressor and log its RMSE on the test split.

    Reads the module-level x_train, y_train, x_test and y_test.

    Returns:
        xgb.XGBRegressor: the fitted regressor.
    """
    hyperparams = {
        'booster': 'gbtree',
        'learning_rate': 0.11,
        'n_estimators': 77,
        'objective': 'reg:squarederror',
        'gamma': 1,
        'max_depth': 4,
        'reg_lambda': 1,
        'reg_alpha': 1,
        'subsample': 0.85,
        'colsample_bytree': 1,
        'min_child_weight': 2,
        'seed': 42
    }
    regressor = xgb.XGBRegressor(**hyperparams)
    regressor.fit(x_train, y_train)

    # Score on the held-out split: RMSE is the square root of the MSE.
    test_mse = mean_squared_error(y_test, regressor.predict(x_test))
    rmse = np.sqrt(test_mse)

    pureml.log(metrics={'RMSE': rmse})
    print(f"RMSE: {rmse}")
    return regressor
train_model()

RMSE: 161026.14193028354


No params are found in config
No figures are found in config


In [14]:
# Fetch the registered model by its full label.
# NOTE(review): the trailing `v1` presumably selects the version — confirm against the PureML label format.
pureml.model.fetch('Regression_example_1_model:development2:v1')

In [15]:
# Attach a prediction script to the registered model: `paths` maps the
# 'predict' entry to the file 'predict.py'.
# NOTE(review): predict.py is not shown in this notebook — confirm it exists at that relative path.
pureml.predict.add(label='Regression_example_1_model:development2:v1',paths={'predict':'predict.py'})

In [16]:
# Fetch the prediction entry registered under the same model label.
pureml.predict.fetch(label='Regression_example_1_model:development2:v1')

In [17]:
# Evaluate the registered model against the registered dataset as a
# regression task; the resulting metrics dict is shown as the cell output.
pureml.eval(task_type='regression',
            label_model='Regression_example_1_model:development2:v1',
            label_dataset='Regression:Example3:v1')

{'mse': 25929418384.95182, 'mae': 20012.502131219208}

In [None]:
# Serve the registered model over FastAPI.
# The label points at the `development2` branch, which is the one this notebook
# registers, attaches predict.py to and evaluates (it previously named
# `development1`, which nothing in this notebook creates).
import pureml
pureml.fastapi.run(label='Regression_example_1_model:development2:v1')