In [3]:
import pickle
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split


In [4]:
df = pd.read_csv("https://raw.githubusercontent.com/SamuelD005/challenge-regression/development/Data8.csv", sep=',')
del df["Unnamed: 0"]

In [6]:
df['Province'].value_counts()

Flandre Occidental    12620
Flandre Oriental       7349
Hainaut                7081
Brussel                6972
Anvers                 6899
Liège                  5635
Brabant Flamand        4585
Brabant Wallon         2831
Limbourg               2754
Namur                  1602
Luxembourg              949
Name: Province, dtype: int64

In [4]:
x = df.drop(["Price", "PriceperMeter"], axis=1)
y = df[["Price"]]

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=41)

In [6]:
numerical_features = ['Locality','Number of rooms', 'Area', 'Terrace Area', 'Garden Area', 'Surface of the land']
categorical_features = ['Fully equipped kitchen', 'Furnished','Number of facades',
                       'Swimming pool', 'State of the building', 'Province', 'Region']

In [7]:
num_pipeline = make_pipeline(SimpleImputer(missing_values=np.nan), StandardScaler(), PolynomialFeatures(degree=3))
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent", missing_values=np.nan),OneHotEncoder(handle_unknown='ignore'))

In [8]:
preprocessor = make_column_transformer((num_pipeline, numerical_features), (cat_pipeline, categorical_features))
b = BaggingRegressor(base_estimator=XGBRegressor(n_estimators=150), n_estimators=16)

In [9]:
model = make_pipeline(preprocessor, b)
model.fit(x_train, y_train)


  return f(*args, **kwargs)


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler()),
                                                                  ('polynomialfeatures',
                                                                   PolynomialFeatures(degree=3))]),
                                                  ['Locality',
                                                   'Number of rooms', 'Area',
                                                   'Terrace Area',
                                                   'Garden Area',
                                                   'Surface of the land']),
                   

### metrics

In [10]:
model.score(x_test, y_test)

0.7574321295725

In [11]:
y_pred = model.predict(x_test)

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score
print('Mean squared error : ' + str(mean_squared_error(y_test,y_pred)))
print('Mean absolute error : ' + str(mean_absolute_error(y_test,y_pred)))
print('Explained vaiance score : ' + str(explained_variance_score(y_test,y_pred)))
print('r2 score : ' + str(r2_score(y_test,y_pred)))

Mean squared error : 31022727615.411995
Mean absolute error : 86094.01882551557
Explained vaiance score : 0.7577142613146228
r2 score : 0.7574321295725


In [13]:
pickle_out = open("estimator.pkl", mode="wb")
pickle.dump(model, pickle_out)
pickle_out.close()

In [14]:
load_model = pickle.load(open("estimator.pkl", mode="rb"))

In [15]:
result = load_model.score(x_test, y_test)

In [16]:
result

0.7574321295725