In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.ensemble import BaggingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, Normalizer
from xgboost import XGBRegressor



In [19]:
# Load the listings dataset straight from GitHub and discard the
# "Unnamed: 0" column (presumably a leftover saved index - TODO confirm).
df = pd.read_csv(
    "https://raw.githubusercontent.com/SamuelD005/challenge-regression/development/Data8.csv",
    sep=",",
).drop(columns=["Unnamed: 0"])

In [21]:
# Restrict the study to apartment listings only.
apparts = df.loc[df["Type of property"] == "apartment"]

# Target is the listing price; the features are every other column except
# "PriceperMeter" (presumably derived from the price, so it would leak the
# target - TODO confirm).
apparts_y = apparts["Price"]
appart_x = apparts.drop(columns=["Price", "PriceperMeter"])

In [22]:
# Hold out 20% of the apartments for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    appart_x,
    apparts_y,
    test_size=0.2,
    random_state=41,
)


In [25]:
# Columns routed through the numeric preprocessing pipeline.
numerical_features = [
    "Number of rooms",
    "Area",
    "Terrace Area",
    "Garden Area",
    "Surface of the land",
]

# Columns routed through the categorical (one-hot) pipeline.
# Note that "Number of facades" is deliberately listed here, i.e. it is
# treated as a category rather than as a quantity.
categorical_features = [
    "Locality",
    "Fully equipped kitchen",
    "Furnished",
    "Number of facades",
    "Swimming pool",
    "State of the building",
    "Province",
    "Region",
]

In [26]:
# Numeric branch: fill NaNs with the imputer's default strategy, standardise,
# then expand into polynomial terms up to degree 3.
num_pipeline = make_pipeline(
    SimpleImputer(missing_values=np.nan),
    StandardScaler(),
    PolynomialFeatures(degree=3),
)

# Categorical branch: fill NaNs with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time
# instead of raising.
cat_pipeline = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

In [33]:
# Send each column group through its own preprocessing pipeline.
preprocessor = make_column_transformer(
    (num_pipeline, numerical_features),
    (cat_pipeline, categorical_features),
)

# Bagging ensemble of 16 XGBoost regressors with 150 trees each.
# - The base estimator is passed positionally on purpose: the keyword was
#   renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
#   old name was removed in 1.4, so the positional form works on both.
# - `random_state` seeds the bootstrap sampling so the fitted model and its
#   scores are reproducible; scores recorded from an unseeded run will not
#   match exactly.
b = BaggingRegressor(
    XGBRegressor(n_estimators=150),
    n_estimators=16,
    random_state=41,
)

In [34]:
# Chain preprocessing and the bagged regressor into a single estimator, then
# train it on the training split. The fitted pipeline is the cell's last
# expression so its repr is displayed.
model = make_pipeline(preprocessor, b)
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler()),
                                                                  ('polynomialfeatures',
                                                                   PolynomialFeatures(degree=3))]),
                                                  ['Number of rooms', 'Area',
                                                   'Terrace Area',
                                                   'Garden Area',
                                                   'Surface of the land']),
                                                 ('pipeline-2',
                  

In [35]:
# Score the fitted pipeline on the held-out split (for a regressor this is
# the R^2 coefficient; it matches the r2_score printed in the metrics cell).
model.score(X=X_test, y=y_test)

0.7665853476003461

### Metrics

In [37]:
# Predicted prices for the held-out listings, reused by the metrics below.
y_pred = model.predict(X=X_test)

In [39]:
# Report several regression metrics for the test-set predictions.
# The metric functions come from the notebook's top import cell.
# Each line is printed as "<label> : <value>".
regression_metrics = {
    'Mean squared error': mean_squared_error,
    'Mean absolute error': mean_absolute_error,
    'Explained variance score': explained_variance_score,
    'r2 score': r2_score,
}
for label, metric in regression_metrics.items():
    print(f'{label} : {metric(y_test, y_pred)}')


Mean squared error : 20238465594.252827
Mean absolute error : 65133.54737404409
Explained vaiance score : 0.7668364523911433
r2 score : 0.7665853476003461
