In [14]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split



In [15]:
df = pd.read_csv("https://raw.githubusercontent.com/SamuelD005/challenge-regression/development/Data8.csv", sep=',')
del df["Unnamed: 0"]



In [5]:
df.columns

Index(['Locality', 'Type of property', 'Price', 'Number of rooms', 'Area',
       'Fully equipped kitchen', 'Furnished', 'Open fire', 'Terrace Area',
       'Garden Area', 'Surface of the land', 'Number of facades',
       'Swimming pool', 'State of the building', 'Province', 'Region',
       'PriceperMeter'],
      dtype='object')

In [6]:
house = df[df["Type of property"] == "house"]
house.drop(['PriceperMeter', 'Open fire'], inplace=True, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


### Separating the target from the data


In [7]:
house_x = house.drop("Price", axis=1)
house_y = house[['Price']]

### Separating the training and test dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(house_x,house_y, test_size=0.2, random_state=41)

### Selecting numerical and categorical features

In [9]:
numerical_features = ['Number of rooms', 'Area', 'Terrace Area', 'Garden Area', 'Surface of the land']
categorical_features = ['Locality','Fully equipped kitchen', 'Furnished','Number of facades',
                       'Swimming pool', 'State of the building', 'Province', 'Region']


### Creating the pipelines to preprocess the data

In [19]:
num_pipeline = make_pipeline(SimpleImputer(missing_values=np.nan), StandardScaler(), PolynomialFeatures(degree=3))
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent", missing_values=np.nan),OneHotEncoder(handle_unknown='ignore'))

In [20]:
preprocessor = make_column_transformer((num_pipeline, numerical_features), (cat_pipeline, categorical_features))
b = BaggingRegressor(base_estimator=XGBRegressor(n_estimators=150), n_estimators=5)

### Creating the model 


In [21]:
model = make_pipeline(preprocessor, b)
model.fit(X_train, y_train)
model.score(X_test, y_test)


  return f(*args, **kwargs)


0.7577116622531147

In [None]:
0.7577116622531147


### Metrics

In [25]:
y_pred = model.predict(X_test)

In [26]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, r2_score
print('Mean squared error : ' + str(mean_squared_error(y_test,y_pred)))
print('Mean absolute error : ' + str(mean_absolute_error(y_test,y_pred)))
print('Explained vaiance score : ' + str(explained_variance_score(y_test,y_pred)))
print('r2 score : ' + str(r2_score(y_test,y_pred)))


Mean squared error : 49772195059.58447
Mean absolute error : 103700.24327161013
Explained vaiance score : 0.7577117690728272
r2 score : 0.7577116622531147
