In [1]:
import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the raw table from CSV.
# `pd.read_csv` already returns a DataFrame, so no extra `pd.DataFrame(...)`
# wrapping is needed.
data = 'simpledata.csv'  # relative path: resolved against the kernel's working directory
df_data = pd.read_csv(data, sep=',')

In [3]:
# Show the dtype of every column to see which ones are numeric and which are text (object).
df_data.dtypes

id                   float64
type                  object
subtype               object
bedroomCount         float64
locality              object
postCode               int64
habitableSurface     float64
buildingCondition     object
facedeCount          float64
price                float64
dtype: object

In [4]:
# One-hot encode the text columns so that every remaining column is numeric/boolean.
# `drop_first=True` drops the first dummy level of each encoded column.
#
# This cell rebinds `df_data`, so after one run the source columns no longer exist.
# The column list is therefore filtered against the current frame: re-running the
# cell on an already-encoded frame is a no-op instead of raising a KeyError.
columns_to_encode = [
    col
    for col in ['type', 'subtype', 'locality', 'buildingCondition']
    if col in df_data.columns
]
if columns_to_encode:
    df_data = pd.get_dummies(df_data, columns=columns_to_encode, drop_first=True)

print(df_data.head(10))
print(df_data.dtypes)

           id  bedroomCount  postCode  habitableSurface  facedeCount  \
0  20661494.0           2.0      1040             100.0          1.0   
1  20647642.0           4.0      1040             270.0          2.0   
2  20644816.0           2.0      1040              87.0          2.0   
3  20659813.0           2.0      1040             104.0          2.0   
4  20633249.0           1.0      1040              71.0          2.0   
5  20639359.0           2.0      1040              90.0          2.0   
6  20634658.0           3.0      1040             220.0          2.0   
7  20634657.0           3.0      1040             220.0          2.0   
8  20634655.0           2.0      1040             187.0          2.0   
9  20651309.0           1.0      1040              93.0          2.0   

      price  type_HOUSE  subtype_DUPLEX  subtype_EXCEPTIONAL_PROPERTY  \
0  399000.0       False           False                         False   
1  895000.0        True           False                      

In [5]:
# Column groups and per-group preprocessing pipelines.
#
# NOTE(review): the earlier `pd.get_dummies` cell replaces the four text columns
# below with dummy columns in `df_data`, so these names are no longer present in
# that frame — confirm which frame this preprocessing is meant to be applied to.
# NOTE(review): 'id' looks like a row identifier rather than a predictive
# feature — confirm it should be treated as a numeric input.
categorical_features = ['type', 'subtype', 'locality', 'buildingCondition']
numeric_features = ['id','bedroomCount', 'postCode', 'habitableSurface', 'facedeCount']

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# `fill_value` was removed: SimpleImputer only uses it when strategy='constant',
# so with 'most_frequent' it had no effect and misleadingly suggested a 'missing'
# placeholder category.
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    # Categories unseen during fit are encoded as all-zero rows instead of raising.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [6]:
# Route each column group through its own transformer:
#   'num' -> numeric_transformer on numeric_features
#   'cat' -> categorical_transformer on categorical_features
# NOTE(review): the modelling pipeline built later in this notebook contains only
# the regressor step, so this preprocessor is defined but not applied there.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [7]:
# --- Estimator: gradient-boosted regression trees with a fixed seed ---
model_gbr = GradientBoostingRegressor(
    random_state=42,
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
)

# --- Single-step pipeline wrapping the estimator ---
pipeline = Pipeline([('regressor', model_gbr)])

# --- Target and feature matrix: 'price' is the target, every other column is a feature ---
y = df_data['price']
X = df_data.drop(columns='price')

# Keep the feature column layout in memory so later inputs can be aligned to it.
model_columns = X.columns

# --- Seeded hold-out split: 20% of the rows go to the test set ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Fit on the training split and predict the held-out split.
# `fit` returns the pipeline itself, so the two calls can be chained.
y_pred_gbr = pipeline.fit(X_train, y_train).predict(X_test)

# Error metrics on the test split (RMSE is derived from the MSE when printing).
r2 = r2_score(y_test, y_pred_gbr)
MAE = mean_absolute_error(y_test, y_pred_gbr)
mse = mean_squared_error(y_test, y_pred_gbr)

print(
    f"MAE: {MAE:.2f}",
    f"RMSE: {mse**0.5:.2f}",
    f"R² score : {r2:.2f}",
    sep="\n",
)

MAE: 133192.72
RMSE: 301605.00
R² score : 0.63


In [9]:
# Persist the regressor to disk with joblib.
# `model_gbr` is the estimator instance that was placed in the pipeline as its
# 'regressor' step; only that estimator is written, not the pipeline object.
# `joblib.dump` returns the list of written file names, which the cell displays.
joblib.dump(model_gbr, 'gbr_modele.pkl')

['gbr_modele.pkl']