In [None]:
# Environment setup: create a virtual environment named "myenv", run its
# activation script (Windows-style Scripts\activate path) and list the
# installed packages.
# NOTE(review): these are shell commands, not Python -- presumably meant to be
# run in a terminal rather than in the notebook kernel; confirm.
python -m venv myenv
myenv\Scripts\activate
pip list

In [22]:
## -- main lib
import pandas as pd
## -- sklearn data processing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
## -- sklearn modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
## -- sklearn metrics
from sklearn.metrics import r2_score

In [23]:
# Custom helper that extracts the year from a column of dates
def extract_year(data):
    """Return the year of every date in ``data`` as a single-column 2D array.

    Parameters
    ----------
    data : pandas.Series
        Date-like values that ``pd.to_datetime`` can parse. A Series is
        required because the ``.dt`` accessor is used on the parsed result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), 1)`` containing the year of each entry.
    """
    # Parse to datetime first so the ``.dt`` accessor becomes available
    parsed_dates = pd.to_datetime(data)
    years = parsed_dates.dt.year
    # Turn the 1D year values into a 2D array with one column
    return years.to_numpy().reshape(-1, 1)

# Instantiate the custom transformer that wraps extract_year
year_extractor = FunctionTransformer(extract_year)


In [24]:
def impute_outliers(data, low, high, replacement):
    """Replace values outside ``[low, high]`` with ``replacement``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to clean.
    low, high : number
        Inclusive bounds of the accepted range; values strictly below ``low``
        or strictly above ``high`` are treated as outliers.
    replacement : number
        Value substituted for every outlier.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), 1)`` with the outliers replaced.

    Notes
    -----
    NaN entries compare as neither below ``low`` nor above ``high``, so they
    are left untouched (same as the previous element-wise implementation).
    """
    # Vectorized comparison instead of a per-element Python lambda
    out_of_range = (data < low) | (data > high)
    cleaned = data.mask(out_of_range, replacement)
    return cleaned.to_numpy().reshape(-1, 1)

# One outlier-replacement transformer per numeric column. Each rule is
# (transformer name, column, low, high, replacement) and is forwarded to
# impute_outliers through FunctionTransformer's kw_args.
outlier_imputer_num = ColumnTransformer(
    transformers=[
        (
            step_name,
            FunctionTransformer(
                impute_outliers,
                kw_args={'low': low, 'high': high, 'replacement': replacement},
            ),
            column,
        )
        for step_name, column, low, high, replacement in (
            ('outlier_replacer_size_m2', 'size_m2', 10, 500, 140),
            ('outlier_replacer_num_bedrooms', 'num_bedrooms', 0, 10, 3),
            ('outlier_replacer_num_bathrooms', 'num_bathrooms', 0, 5, 2),
        )
    ],
    # Keep the remaining columns as they are
    remainder='passthrough',
)

def to_dataframe_with_columns(data, columns):
    """Wrap ``data`` in a DataFrame whose columns are labelled ``columns``.

    Parameters
    ----------
    data : array-like
        Tabular values accepted by the ``pd.DataFrame`` constructor.
    columns : list
        Column labels to assign, one per column of ``data``.

    Returns
    -------
    pandas.DataFrame
        The labelled frame.
    """
    labelled_frame = pd.DataFrame(data=data, columns=columns)
    return labelled_frame

In [25]:
# First version of the preprocessor: mean-impute the numeric columns,
# one-hot encode the city and extract the year from the build date.
# NOTE(review): `preprocessor` is assigned again in a later cell, so this
# version is replaced before the pipeline is built.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            SimpleImputer(strategy='mean'),
            ['size_m2', 'num_bedrooms', 'num_bathrooms'],
        ),
        ('cat', OneHotEncoder(sparse_output=False), ['city']),
        ('date', year_extractor, 'date_built'),
    ]
)

In [26]:
# Numeric feature columns handled by the numeric sub-pipeline
cols_num = ['size_m2', 'num_bedrooms', 'num_bathrooms']

# Numeric preprocessing: fill NaN, restore column labels, replace outliers
preprocessor_num = Pipeline(
    steps=[
        # Replace missing values with the column mean
        ('nan_imputer_num', SimpleImputer(strategy='mean')),
        # Re-label the columns; the outlier step below selects columns by name
        (
            'to_dataframe',
            FunctionTransformer(
                to_dataframe_with_columns,
                kw_args={'columns': cols_num},
            ),
        ),
        # Replace out-of-range values column by column
        ('outlier_imputer_num', outlier_imputer_num),
    ]
)

In [27]:
# Full preprocessor: numeric sub-pipeline (NaN + outlier handling), one-hot
# encoding of the city, and year extraction from the build date.
preprocessor = ColumnTransformer(
    transformers=[
        # Reuse cols_num so the selected columns always match the labels that
        # preprocessor_num restores internally (no duplicated literal list).
        ('num', preprocessor_num, cols_num),
        # handle_unknown='ignore' encodes a city unseen during fit as all
        # zeros instead of raising at predict time.
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['city'],
        ),
        ('date', year_extractor, 'date_built'),
    ]
)

In [28]:
# End-to-end model: preprocessing, feature standardization, linear regression
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression()),
    ]
)

In [29]:
# Load the dataset and separate the model inputs from the target
df = pd.read_csv('synthetic_housing_dataset.csv')
feature_columns = ['size_m2', 'num_bedrooms', 'num_bathrooms', 'city', 'date_built']
X = df[feature_columns]
y = df['price']

# 70% of the rows go to training; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=45,
)

In [30]:
# Fit the whole pipeline (preprocessor, scaler, regressor) on the training split
pipeline.fit(X_train, y_train)

In [31]:
# Predict on both splits so train and test scores can be compared
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

In [32]:
# Coefficient of determination on each split
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('R2 train :', r2_train)
print('R2 test :', r2_test)

R2 train : 0.9850531976936413
R2 test : 0.9745114193051982
