# 0) Librairies

In [19]:
import os
import tarfile
from six.moves import urllib

import pandas as pd

import matplotlib.pyplot as plt

import numpy as np
import hashlib
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.svm import SVR

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold 
from sklearn.base import clone


# 1) Importation des données

In [20]:

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/" 
    # lien vers le dossier de téléchargement
    #
HOUSING_PATH = "datasets/housing" # chemin vers le dossier où sont stockés les fichiers
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz" # lien de téléchargement

def fetch_housing_data(housing_url = HOUSING_URL, housing_path = HOUSING_PATH): 
    if not os.path.isdir(housing_path): # Création du dossier housing_path Si non existence
        os.makedirs(housing_path) 
    tgz_path = os.path.join(housing_path, "housing.tgz") # Mise en mémoire du fichier tgz_path ?
    urllib.request.urlretrieve(housing_url, tgz_path) # Téléchargement du fichier tgz
    housing_tgz = tarfile.open(tgz_path) # Mise en mémoire de l'archive tgz
    housing_tgz.extractall(path=housing_path) # Décompression de l'archive dans housing_path
    housing_tgz.close()

#fetch_housing_data() # Téléchargement du fichier
    
def load_housing_data(housing_path=HOUSING_PATH, name = 'housing'):
    csv_path = os.path.join(housing_path, name + ".csv") # Mise en mémoire du fichier csv
    return pd.read_csv(csv_path) # Création de l'objet panda <DataFrame>

    # Chargement des données
housing = load_housing_data() # <class 'pandas.DataFrame'>

# 2) Création des jeux de test et d'entrainement

In [125]:
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split.split() renvoie un générateur contenant les labels du jeu de donnée d'entrainement, et de celui de test
    #
    #      IN: X <array-like of shape> (Données d'entrainement)
    #          y <array-like of shape> (Label utilisé pour la stratification; étiquette en supervisé)
    #     OUT: train, test <tuple of np.ndarray> 
    #
    strat_train_set = housing.loc[train_index] # <pd.DataFrame> (Jeu d'entrainement)
    strat_test_set = housing.loc[test_index] # <pd.DataFrame> (Jeu de test)

# Suppression du label
for set in (strat_train_set, strat_test_set, housing): 
    set.drop(["income_cat"], axis=1, inplace=True)

# Copie des données
Train = strat_train_set#.drop('median_house_value', axis = 1) 
Train_labels = strat_train_set['median_house_value'].copy()

Test = strat_test_set.drop('median_house_value', axis = 1) 
Test_labels = strat_test_set['median_house_value'].copy()

# Liste des attributs
num_attr = list(Train.drop(['ocean_proximity', 'median_house_value'], axis = 1, inplace = False).columns)
cat_attr = ['ocean_proximity']
added_attr = ['rooms_per_household', 'population_per_household', 'bedrooms_per_rooms']
ocean_attr = Train['ocean_proximity'].value_counts().index.tolist()

total_attr = pd.Index(num_attr + added_attr + ocean_attr + ['median_house_value'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)


# 3) Transformateurs sur mesure

In [126]:
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    
    def __init__(self, add_bedrooms_per_room = True): # Stock le paramètre. ni *args ni **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room 
    
    def fit(self, X, y = None): # Renvoie self
        return self
    
    def transform(self, X, y = None): # Renvoie X modifié
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix] 
        population_per_household = X[:, population_ix] / X[:, household_ix] 
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else: 
            return np.c_[X, rooms_per_household, population_per_household]
        
class DataFrameSelector(BaseEstimator, TransformerMixin):

    def __init__(self, label_selected):
        self.label_selected = label_selected  

    def fit(self, X, y = None):
        return self

    def transform(self, X): # Renvoie les données de X dont le label est sélectionné
        return X[self.label_selected]

# 4) Pipeline

In [130]:
num_pipe = Pipeline([('select', DataFrameSelector(num_attr)),
                     ('imputer', SimpleImputer(strategy = 'median')),
                     ('lab_add', CombinedAttributesAdder()),
                     ('standardisation', StandardScaler())])

cat_pipe = Pipeline([('select', DataFrameSelector(cat_attr)),
                     ('OneHot', OneHotEncoder(categories = [ocean_attr], handle_unknown = 'ignore')),
                     ('imputer', SimpleImputer(strategy = 'constant', fill_value = 0, copy = True))])

house_value_pipe = Pipeline([('select', DataFrameSelector(['median_house_value'])),
                             #('imputer', SimpleImputer(strategy = 'median')),
                             #('standardisation', StandardScaler())
                             ])

full_pipeline = FeatureUnion(transformer_list = 
                            [('numeric', num_pipe),
                            ('categor', cat_pipe),
                            ('house_value', house_value_pipe)
                            ])

full_pipeline_predict = FeatureUnion(transformer_list = 
                            [('numeric', num_pipe),
                            ('categoric', cat_pipe)])


# 5) Préparation des données

In [131]:
Train_prepared_scp = full_pipeline.fit_transform(Train) # <scipy.csr_matrix>
Test_prepared_scp = full_pipeline_predict.fit_transform(Test) # <scipy.csr_matrix>

Train_prepared = pd.DataFrame(Train_prepared_scp.todense(), columns = total_attr) 
X = Train_prepared.drop('median_house_value', axis = 1) # Données d'entrainement
y = Train_prepared['median_house_value'].copy() # Etiquettes pour l'entrainement

Test_prepared = pd.DataFrame(Test_prepared_scp.todense()) # Données de test

# 6) Choix du modèle

## a) Fonctions d'évaluation

In [36]:
def RMSE(x, y, titre = 'un modèle', p = 2, aff = True):
    if p == 0:
        score = max(x - y)
    else:
        somme = 0
        for i in range(len(x)):
            somme += abs(x[i] - y[i]) ** p
        score = (somme / len(x)) ** (1/p)
    if aff: print('\nScore pour', titre, ': ', score)
    return score

def crossValid(model, X, y, titre = 'un modèle', nbSplit = 10):
    brut_scores = cross_val_score(model, X, y, 
                                scoring = 'neg_mean_squared_error', cv = nbSplit)
    scores = np.sqrt(- brut_scores)  

    print('\n\nMoyenne des scores pour {}: {}\nEcart-type: {}\n'.format(titre, scores.mean(), scores.std()))

def crossValid(model, X, y, NBSPLITS = 10):

    skfolds = StratifiedKFold(n_splits=NBSPLITS)
    scoreRMSE = 0

    for train_index, test_index in skfolds.split(X, y): 
        clone_clf = clone(model)
        
        X_folds = X.iloc[train_index]
        y_folds = y.iloc[train_index]
        
        X_test = X.iloc[test_index] 
        y_test = y.iloc[test_index]
        
        clone_clf.fit(X.iloc[train_index], y.iloc[train_index])
        
        y_pred = clone_clf.predict(X.iloc[test_index])
        scoreRMSE += RMSE(y_pred, y.iloc[test_index])
        
    print(scoreRMSE / NBSPLITS)

## b) Déclaration des modèles et entrainement

In [37]:
lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor()
forest_reg = RandomForestRegressor()
svr_reg = SVR(kernel = 'poly', degree = 2, C = 100, epsilon = 0.9)

In [143]:
lin_reg.fit(X, y)
prediction = lin_reg.predict(Test_prepared)
RMSE(x = prediction, y = np.array(Test_labels))
for k in range(10):
    print(prediction[k], list(Test_labels)[k])


Score pour un modèle :  66975.77789893825
425823.8036928534 500001.0
294754.1003758279 162500.0
244271.88797501213 204600.0
194552.9929249805 159700.0
264550.73373103497 184000.0
220451.95071436686 151900.0
157819.46463151128 104900.0
371200.6556060973 500001.0
287998.178559611 367400.0
229402.0351829362 346500.0




In [48]:
print(type(prediction))
print(type(np.array(Test_labels)))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [49]:
RMSE(x = prediction, y = np.array(Test_labels))


Score pour un modèle :  66975.77789893825


66975.77789893825