In [200]:
import pandas as pd
import json
import matplotlib.pyplot as plt
# import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

# sklearn models
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [201]:
path = '../data/modified/trees_first.pkl'
trees= pd.read_pickle(path)

In [202]:
trees = trees.astype({'ADR_SECTEUR':'object'})

trees_train,trees_test = train_test_split(trees,random_state=2708)

# Transform data

## Preprocessing

In [236]:
# Transforme GEOJSON data in two features
class TransformGEOJSON(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self, df):
        return self
    def transform(self, df):
        jsons = pd.json_normalize(df.GeoJSON.apply(json.loads))
        jsons['GeoJSON'] = df['GeoJSON']
        jsons[['longitude','latidude']] = pd.DataFrame(jsons['coordinates'].to_list(),columns=['longitude','latitude'])
        jsons = jsons.drop(['type','coordinates'],axis=1)
        df_new = df.merge(jsons,on='GeoJSON').drop('GeoJSON',axis=1)
        return df_new

# Drop features without info (1) or identifiers features
class DropuniqueCol(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self, df):
        return self
    def transform(self, df):
        to_drop = ['ELEM_POINT_ID','CODE','NOM','GENRE','GENRE_DESC','CATEGORIE','CATEGORIE_DESC','BIEN_REFERENCE','SOUS_CATEGORIE','CODE_PARENT']
        df_new = df.drop(to_drop,axis=1)
        return df_new

# Drop Na features     
class DropNaWithSaveCol(BaseEstimator,TransformerMixin):
    def __init__(self,na_threshold=0.5):
        self.threshold = na_threshold
    def fit(self, df):
        na = df.isna().sum().div(len(df)).sort_values(ascending=False)
        self.col_na = na.where(na>=self.threshold).dropna().index
        return self
    def transform(self, df):
        check_is_fitted(self, ['col_na'])
        df_new= df.drop(self.col_na,axis=1)
        return df_new

preprocess_pipeline = Pipeline([("geo", TransformGEOJSON()),
                                ('dropuni',DropuniqueCol()),
                                ('na',DropNaWithSaveCol())])


## Feature add

In [204]:
# Create Bio features

class Bio(BaseEstimator,TransformerMixin):
    def __init__(self,to_use=True):
        self.to_use = to_use
    def fit(self, df,y=None):
        return self
    def transform(self,df):
        if self.to_use:
            df_new = df.copy()
            df_new['ESPECE'] = df_new['ESPECE'].map(str).replace('nan',' ')
            df_new['GENRE_BOTA'] = df_new['GENRE_BOTA'].map(str).replace('nan',' ')
            df_new['BIO'] = df_new['GENRE_BOTA'] + " " + df['ESPECE']
            df_new = df_new.drop(['GENRE_BOTA','ESPECE'],axis=1)
            return df_new
        else:
            return df

feature_adder = Pipeline(['Bio',Bio()])

## Transform Train

In [205]:
process_train_trees = preprocess_pipeline.fit_transform(trees_train)
annee_train = process_train_trees.ANNEEDEPLANTATION
predictors_train = process_train_trees.drop('ANNEEDEPLANTATION',axis=1)
process_train_trees

Unnamed: 0,SOUS_CATEGORIE_DESC,CODE_PARENT_DESC,ADR_SECTEUR,GENRE_BOTA,ESPECE,STADEDEDEVELOPPEMENT,ANNEEDEPLANTATION,COLLECTIVITE,longitude,latidude
0,Arbre d'espaces ouverts,Pc Savane local pétanque,3,Acer,platanoides,Arbre adulte,1981.0,Ville de Grenoble,5.727096,45.165976
1,Arbre de voirie,Pl André Malraux bord du park,2,Tilia,cordata,Arbre adulte,2000.0,Grenoble Alpes Métropole,5.735384,45.171467
2,Arbre d'enceintes fermées,Cimetière du Grand Sablon,2,Libocedrus,decurrens,Arbre adulte,1984.0,Ville de Grenoble,5.736978,45.185366
3,Arbre d'espaces ouverts,Pc Ch Elysés Ouest,3,Celtis,australis,Arbre adulte,2009.0,Ville de Grenoble,5.719141,45.160322
4,Arbre d'espaces ouverts,Prc Mis Est Palais des sports,5,Paulownia,fortunei,Arbre adulte,2012.0,Ville de Grenoble,5.736713,45.185557
...,...,...,...,...,...,...,...,...,...,...
16727,Arbre d'espaces ouverts,Prc Mis dans vivaces Mairie,5,Cercis,siliquastrum,Arbre adulte,1965.0,Ville de Grenoble,5.727376,45.190731
16728,Arbre d'espaces ouverts,R Arlequin pourtour silo 03,6,Betula,alba,Arbre adulte,1975.0,Ville de Grenoble,5.722941,45.164626
16729,Arbre de voirie,R Capitaine Camine park,3,Platanus,acerifolia,Arbre jeune,1981.0,Grenoble Alpes Métropole,5.744174,45.175887
16730,Arbre d'espaces ouverts,Ch Gordes Est conservatoire,4,Malus,domestica,Arbre adulte,2014.0,Ville de Grenoble,5.738849,45.167305


# Modelisation

In [206]:
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')
numerical_transformer = StandardScaler()

transformer = ColumnTransformer(
[("num_trans", numerical_transformer, make_column_selector(dtype_exclude="object")),
("cat_trans", categorical_transformer, make_column_selector(dtype_include='object'))])

In [210]:
model_1 = RandomForestRegressor()
param_1 = {'f_adder_1__to_use':[False],
           'model__n_estimators':[100],
           'model':[model_1]} 
model_2 = DecisionTreeRegressor()
param_2 = {'f_adder_1__to_use':[False],
           'model__min_samples_leaf':[1],
           'model':[model_2]} 
model_3 = SVR()
param_3 = {'f_adder_1__to_use':[False],
           'model__C':[50],
           'model':[model_3]} 


In [211]:
pipe = Pipeline(steps=[('f_adder_1',Bio()),
                       ("transformer",transformer),
                       ("model",model_1)])
params_list = [param_1,param_2,param_3]

grid_search = GridSearchCV(pipe,params_list,cv=10,scoring='neg_mean_squared_error',n_jobs=-1,refit=True)

In [212]:
grid_search.fit(predictors_train,annee_train)
grid_results = pd.DataFrame(grid_search.cv_results_)
grid_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_f_adder_1__to_use,param_model,param_model__n_estimators,param_model__min_samples_leaf,param_model__C,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,299.013554,41.54678,1.140928,0.274221,False,SVR(C=50),,,50.0,"{'f_adder_1__to_use': False, 'model': SVR(C=50...",...,-30.46815,-36.99714,-32.980925,-34.126169,-47.229215,-38.069096,-38.149268,-36.196979,4.580032,1
0,60.54304,0.798963,0.063508,0.001211,False,RandomForestRegressor(),100.0,,,"{'f_adder_1__to_use': False, 'model': RandomFo...",...,-37.448911,-39.241409,-37.496902,-35.41591,-58.553084,-49.091066,-44.431133,-41.489019,7.54319,2
1,0.850804,0.014715,0.01607,0.000389,False,DecisionTreeRegressor(),,1.0,,"{'f_adder_1__to_use': False, 'model': Decision...",...,-58.305439,-71.517633,-63.126121,-52.939629,-67.063359,-73.838613,-61.997609,-60.516699,8.955468,3


In [213]:
best_model = grid_search.best_params_

In [214]:
best_pipe = grid_search.best_estimator_

In [215]:
best_pipe.score(predictors_train,annee_train)

0.95529545717458

# Test of best model on test dataset

In [216]:
process_test_trees = preprocess_pipeline.transform(trees_test)
annee_test = process_test_trees.ANNEEDEPLANTATION
predictors_test = process_test_trees.drop('ANNEEDEPLANTATION',axis=1)

In [217]:
annee_predicted = best_pipe.predict(predictors_test)
mean_squared_error(annee_test,annee_predicted)

45.322795761929186

In [218]:
best_pipe.score(predictors_test,annee_test)

0.8430693695903801

# Prediction on na values

In [219]:
path_first_na = '../data/modified/trees_na.pkl'
trees_na= pd.read_pickle(path_first_na)

In [220]:
trees_na

Unnamed: 0,ELEM_POINT_ID,CODE,NOM,GENRE,GENRE_DESC,CATEGORIE,CATEGORIE_DESC,SOUS_CATEGORIE,SOUS_CATEGORIE_DESC,CODE_PARENT,...,COURRIER,IDENTIFIANTPLU,TYPEIMPLANTATIONPLU,INTITULEPROTECTIONPLU,ANNEEABATTAGE,ESSOUCHEMENT,DIAMETREARBRE,CAUSEABATTAGE,COLLECTIVITE,GeoJSON
1075,39588,ESP38596,ESP38596,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP1185,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7033870348283..."
1076,39589,ESP38597,ESP38597,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP1185,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7034657665144..."
1077,39590,ESP38598,ESP38598,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP108,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7199743332099..."
1088,43402,ESP39987,ESP39987,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP273,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7139485923092..."
1089,43403,ESP39988,ESP39988,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP273,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7138453760140..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31603,46019,ESP41553,ESP41553,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP360,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7358259784285..."
31604,46020,ESP41554,ESP41554,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP360,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7356836831762..."
31605,46021,ESP41555,ESP41555,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP593,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7362096829728..."
31606,46022,ESP41556,ESP41556,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP1034,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7355397923023..."


In [232]:
trees_na.isna().sum()

ELEM_POINT_ID               0
CODE                        0
NOM                         0
GENRE                       0
GENRE_DESC                  0
CATEGORIE                   0
CATEGORIE_DESC              0
SOUS_CATEGORIE              0
SOUS_CATEGORIE_DESC         0
CODE_PARENT                 0
CODE_PARENT_DESC            0
ADR_SECTEUR                 0
BIEN_REFERENCE              0
GENRE_BOTA                767
ESPECE                    946
VARIETE                  1331
STADEDEDEVELOPPEMENT     1034
EQUIPE                   1352
REMARQUES                1272
RAISONDEPLANTATION       1324
TRAITEMENTCHENILLES      1352
COURRIER                 1352
IDENTIFIANTPLU           1352
TYPEIMPLANTATIONPLU      1352
INTITULEPROTECTIONPLU    1352
ANNEEABATTAGE            1352
ESSOUCHEMENT             1352
DIAMETREARBRE            1352
CAUSEABATTAGE            1352
COLLECTIVITE              471
GeoJSON                     0
dtype: int64

In [221]:
trees_na = trees_na.astype({'ADR_SECTEUR':'object'})

In [222]:
trees_na = trees_na.drop(['ANNEEDEPLANTATION'],axis=1)

In [225]:
processed_na = preprocess_pipeline.transform(trees_na)

In [230]:
processed_na

Unnamed: 0,SOUS_CATEGORIE_DESC,CODE_PARENT_DESC,ADR_SECTEUR,GENRE_BOTA,ESPECE,STADEDEDEVELOPPEMENT,COLLECTIVITE,longitude,latidude
0,Arbre d'espaces ouverts,Ch 3 maisonsi th de création,1,Zelkova,serrata,Arbre adulte,Ville de Grenoble,5.739586,45.160349
1,Arbre d'espaces ouverts,Ch 3 maisonsi th de création,1,Zelkova,serrata,Arbre adulte,Ville de Grenoble,5.739644,45.160366
2,Arbre d'espaces ouverts,Jard des Dauphins le labo,2,Maclura,pomifera,Arbre adulte,Ville de Grenoble,5.739702,45.160382
3,Arbre d'espaces ouverts,Parc Paul Valérien Perrin,1,,,Arbre jeune,Ville de Grenoble,5.696772,45.208113
4,Arbre d'espaces ouverts,Parc Paul Valérien Perrin,1,,,Arbre jeune,Ville de Grenoble,5.696741,45.208129
5,Arbre d'espaces ouverts,Parc Paul Valérien Perrin,1,,,Arbre jeune,Ville de Grenoble,5.696714,45.20815
6,Arbre d'espaces ouverts,Parc Paul Valérien Perrin,1,,,Arbre jeune,Ville de Grenoble,5.696645,45.208111
7,Arbre d'espaces ouverts,Parc Paul Valérien Perrin,1,,,Arbre jeune,Ville de Grenoble,5.69663,45.208158
8,Arbre d'enceintes fermées,Maison des Collines Jard Poisat,6,Acer,,,Ville de Grenoble,5.736374,45.164117
9,Arbre d'enceintes fermées,Maison des Collines Jard Poisat,6,Acer,,,Ville de Grenoble,5.736473,45.164138


In [229]:
processed_na.isna().sum()

SOUS_CATEGORIE_DESC      0
CODE_PARENT_DESC         0
ADR_SECTEUR              0
GENRE_BOTA              13
ESPECE                  21
STADEDEDEVELOPPEMENT    23
COLLECTIVITE             8
longitude                0
latidude                 0
dtype: int64

In [226]:
best_pipe.predict(processed_na)

array([2002.97141179, 2002.97083192, 1990.38832261, 1998.38222781,
       1998.36969482, 1998.35828428, 1998.34011239, 1998.32908124,
       1989.22163289, 1989.21988349, 1989.224988  , 1989.22210269,
       1987.00851556, 1994.40811958, 1998.06630603, 1994.15017736,
       1994.9311912 , 1994.15094172, 1994.95365312, 1994.96317821,
       1994.97115177, 1994.952429  , 1998.23820148, 1987.11648051,
       1995.04784752, 1995.03981651, 1995.03883634, 1980.63657987,
       1993.97321879, 1993.98242768, 1993.92787515, 1992.12250182,
       2021.81974744, 2022.25973052, 2022.2682545 , 2022.22691321,
       2014.11699154, 2023.2393485 , 2022.690725  ])