In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

# sklearn imputers
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# sklearn models
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


In [2]:
path = '../data/modified/trees_first.pkl'
trees= pd.read_pickle(path)

In [3]:
trees = trees.astype({'ADR_SECTEUR':'object'})

trees_train,trees_test = train_test_split(trees,random_state=2708)

# Transform data

## Preprocessing

In [18]:
# Transforme GEOJSON data in two features
class TransformGEOJSON(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self, df):
        return self
    def transform(self, df):
        df_new = df.copy()
        pat = '([0-9]+.[0-9]+)'
        df_new[['latitude','longitude']] = df_new.GeoJSON.str.extractall(pat).unstack(level=1).astype('float64')
        df_new = df_new.drop('GeoJSON',axis=1)
        return df_new

# Drop features without info (1) or identifiers features
class DropuniqueCol(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self, df):
        return self
    def transform(self, df):
        to_drop = ['ELEM_POINT_ID','CODE','NOM','GENRE','GENRE_DESC','CATEGORIE','CATEGORIE_DESC','BIEN_REFERENCE','SOUS_CATEGORIE','CODE_PARENT']
        df_new = df.drop(to_drop,axis=1)
        return df_new

# Drop Na features     
class DropNaWithSaveCol(BaseEstimator,TransformerMixin):
    def __init__(self,na_threshold=0.5):
        self.threshold = na_threshold
    def fit(self, df):
        na = df.isna().sum().div(len(df)).sort_values(ascending=False)
        self.col_na = na.where(na>=self.threshold).dropna().index
        return self
    def transform(self, df):
        check_is_fitted(self, ['col_na'])
        df_new= df.drop(self.col_na,axis=1)
        return df_new

class CustomImputer(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,df):
        self.collectivite = df.collectivite. 
        self.genre =
        self.espece =
        return self
    def transform(self,df):
        check_is_fitted(self,['collectivite','genre','espece'])
        df_new = df.copy()
        return df_new
    
preprocess_pipeline = Pipeline([("geo", TransformGEOJSON()),
                                ('dropuni',DropuniqueCol()),
                                ('na',DropNaWithSaveCol())
                               ])

SyntaxError: invalid syntax (3647245263.py, line 42)

In [21]:
trees.COLLECTIVITE.value_counts()

Ville de Grenoble           15651
Grenoble Alpes Métropole    14607
Name: COLLECTIVITE, dtype: int64

## Feature add

In [7]:
# Create Bio features

class Bio(BaseEstimator,TransformerMixin):
    def __init__(self,to_use=True):
        self.to_use = to_use
    def fit(self, df,y=None):
        return self
    def transform(self,df):
        if self.to_use:
            df_new = df.copy()
            df_new['ESPECE'] = df_new['ESPECE'].map(str).replace('nan',' ')
            df_new['GENRE_BOTA'] = df_new['GENRE_BOTA'].map(str).replace('nan',' ')
            df_new['BIO'] = df_new['GENRE_BOTA'] + " " + df['ESPECE']
            df_new = df_new.drop(['GENRE_BOTA','ESPECE'],axis=1)
            return df_new
        else:
            return df
feature_adder = Pipeline(['Bio',Bio()])

## Transform Train

In [8]:
process_train_trees = preprocess_pipeline.fit_transform(trees_train)
annee_train = process_train_trees.ANNEEDEPLANTATION
predictors_train = process_train_trees.drop('ANNEEDEPLANTATION',axis=1)
process_train_trees

Unnamed: 0,SOUS_CATEGORIE_DESC,CODE_PARENT_DESC,ADR_SECTEUR,GENRE_BOTA,ESPECE,STADEDEDEVELOPPEMENT,ANNEEDEPLANTATION,COLLECTIVITE,latitude,longitude
9601,Arbre d'espaces ouverts,Pc Savane local pétanque,3,Acer,platanoides,Arbre adulte,1981.0,Ville de Grenoble,5.706195,45.175643
1784,Arbre de voirie,Pl André Malraux bord du park,2,Tilia,cordata,Arbre adulte,2000.0,Grenoble Alpes Métropole,5.729218,45.185658
12932,Arbre d'enceintes fermées,Cimetière du Grand Sablon,2,Libocedrus,decurrens,Arbre adulte,1984.0,Ville de Grenoble,5.744626,45.192890
17720,Arbre d'espaces ouverts,Pc Ch Elysés Ouest,3,Celtis,australis,Arbre adulte,2009.0,Ville de Grenoble,5.705889,45.163258
19655,Arbre d'espaces ouverts,Prc Mis Est Palais des sports,5,Paulownia,fortunei,Arbre adulte,2012.0,Ville de Grenoble,5.741748,45.186138
...,...,...,...,...,...,...,...,...,...,...
1751,Arbre d'espaces ouverts,Prc Mis dans vivaces Mairie,5,Cercis,siliquastrum,Arbre adulte,1965.0,Ville de Grenoble,5.736233,45.185974
9389,Arbre d'espaces ouverts,R Arlequin pourtour silo 03,6,Betula,alba,Arbre adulte,1975.0,Ville de Grenoble,5.731841,45.163756
17367,Arbre de voirie,R Capitaine Camine park,3,Platanus,acerifolia,Arbre jeune,1981.0,Grenoble Alpes Métropole,5.713157,45.173429
18855,Arbre d'espaces ouverts,Ch Gordes Est conservatoire,4,Malus,domestica,Arbre adulte,2014.0,Ville de Grenoble,5.737468,45.176643


# Modelisation

In [161]:
categorical_transformer = OneHotEncoder(sparse=False,handle_unknown = 'ignore')
numerical_transformer = StandardScaler()

transformer = ColumnTransformer(
[("num_trans", numerical_transformer, make_column_selector(dtype_exclude="object")),
("cat_trans", categorical_transformer, make_column_selector(dtype_include='object'))])

In [108]:
model_1 = RandomForestRegressor()
param_1 = {'f_adder_1__to_use':[False],
           'model__n_estimators':[100],
           'model':[model_1]} 
model_2 = DecisionTreeRegressor()
param_2 = {'f_adder_1__to_use':[False],
           'model__min_samples_leaf':[1],
           'model':[model_2]} 
model_3 = SVR()
param_3 = {'f_adder_1__to_use':[False],
           'model__C':[50],
           'model':[model_3]} 


In [109]:
pipe = Pipeline(steps=[('f_adder_1',Bio()),
                       ("transformer",transformer),
                       ("model",model_1)])
params_list = [param_1,param_2,param_3]

grid_search = GridSearchCV(pipe,params_list,cv=10,scoring='neg_mean_squared_error',n_jobs=-1,refit=True)

In [110]:
grid_search.fit(predictors_train,annee_train)
grid_results = pd.DataFrame(grid_search.cv_results_)
grid_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_f_adder_1__to_use,param_model,param_model__n_estimators,param_model__min_samples_leaf,param_model__C,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,195.411849,17.73415,2.352733,0.528478,False,SVR(C=50),,,50.0,"{'f_adder_1__to_use': False, 'model': SVR(C=50...",...,-25.177822,-28.565861,-25.970055,-25.607326,-38.402479,-26.404696,-29.047023,-28.740047,5.096853,1
0,109.76804,1.548148,0.131678,0.009064,False,RandomForestRegressor(),100.0,,,"{'f_adder_1__to_use': False, 'model': RandomFo...",...,-28.860884,-29.711021,-26.193734,-26.25708,-37.915292,-32.997449,-31.560704,-29.744795,4.008415,2
1,1.853297,0.036983,0.036331,0.004202,False,DecisionTreeRegressor(),,1.0,,"{'f_adder_1__to_use': False, 'model': Decision...",...,-51.001763,-37.569414,-43.69502,-36.338034,-54.422212,-43.369766,-43.068753,-42.819272,6.603621,3


In [111]:
best_model = grid_search.best_params_

In [112]:
best_pipe = grid_search.best_estimator_

In [113]:
best_pipe.score(predictors_train,annee_train)

0.9535638547255602

# Test of best model on test dataset

In [114]:
process_test_trees = preprocess_pipeline.transform(trees_test)
annee_test = process_test_trees.ANNEEDEPLANTATION
predictors_test = process_test_trees.drop('ANNEEDEPLANTATION',axis=1)

In [115]:
annee_predicted = best_pipe.predict(predictors_test)
mean_squared_error(annee_test,annee_predicted)

27.400448173113237

In [116]:
best_pipe.score(predictors_test,annee_test)

0.9151427188940645

# Prediction on na values

In [10]:
path_first_na = '../data/modified/trees_first_na.pkl'
trees_na= pd.read_pickle(path_first_na)

In [11]:
trees_na

Unnamed: 0,ELEM_POINT_ID,CODE,NOM,GENRE,GENRE_DESC,CATEGORIE,CATEGORIE_DESC,SOUS_CATEGORIE,SOUS_CATEGORIE_DESC,CODE_PARENT,...,COURRIER,IDENTIFIANTPLU,TYPEIMPLANTATIONPLU,INTITULEPROTECTIONPLU,ANNEEABATTAGE,ESSOUCHEMENT,DIAMETREARBRE,CAUSEABATTAGE,COLLECTIVITE,GeoJSON
1075,39588,ESP38596,ESP38596,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP1185,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7033870348283..."
1076,39589,ESP38597,ESP38597,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP1185,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7034657665144..."
1077,39590,ESP38598,ESP38598,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP108,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7199743332099..."
1088,43402,ESP39987,ESP39987,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP273,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7139485923092..."
1089,43403,ESP39988,ESP39988,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP273,...,,,,,,,,,Ville de Grenoble,"{""type"":""Point"",""coordinates"":[5.7138453760140..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31603,46019,ESP41553,ESP41553,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP360,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7358259784285..."
31604,46020,ESP41554,ESP41554,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP360,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7356836831762..."
31605,46021,ESP41555,ESP41555,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP593,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7362096829728..."
31606,46022,ESP41556,ESP41556,VEG,VEGETATION,ESP01,Arbre,ESP174,Arbre d'espaces ouverts,ESP1034,...,,,,,,,,,,"{""type"":""Point"",""coordinates"":[5.7355397923023..."


In [12]:
trees_na.isna().sum()

ELEM_POINT_ID               0
CODE                        0
NOM                         0
GENRE                       0
GENRE_DESC                  0
CATEGORIE                   0
CATEGORIE_DESC              0
SOUS_CATEGORIE              0
SOUS_CATEGORIE_DESC         0
CODE_PARENT                 0
CODE_PARENT_DESC            0
ADR_SECTEUR                 0
BIEN_REFERENCE              0
GENRE_BOTA                767
ESPECE                    946
VARIETE                  1331
STADEDEDEVELOPPEMENT     1034
EQUIPE                   1352
REMARQUES                1272
ANNEEDEPLANTATION        1352
RAISONDEPLANTATION       1324
TRAITEMENTCHENILLES      1352
COURRIER                 1352
IDENTIFIANTPLU           1352
TYPEIMPLANTATIONPLU      1352
INTITULEPROTECTIONPLU    1352
ANNEEABATTAGE            1352
ESSOUCHEMENT             1352
DIAMETREARBRE            1352
CAUSEABATTAGE            1352
COLLECTIVITE              471
GeoJSON                     0
dtype: int64

In [13]:
trees_na = trees_na.astype({'ADR_SECTEUR':'object'})

In [14]:
trees_na = trees_na.drop(['ANNEEDEPLANTATION'],axis=1)

In [15]:
processed_na = preprocess_pipeline.transform(trees_na)

In [16]:
processed_na

Unnamed: 0,SOUS_CATEGORIE_DESC,CODE_PARENT_DESC,ADR_SECTEUR,GENRE_BOTA,ESPECE,STADEDEDEVELOPPEMENT,COLLECTIVITE,latitude,longitude
1075,Arbre d'espaces ouverts,Ch 3 maisonsi th de création,1,Zelkova,serrata,Arbre adulte,Ville de Grenoble,5.703387,45.189097
1076,Arbre d'espaces ouverts,Ch 3 maisonsi th de création,1,Zelkova,serrata,Arbre adulte,Ville de Grenoble,5.703466,45.189104
1077,Arbre d'espaces ouverts,Jard des Dauphins le labo,2,Maclura,pomifera,Arbre adulte,Ville de Grenoble,5.719974,45.194390
1088,Arbre d'espaces ouverts,Parc Paul Valérien Perrin,1,,,Arbre jeune,Ville de Grenoble,5.713949,45.182986
1089,Arbre d'espaces ouverts,Parc Paul Valérien Perrin,1,,,Arbre jeune,Ville de Grenoble,5.713845,45.183063
...,...,...,...,...,...,...,...,...,...
31603,Arbre d'espaces ouverts,Parc 2 R Herminier centre,2,,,,,5.735826,45.195394
31604,Arbre d'espaces ouverts,Parc 2 R Herminier centre,2,,,,,5.735684,45.195362
31605,Arbre d'espaces ouverts,Parc 2 R Herminier ch de Ronde,2,,,,,5.736210,45.195236
31606,Arbre d'espaces ouverts,Parc 2 R Hermin M Leclerq,2,,,,,5.735540,45.195325


In [17]:
processed_na.isna().sum().div(len(processed_na))

SOUS_CATEGORIE_DESC     0.000000
CODE_PARENT_DESC        0.000000
ADR_SECTEUR             0.000000
GENRE_BOTA              0.567308
ESPECE                  0.699704
STADEDEDEVELOPPEMENT    0.764793
COLLECTIVITE            0.348373
latitude                0.000000
longitude               0.000000
dtype: float64

In [226]:
best_pipe.predict(processed_na)

array([2002.97141179, 2002.97083192, 1990.38832261, 1998.38222781,
       1998.36969482, 1998.35828428, 1998.34011239, 1998.32908124,
       1989.22163289, 1989.21988349, 1989.224988  , 1989.22210269,
       1987.00851556, 1994.40811958, 1998.06630603, 1994.15017736,
       1994.9311912 , 1994.15094172, 1994.95365312, 1994.96317821,
       1994.97115177, 1994.952429  , 1998.23820148, 1987.11648051,
       1995.04784752, 1995.03981651, 1995.03883634, 1980.63657987,
       1993.97321879, 1993.98242768, 1993.92787515, 1992.12250182,
       2021.81974744, 2022.25973052, 2022.2682545 , 2022.22691321,
       2014.11699154, 2023.2393485 , 2022.690725  ])