# RF + OHE

## Import

In [7]:
import pandas as pd
import numpy as np

# Load the dataset from the CSV file into a DataFrame.
data = pd.read_csv('csv_correction.csv')

## Préparation des données

In [8]:
from sklearn import preprocessing, feature_selection, model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import cross_val_predict, cross_val_score
import time
from imblearn.over_sampling import SMOTE


In [9]:
# Index labels of the rows whose `fk_arb_etat` is one of these four values;
# the cell ends with the index itself so it is displayed for inspection.
excluded_states = ['ABATTU', 'EN PLACE', 'REMPLACÉ', 'SUPPRIMÉ']
excluded_mask = data['fk_arb_etat'].isin(excluded_states)
indexDrop = data.loc[excluded_mask].index
indexDrop

Index([   0,    1,    2,    3,    5,    6,    7,    8,    9,   10,
       ...
       7399, 7400, 7401, 7402, 7403, 7404, 7405, 7406, 7407, 7408],
      dtype='int64', length=7217)

In [10]:
# Remove the rows selected in `indexDrop`, keeping every other row.
data = data.drop(index=indexDrop)

In [11]:
# Count how many of the remaining rows carry each value of `fk_arb_etat`.
data["fk_arb_etat"].value_counts()

fk_arb_etat
Essouché        165
Non essouché     27
Name: count, dtype: int64

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# One-hot encoder that ignores categories unseen at fit time and returns a dense pandas DataFrame.
# NOTE(review): in the visible cells this instance is never used as-is; the per-column
# loop further down rebinds `ohe` to a fresh encoder on every iteration.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

In [14]:
# Columns that get one-hot encoded (one indicator column per category).
categorical_data_cols = [
    'clc_quartier',
    'clc_secteur',
    'fk_stadedev',
    'fk_port',
    'fk_pied',
    'fk_situation',
    'fk_nomtech',
    'villeca',
    'feuillage',
]

# Columns recoded to 0/1, where "Oui" maps to 1 and any other value to 0.
boolean_cols = [
    'fk_revetement',
    'remarquable',
]

In [15]:
# Fit a dedicated one-hot encoder for every categorical column, store it under
# the column name, and append the resulting indicator columns to `data`.
dict_encoders = {}
for col in categorical_data_cols:
    single_column = data[[col]]
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')
    # Fitting and transforming the same frame in a single call.
    ohetransform = ohe.fit_transform(single_column)
    dict_encoders[col] = ohe
    print(type(dict_encoders[col]))
    print(col)
    # The encoded frame shares the index of `data`, so the join aligns row by row.
    data = data.join(ohetransform)

# The raw categorical columns are now redundant with their indicator columns.
data = data.drop(columns=categorical_data_cols)
data

<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
clc_quartier
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
clc_secteur
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
fk_stadedev
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
fk_port
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
fk_pied
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
fk_situation
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
fk_nomtech
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
villeca
<class 'sklearn.preprocessing._encoders.OneHotEncoder'>
feuillage


Unnamed: 0,longitude,latitude,haut_tot,haut_tronc,tronc_diam,fk_arb_etat,fk_revetement,age_estim,fk_prec_estim,clc_nbr_diag,...,fk_nomtech_ROBPSE,fk_nomtech_SALBAB,fk_nomtech_SORAUC,fk_nomtech_TAXDIS,fk_nomtech_TILCOR,fk_nomtech_TILTOM,villeca_CASQ,villeca_VILLE,feuillage_Conifère,feuillage_Feuillu
4,3.304047,49.858446,5.0,2.0,170.0,Essouché,Non,40.0,2.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
73,3.270746,49.844440,12.0,3.0,160.0,Essouché,Non,50.0,10.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
76,3.270951,49.844588,13.0,4.0,155.0,Essouché,Non,50.0,10.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
92,3.294703,49.849927,24.0,9.0,245.0,Essouché,Non,80.0,20.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
104,3.294280,49.849615,20.0,14.0,180.0,Essouché,Non,80.0,20.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6886,3.296399,49.848276,9.0,3.0,85.0,Essouché,Non,25.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6887,3.296366,49.848250,9.0,3.0,86.0,Essouché,Non,25.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6888,3.296433,49.848229,9.0,4.0,98.0,Essouché,Non,25.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
6977,3.265997,49.836176,5.0,2.0,25.0,Non essouché,Non,10.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [16]:
# Recode the yes/no columns as 0/1 integers: "Oui" becomes 1 and every other
# value (including missing ones) becomes 0.
for col in boolean_cols:
    # Re-run guard: once a column has been recoded it holds integers, and
    # comparing those to "Oui" a second time would silently turn every value
    # into 0. Skip columns that are already integer-typed.
    if pd.api.types.is_integer_dtype(data[col]):
        continue
    # Vectorized comparison instead of a Python loop over each cell; the list
    # form is kept so the printed output matches the element-by-element version.
    new_col = data[col].eq("Oui").astype("int64").tolist()
    print(col)
    print(new_col)
    data[col] = new_col

fk_revetement
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
remarquable
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Split

In [17]:
# Hold out 20 % of the rows for testing. The split is stratified on the target
# so both parts keep similar class proportions, and seeded so it is repeatable.
target = data["fk_arb_etat"]
features = data.drop(columns=["fk_arb_etat"])
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

## Fit

In [18]:
# Random forest with default hyperparameters; the seed is fixed so the fit is
# repeatable, and n_jobs=-1 lets scikit-learn use all available CPU cores.
# NOTE(review): no class weighting or resampling is applied here, although the
# recorded target counts are 165 vs 27 and SMOTE is imported but not used in
# the visible cells.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

## Validation

In [19]:
# Predict on the very rows the model was fitted on; this checks the fit itself
# and says nothing about performance on unseen data.
predicted_train = rf.predict(X_train)

In [20]:
# normalize="true" divides each row (true class) by its total, so the diagonal
# reads as per-class recall on the training rows.
confusion_matrix(y_train, predicted_train, normalize="true")

array([[1., 0.],
       [0., 1.]])

## Tests

In [21]:
# Evaluate on the held-out split: predict, then show the confusion matrix with
# each row (true class) normalized to sum to 1, i.e. per-class recall.
y_test_predicted = rf.predict(X_test)
confusion_matrix(y_test, y_test_predicted, normalize="true")

array([[1. , 0. ],
       [0.6, 0.4]])