# Trees in Paris

Let's first check that we are using a virtual environment:

In [68]:
import os

# Stop early unless the VIRTUAL_ENV environment variable is set.
assert 'VIRTUAL_ENV' in os.environ, "Virtual environment is not activated. Please run this notebook after activation"
print('Virtual environment is activated')

Virtual environment is activated


In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

Now we can import the data from the CSV file and have a first look at it:

In [69]:
# Load the semicolon-separated tree inventory, report its dimensions and
# preview the first rows.
arbres = pd.read_csv('p2-arbres-fr.csv', sep=";")
shape = arbres.shape
print(f'\nDataframe contains {shape[0]} rows and {shape[1]} columns.\n\n')
arbres.head(n=5)


Dataframe contains 200137 rows and 18 columns.




Unnamed: 0,id,type_emplacement,domanialite,arrondissement,complement_addresse,numero,lieu,id_emplacement,libelle_francais,genre,espece,variete,circonference_cm,hauteur_m,stade_developpement,remarquable,geo_point_2d_a,geo_point_2d_b
0,99874,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,19,Marronnier,Aesculus,hippocastanum,,20,5,,0.0,48.85762,2.320962
1,99875,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,20,If,Taxus,baccata,,65,8,A,,48.857656,2.321031
2,99876,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,21,If,Taxus,baccata,,90,10,A,,48.857705,2.321061
3,99877,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,22,Erable,Acer,negundo,,60,8,A,,48.857722,2.321006
4,99878,Arbre,Jardin,PARIS 17E ARRDT,,,PARC CLICHY-BATIGNOLLES-MARTIN LUTHER KING,000G0037,Arbre à miel,Tetradium,daniellii,,38,0,,,48.890435,2.315289


Each tree already has an identifier (the `id` column), so we can use it as the DataFrame index:

In [70]:
# Use the tree identifier as the row index. The guard keeps this cell
# re-runnable: once 'id' has become the index it is no longer a column, so an
# unconditional set_index('id') would raise a KeyError on a second execution.
if arbres.index.name != 'id':
    arbres = arbres.set_index('id')

In [71]:
# Preview the first five rows to confirm that 'id' is now the index.
arbres.head(n=5)

Unnamed: 0_level_0,type_emplacement,domanialite,arrondissement,complement_addresse,numero,lieu,id_emplacement,libelle_francais,genre,espece,variete,circonference_cm,hauteur_m,stade_developpement,remarquable,geo_point_2d_a,geo_point_2d_b
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
99874,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,19,Marronnier,Aesculus,hippocastanum,,20,5,,0.0,48.85762,2.320962
99875,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,20,If,Taxus,baccata,,65,8,A,,48.857656,2.321031
99876,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,21,If,Taxus,baccata,,90,10,A,,48.857705,2.321061
99877,Arbre,Jardin,PARIS 7E ARRDT,,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,22,Erable,Acer,negundo,,60,8,A,,48.857722,2.321006
99878,Arbre,Jardin,PARIS 17E ARRDT,,,PARC CLICHY-BATIGNOLLES-MARTIN LUTHER KING,000G0037,Arbre à miel,Tetradium,daniellii,,38,0,,,48.890435,2.315289


Let's go further and check the kind (and number) of values we have:

In [73]:
# For every column, print how many distinct values it holds (NaN counts as a
# value) followed by a preview of at most 25 of them.
for column in arbres.columns:
    # Compute the distinct values once and reuse them for both the count and
    # the preview instead of scanning the column a second time.
    unique_values = arbres[column].unique()
    print(f'{column} ({len(unique_values)} values (first 25 displayed)):\n{unique_values[:25]}\n')

type_emplacement (1 values (first 25 displayed)):
['Arbre']

domanialite (10 values (first 25 displayed)):
['Jardin' 'Alignement' 'DJS' 'DFPE' 'CIMETIERE' 'DASCO' 'DAC'
 'PERIPHERIQUE' 'DASES' nan]

arrondissement (25 values (first 25 displayed)):
['PARIS 7E ARRDT' 'PARIS 17E ARRDT' 'PARIS 16E ARRDT' 'PARIS 4E ARRDT'
 'PARIS 13E ARRDT' 'PARIS 12E ARRDT' 'PARIS 19E ARRDT' 'PARIS 14E ARRDT'
 'PARIS 15E ARRDT' 'PARIS 3E ARRDT' 'PARIS 20E ARRDT' 'PARIS 18E ARRDT'
 'PARIS 6E ARRDT' 'PARIS 11E ARRDT' 'PARIS 1ER ARRDT' 'PARIS 2E ARRDT'
 'PARIS 5E ARRDT' 'VAL-DE-MARNE' 'SEINE-SAINT-DENIS' 'HAUTS-DE-SEINE'
 'PARIS 9E ARRDT' 'PARIS 10E ARRDT' 'PARIS 8E ARRDT' 'BOIS DE BOULOGNE'
 'BOIS DE VINCENNES']

complement_addresse (3796 values (first 25 displayed)):
[nan 'c 12' '12-36' '12-35' 'face 64 Manin' '19-02' '19-06' '19-07'
 '19-09' '19-08' '19-10' '82 à 90' '19-04' '19-16' '19-12' '19-11' '19-03'
 '13-04' '13-01' '13-03' '07-09' '07-06' '07-05' '07-04' '70 à 80']

numero (1 values (first 25 displ

We notice that the "type_emplacement" column has the same value ("Arbre") in every row, and that the "numero" column has no values at all, which makes these fields useless for the analysis. So we can drop them.

In [88]:
# Drop the two columns that carry no information for the analysis.
# Re-binding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises a KeyError because the columns are already gone.
# NOTE: errors='ignore' also silences a misspelled column name, so keep the
# list below in sync with the actual column names.
arbres = arbres.drop(columns=['type_emplacement', 'numero'], errors='ignore')
arbres.head()

Unnamed: 0_level_0,domanialite,arrondissement,complement_addresse,lieu,id_emplacement,libelle_francais,genre,espece,variete,circonference_cm,hauteur_m,stade_developpement,remarquable,geo_point_2d_a,geo_point_2d_b
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
99874,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,19,Marronnier,Aesculus,hippocastanum,,20,5,,0.0,48.85762,2.320962
99875,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,20,If,Taxus,baccata,,65,8,A,,48.857656,2.321031
99876,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,21,If,Taxus,baccata,,90,10,A,,48.857705,2.321061
99877,Jardin,PARIS 7E ARRDT,,MAIRIE DU 7E 116 RUE DE GRENELLE PARIS 7E,22,Erable,Acer,negundo,,60,8,A,,48.857722,2.321006
99878,Jardin,PARIS 17E ARRDT,,PARC CLICHY-BATIGNOLLES-MARTIN LUTHER KING,000G0037,Arbre à miel,Tetradium,daniellii,,38,0,,,48.890435,2.315289


In [89]:
# Summary statistics for every column: count/unique/top/freq for the text
# columns and mean/std/quantiles for the numeric ones.
arbres.describe(include='all')

Unnamed: 0,domanialite,arrondissement,complement_addresse,lieu,id_emplacement,libelle_francais,genre,espece,variete,circonference_cm,hauteur_m,stade_developpement,remarquable,geo_point_2d_a,geo_point_2d_b
count,200136,200137,30902,200137,200137.0,198640,200121,198385,36777,200137.0,200137.0,132932,137039.0,200137.0,200137.0
unique,9,25,3795,6921,69040.0,192,175,539,436,,,4,,,
top,Alignement,PARIS 15E ARRDT,SN°,PARC FLORAL DE PARIS / ROUTE DE LA PYRAMIDE,101001.0,Platane,Platanus,x hispanica,Baumannii',,,A,,,
freq,104949,17151,557,2995,1324.0,42508,42591,36409,4538,,,64438,,,
mean,,,,,,,,,,83.380479,13.110509,,0.001343,48.854491,2.348208
std,,,,,,,,,,673.190213,1971.217387,,0.036618,0.030234,0.05122
min,,,,,,,,,,0.0,0.0,,0.0,48.74229,2.210241
25%,,,,,,,,,,30.0,5.0,,0.0,48.835021,2.30753
50%,,,,,,,,,,70.0,8.0,,0.0,48.854162,2.351095
75%,,,,,,,,,,115.0,12.0,,0.0,48.876447,2.386838


## TODO:
- process NaN values
- plot boxplots and find outliers on the dimensions (`circonference_cm`, `hauteur_m`)
- find the most common / rarest varieties
- scatter plot of `circonference_cm` vs. `hauteur_m` (after normalization)
- are there clusters? Can they be related to species?
- correlation matrix to detect correlations (only for numerical data)
- Folium map?

In [None]:
# Standardise the trunk circumference (zero mean, unit variance).
# StandardScaler expects 2-D input of shape (n_samples, n_features), so the
# column is converted to an (n, 1) NumPy array first; a pandas Series has no
# `reshape` method of its own.
circonference = arbres['circonference_cm'].to_numpy().reshape(-1, 1)
# Height is not scaled here; it is only extracted for later use.
hauteur = arbres['hauteur_m']
scaler = preprocessing.StandardScaler()
# `fit` alone only learns the mean/std and returns the scaler itself;
# `fit_transform` learns them and returns the standardised values.
scaled_circonference = scaler.fit_transform(circonference)
