In [1]:
import pandas as pd
from datetime import *
import seaborn as sns
from matplotlib.mlab import PCA as mlabPCA
import numpy as np

from matplotlib import pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d

import Soils
soils = Soils.Soils()


%matplotlib inline

In [2]:
# Read the CSV file (utf-8 encoded)
dfsoil = pd.read_csv('DataWithClimat_v2_utf-8_merged_soils.csv', header=0, sep=',', index_col=0, parse_dates=False, encoding='utf-8')
df = pd.read_csv('DataWithClimat_v2_utf-8.csv', header=0, sep=',', index_col=0, parse_dates=False, encoding='utf-8')



# Dictionnaire SICA - Code de profil

In [3]:
profil_dict = dict(zip(dfsoil.SICA_1, dfsoil.Unidad_Cod))
unidad_dict = dict(zip(dfsoil.SICA_1, dfsoil.Unidad_c_1))

In [4]:
# Main dataset SICA access: df['SICA 1']

df['Soil Profile'] = ""
df['Unidad_c_1'] = ""

for index, row in df.iterrows():
    print("SICA: {}  ".format(int(row['SICA 1'])))
    df.set_value(index,'Soil Profile', profil_dict[int(row['SICA 1'])])
    df.set_value(index, 'Unidad_c_1', unidad_dict[int(row['SICA 1'])])

SICA: 6607500996  
SICA: 6607500981  
SICA: 6607501034  
SICA: 6607500081  
SICA: 6607500108  
SICA: 6607500309  
SICA: 6607500014  
SICA: 6607500148  
SICA: 6607500516  
SICA: 6607501187  
SICA: 6607500304  
SICA: 6607500533  
SICA: 6607501128  
SICA: 6607500809  
SICA: 6638301425  
SICA: 6607500288  
SICA: 6607500769  
SICA: 6607500645  
SICA: 6607500010  
SICA: 6607500020  
SICA: 6638300269  
SICA: 6638300654  
SICA: 6638300700  
SICA: 6638300915  
SICA: 6638300093  
SICA: 6638301152  
SICA: 6638300126  
SICA: 6638300128  
SICA: 6638300130  
SICA: 6638300131  
SICA: 6638300158  
SICA: 6638300302  
SICA: 6638300328  
SICA: 6638301680  
SICA: 6638300310  
SICA: 6638300311  
SICA: 6638300602  
SICA: 6638300608  
SICA: 6600102437  
SICA: 6600103765  
SICA: 6600100265  
SICA: 6600101458  
SICA: 6600102066  
SICA: 6600103421  
SICA: 6600100722  
SICA: 6600100431  
SICA: 6600102095  
SICA: 6600101389  
SICA: 6600101064  
SICA: 6600102480  
SICA: 6600101163  
SICA: 6600101173  
SICA: 660010

# Récupérer les informations de sol

In [5]:
toDelete = []
all_soils_profiles = []

# Initialisation des colonnes vides pour la texture et les moyennes de ph et org
soil_columns_names = ["pH_avg","org_avg","franco_L1","arcilloso_L1","limoso_L1","arenoso_L1","cascajoso_L1",
"franco_L2","arcilloso_L2","limoso_L2","arenoso_L2","cascajoso_L2",
"franco_L3","arcilloso_L3","limoso_L3","arenoso_L3","cascajoso_L3"]

for e in soil_columns_names:
    df[e] = 0.


for index, row in df.iterrows():
    
    profileCode = row['Soil Profile']
    unidad = row['Unidad_c_1']
    
    if profileCode == " " or profileCode == "":
        toDelete.append(index)
        continue
    
    # Les données de ces profils sont incomplètes (totalement) 
    if profileCode == "66PUM" or profileCode == "66PUL" or profileCode == "66MCA" or profileCode == "66VEI":
        toDelete.append(index)
        continue
        
    # Les codes ne correspondent pas (plusieurs profils) on prend donc le seul profil existant pour 66PAR, 66OSP et 66CHU
    if profileCode == "66PAR":
        profileCode = "66PARP01"
    if profileCode == "66CHU":
        profileCode = "66CHUP01"
    if profileCode == "66OSP":
        profileCode = "66OSP01"
    
    # pour 66CAT, il existe 2 profils. On suppose ici que le champ 
    # unidad_c_1 (qui contient soit un 1 soit un 2) correspond au profil nécessaire
    
    # on a soit CAT1(A)cde ou soit CAT2(A)de
    
    if profileCode == "66CAT":
        if unidad[3] == "1":
            profileCode = "66CATP01"
            # Attention 66CATP01 n'a pas d'informations dans la 2ème couche
            
        elif unidad[3] == "2":
            profileCode = "66CATP02"
        else:
            print("Erreur avec code: {}".format(profileCode))
    
    
    # Get the different layers (key)
    layers = soils.getLayers(profileCode)
    
    # Récupérer les informations de chaque couches pour analyse. 
    soil = Soils.Soil(profileCode)
    
    # soil contient les informations de chaque couches (ph, organic, depth, etc)
    all_soils_profiles.append(soil) # pour étudier plus bas
    
    
    # Pour chaque couche on peut extraire l'information désirée
    nbLayer = soil.nbLayers
    if profileCode == "66CATP01" or profileCode == "66CHUP01":
        nbLayer -= 1
    
    # valeurs sur plusieurs couches
    avgPH = 0.
    avgOrg = 0.
    
    texture = []
    
    print("|::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::")
    # On parcourt les couches, maximum 3 couches, la 4ème est ignorée !
    for l in range(0,min(nbLayer,3)):
        print("----------")
        print("| Profile:    {}".format(profileCode))
        print("| Layer:      {}".format(l+1))
        print("| Ph:         {}".format(soil.phs[l]))
        print("| Organica:   {}".format(soil.organics[l]))
        print("| Textura:    {}".format(soil.textures[l]))
        print("----------")
        avgPH += soil.phs[l]
        avgOrg += soil.organics[l]
        
        
        for t in soil.textures[l]:
            texture.append(int(t))
        
        
    # Calcul des moyennes
    avgPH = avgPH / float(nbLayer)
    avgOrg = avgOrg / float(nbLayer)
    
    # Extrapolation texture
    temp = []
    # si 1 couche on tripple la première
    if len(texture) == 5:
        texture = texture + texture + texture
    # Si 2 couches on double la dernière
    elif len(texture) == 10:
        texture = texture + texture[5:]
    
    
    
    
    
    
    print("-------------------------------------------------")
    print("Moyennes:    PH: {}                   Org: {}".format(avgPH, avgOrg))
    print("Texture:       {}".format(texture))
    print("Type for avgs: {}".format(type(avgOrg)))
    
    
    #dfCafe.set_value(index,'SICA',row['SICA'][:11])
    
    values = [avgPH,avgOrg]
    for e in texture:
        values.append(e)
    
    
    
    for idx, e in zip(range(0,len(soil_columns_names)), soil_columns_names):
        #df[e] = 0
        df.set_value(index,e,values[idx])
    
df
# supprimer les données incomplètes toDelete
df = df.drop(toDelete)
df = df.reset_index(drop=True)

|::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
----------
| Profile:    66CHIBAL
| Layer:      1
| Ph:         5.55
| Organica:   0.0285
| Textura:    [ 1.  1.  1.  0.  0.]
----------
----------
| Profile:    66CHIBAL
| Layer:      2
| Ph:         5.7
| Organica:   0.015
| Textura:    [ 1.  1.  1.  0.  0.]
----------
-------------------------------------------------
Moyennes:    PH: 5.625                   Org: 0.02175
Texture:       [1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0]
Type for avgs: <type 'float'>
|::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
----------
| Profile:    66CHIBAL
| Layer:      1
| Ph:         5.55
| Organica:   0.0285
| Textura:    [ 1.  1.  1.  0.  0.]
----------
----------
| Profile:    66CHIBAL
| Layer:      2
| Ph:         5.7
| Organica:   0.015
| Textura:    [ 1.  1.  1.  0.  0.]
----------
-------------------------------------------------
Moyennes:    PH: 5.625                   Org: 0.02175
Texture:       [1, 1, 1, 

# Création du nouveau CSV


In [6]:
print(df.shape)
df.to_csv("DataRisaralda_v1_utf-8.csv", sep=',', encoding='utf-8')
df.to_csv("DataRisaralda_v1_iso-8859-1.csv", sep=',', encoding='iso-8859-1')

(975, 139)


In [7]:
len(df.columns)

139

# Analyse des données de sol temporaires 

In [8]:
df['pH_avg'][2]


5.3500000000000005

In [9]:
#print depthinfo
nbLayer = []
for e in depthinfo:
    nbLayer.append(len(e))

nbLayer = np.array(nbLayer)
nbLayer = np.sort(nbLayer)

plt.plot(nbLayer)

unique, counts = np.unique(nbLayer, return_counts=True)
print "Nombre de couches / rows"
print unique
print counts

NameError: name 'depthinfo' is not defined

In [None]:
for e in depthinfo:
    #print e -> [('0', '40', '40'), ('40', '70', '30'), ('70', '100', '30')]
    count = 0
    if len(e) == 1:
        print e[0][1]
    for tpl in e:
        count += 1
        #if tpl[1] == '100':
            #print("Couche 1 mètre: {}".format(count))
        

Les 1 mètres se trouvent toujours entre les couches 1 et 3, jamais la couche 4 ! On peut ignorer la couche 4. 

Les profils de sol qui n'ont qu'une couche: la couche a une profondeur de 100 dans la majorité des cas, parfois 70

