In [83]:
import csv
import pandas as pd
from datetime import datetime

def load_csv(csv_filename, columns, to_flatten, value):
    with open(csv_filename, 'r', newline='\n') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        
        data = {}
        header = {elt: index for index, elt in enumerate(next(csv_reader))}
        flattens = set()
        
        for row in csv_reader:
            key = tuple(row[header[column]] for column in columns)
            flattens.add(row[header[to_flatten]])
            try:
                data[key][row[header[to_flatten]]] = row[header[value]]
            except KeyError:
                data[key] = {row[header[to_flatten]]: row[header[value]]}
                
        df_dict = {elt:  [] for elt in columns}
        df_dict.update({elt: [] for elt in flattens})
    
        for key, values in data.items():
            for index, column in enumerate(columns):
                df_dict[column].append(key[index])
            for flatten in flattens:
                df_dict[flatten].append(values.get(flatten, 0)) 
        
        return pd.DataFrame(df_dict)

In [73]:
p_columns = ['LIBELLE_ARRET', 'CAT_JOUR', 'ID_REFA_LDA']
p_flatten = 'TRNC_HORR_60'
p_value = 'pourc_validations'

v_columns = ['LIBELLE_ARRET', 'JOUR', 'ID_REFA_LDA']
v_flatten = 'CATEGORIE_TITRE'
v_value = 'NB_VALD'

profile_s1 = load_csv('Data/validations-profils-horaires-2017s1.csv', p_columns, p_flatten, p_value)
profile_s2 = load_csv('Data/validations-profils-horaires-2017s2.csv', p_columns, p_flatten, p_value)

validation_s1 = load_csv('Data/validations-nombre-par-jour-2017s1.csv', v_columns, v_flatten, v_value)
validation_s2 = load_csv('Data/validations-nombre-par-jour-2017s2.csv', v_columns, v_flatten, v_value)

In [74]:
profile_s2

Unnamed: 0,0H-1H,10H-11H,11H-12H,12H-13H,13H-14H,14H-15H,15H-16H,16H-17H,17H-18H,18H-19H,...,4H-5H,5H-6H,6H-7H,7H-8H,8H-9H,9H-10H,CAT_JOUR,ID_REFA_LDA,LIBELLE_ARRET,ND
0,0.1,6.75,7.33,7.45,8.55,8.13,6.93,6.93,6.96,5.44,...,0.06,1.57,4.59,6.5,8.2,7.91,SAHV,68419,VAIRES-TORCY,0
1,0.07,7.42,7.41,7.7,7.65,7.15,7.38,6.75,6.18,5.42,...,0.08,1.87,4.91,6.36,8.02,7.9,SAVS,68419,VAIRES-TORCY,0
2,0.02,7.21,6.46,6.21,6.69,4.69,10.01,7.81,7.66,12.15,...,0,0.21,6.25,1.41,10.32,1.68,DIJFP,62471,VERNEUIL-L'ETANG,0
3,0,2.67,2.22,2.25,1.74,1.46,1.34,2.21,2.31,1.29,...,0.0,5.32,23.31,35.1,12.47,5.01,JOHV,62471,VERNEUIL-L'ETANG,0
4,0.0,2.9,3.14,3.03,2.92,2.24,2.26,2.51,2.61,2.14,...,0.0,4.75,20.51,31.81,12.47,4.88,JOVS,62471,VERNEUIL-L'ETANG,0
5,0,7.76,8.86,6.38,7.0,5.29,6.95,4.76,4.76,6.71,...,0,0,8.33,7.1,11.1,8.62,SAHV,62471,VERNEUIL-L'ETANG,0
6,0,7.07,7.97,5.53,8.0,5.03,6.57,4.83,4.33,5.77,...,0,0.1,9.1,6.77,11.4,8.2,SAVS,62471,VERNEUIL-L'ETANG,0
7,0,3.84,7.13,7.0,6.31,6.86,11.93,8.64,6.31,6.72,...,0,1.51,3.84,6.17,3.43,5.08,DIJFP,68311,VILLIERS-MONTBARBIN,0
8,0.05,4.71,3.36,3.61,2.49,3.89,3.46,4.21,2.62,2.12,...,0.05,2.09,17.75,26.3,10.09,11.12,JOHV,68311,VILLIERS-MONTBARBIN,0
9,0,5.99,4.18,3.37,4.33,6.09,5.79,6.09,4.23,3.17,...,0,3.22,14.95,21.44,7.4,5.84,JOVS,68311,VILLIERS-MONTBARBIN,0


## Profil type

In [75]:
def cleanProfile(df,CAT_JOUR,columnDrop, columnOrder,HOR):
    dfR=df.dropna()
    dfR=dfR.loc[dfR['CAT_JOUR'].isin(CAT_JOUR),:]
    dfR=dfR.loc[dfR['ID_REFA_LDA']!="?",:]
    for h in HOR:
     dfR[h]=dfR[h].astype(float)   
    #dfR['ID_REFA_LDA']=dfR.ID_REFA_LDA.astype(float)
    dfR=dfR.reset_index(drop=True)
    #dfR=dfR.drop(columns=columnDrop)
    #dfR=dfR[columnOrder]
    return dfR

CAT_JOUR=["JOHV"]
columnDrop=[]
columnOrder=[]
HOR=['0H-1H', '10H-11H', '11H-12H', '12H-13H', '13H-14H', '14H-15H',
       '15H-16H', '16H-17H', '17H-18H', '18H-19H', '19H-20H', '1H-2H',
       '20H-21H', '21H-22H', '22H-23H', '23H-0H', '2H-3H', '3H-4H', '4H-5H',
       '5H-6H', '6H-7H', '7H-8H', '8H-9H', '9H-10H']
profile_s1=cleanProfile(profile_s1,CAT_JOUR,columnDrop, columnOrder,HOR)
profile_s2=cleanProfile(profile_s2,CAT_JOUR,columnDrop, columnOrder,HOR)

In [76]:
counterror=0

profile=profile_s1.copy()
for i in range(profile_s1.shape[0]):
    ID_REFA_LDA=profile_s1.iloc[i,25]
    for h in range(24):
        pourc_validations_1=profile_s1.iloc[i,h]
        try:
            pourc_validations_2=profile_s2.loc[(profile_s2['ID_REFA_LDA']==ID_REFA_LDA),:].iloc[0,h]
        except Exception as e:
            counterror+=1
            print(e,counterror)
            pourc_validations_2=pourc_validations_1
        profile.iloc[i,h]=(pourc_validations_1+pourc_validations_2)/2

single positional indexer is out-of-bounds 1
single positional indexer is out-of-bounds 2
single positional indexer is out-of-bounds 3
single positional indexer is out-of-bounds 4
single positional indexer is out-of-bounds 5
single positional indexer is out-of-bounds 6
single positional indexer is out-of-bounds 7
single positional indexer is out-of-bounds 8
single positional indexer is out-of-bounds 9
single positional indexer is out-of-bounds 10
single positional indexer is out-of-bounds 11
single positional indexer is out-of-bounds 12
single positional indexer is out-of-bounds 13
single positional indexer is out-of-bounds 14
single positional indexer is out-of-bounds 15
single positional indexer is out-of-bounds 16
single positional indexer is out-of-bounds 17
single positional indexer is out-of-bounds 18
single positional indexer is out-of-bounds 19
single positional indexer is out-of-bounds 20
single positional indexer is out-of-bounds 21
single positional indexer is out-of-bounds 

## Validations Number

In [80]:
validation=pd.concat([validation_s1,validation_s2])

In [81]:
validation.head()

Unnamed: 0,AMETHYSTE,AUTRE TITRE,FGT,ID_REFA_LDA,IMAGINE R,JOUR,LIBELLE_ARRET,NAVIGO,NON DEFINI,TST
0,1310,349,2568,71426,3640,2017-03-17,BARBES-ROCHECHOUART,21335,0,3131
1,518,264,1298,71785,3283,2017-03-17,MAIRIE DE MONTREUIL,16509,0,1457
2,294,94,717,72168,1807,2017-03-17,MAIRIE DE SAINT-OUEN,9145,0,756
3,309,172,606,71909,2332,2017-03-17,MAIRIE DES LILAS,9796,0,542
4,352,134,391,70488,1916,2017-03-17,MAIRIE D'IVRY,5908,0,385


In [84]:
def week_number(row):
    d=datetime.strptime(row.JOUR, "%Y-%m-%d")
    return d.isocalendar()[1]
def week_day(row):
    d=datetime.strptime(row.JOUR, "%Y-%m-%d")
    return d.isocalendar()[2]
validation['WEEK_NUMBER']= validation.apply (lambda row: week_number (row),axis=1)
validation['WEEK_DAY']= validation.apply (lambda row: week_day (row),axis=1)

In [85]:
validation.head()

Unnamed: 0,AMETHYSTE,AUTRE TITRE,FGT,ID_REFA_LDA,IMAGINE R,JOUR,LIBELLE_ARRET,NAVIGO,NON DEFINI,TST,WEEK_NUMBER,WEEK_DAY
0,1310,349,2568,71426,3640,2017-03-17,BARBES-ROCHECHOUART,21335,0,3131,11,5
1,518,264,1298,71785,3283,2017-03-17,MAIRIE DE MONTREUIL,16509,0,1457,11,5
2,294,94,717,72168,1807,2017-03-17,MAIRIE DE SAINT-OUEN,9145,0,756,11,5
3,309,172,606,71909,2332,2017-03-17,MAIRIE DES LILAS,9796,0,542,11,5
4,352,134,391,70488,1916,2017-03-17,MAIRIE D'IVRY,5908,0,385,11,5


In [88]:
JOHV=[1,2,3,4,5]
Vacances=[6,7,14,15,28,29,30,31,32,33,34,35,43,44,52]
Jours_feries=['2017-01-01','2017-04-16','2017-04-17','2017-05-01','2017-05-08','2017-05-25','2017-06-05','2017-07-14','2017-08-15','2017-11-01','2017-11-11','2017-12-25']
validation=validation.loc[validation['WEEK_DAY'].isin(JOHV),:]
validation=validation.loc[~validation['WEEK_NUMBER'].isin(Vacances),:]