In [1]:
import pandas as pd
import numpy as np

### 1. Le code suivant a été exécuté sous PyCharm afin d'utiliser la parallélisation.

Temps d'exécution : 4 heures pour les gares et 18 heures pour les écoles.

In [None]:
import numpy as np
import pandas as pd
import multiprocessing as mp
import time
import logging
import math
import geopy.distance

def distance2(lat1,lon1,lat2,lon2):
    """
    Calculate the Haversine (great-circle) distance between two points.

    Parameters
    ----------
    lat1 : float
        latitude of the original point, in decimal degrees
    lon1 : float
        longitude of the original point, in decimal degrees
    lat2 : float
        latitude of the point of destination, in decimal degrees
    lon2 : float
        longitude of the point of destination, in decimal degrees
    Returns
    -------
    distance_in_km : float
        distance along a sphere of radius 6371 km; NaN if any input is NaN
    """
    radius = 6371  # km

    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2

    a = (math.sin(half_dlat) ** 2 +
         math.cos(phi1) * math.cos(phi2) * math.sin(half_dlon) ** 2)
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise "math domain error".
    # A plain comparison (rather than min()) keeps NaN inputs propagating as NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius * c

def _haversine_matrix(lat_orig, lon_orig, lat_dest, lon_dest, radius=6371.0):
    """
    Haversine distance between every origin and every destination at once.

    Parameters
    ----------
    lat_orig, lon_orig : 1-D float arrays of length n
        coordinates of the origins, in decimal degrees
    lat_dest, lon_dest : 1-D float arrays of length m
        coordinates of the destinations, in decimal degrees
    radius : float
        sphere radius; the result is expressed in the same unit (default 6371 km)
    Returns
    -------
    distances : ndarray of shape (n, m)
        distances[i, j] is the distance from origin i to destination j
    """
    # Origins vary along axis 0 and destinations along axis 1, so that
    # broadcasting yields the full (n, m) table without any Python loop.
    phi1 = np.radians(lat_orig)[:, np.newaxis]
    phi2 = np.radians(lat_dest)[np.newaxis, :]
    half_dlat = np.radians(lat_dest[np.newaxis, :] - lat_orig[:, np.newaxis]) / 2.0
    half_dlon = np.radians(lon_dest[np.newaxis, :] - lon_orig[:, np.newaxis]) / 2.0

    a = np.sin(half_dlat) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlon) ** 2
    # Rounding can push `a` marginally above 1 for (near-)antipodal pairs;
    # np.minimum caps it while still propagating NaN coordinates.
    a = np.minimum(a, 1.0)
    return radius * 2.0 * np.arctan2(np.sqrt(a), np.sqrt(1.0 - a))


def CalculDistance_proc(q,df_orig : pd.DataFrame,df_dest : pd.DataFrame):
    """
    Function launched on an unique processor: computes the Haversine distance
    (km) from every row of df_orig to every row of df_dest.

    The distances are computed with NumPy broadcasting instead of a Python
    double loop with per-cell `.loc` accesses, so the resulting table has a
    float64 dtype.

    Parameters
    ----------
    q : queue-like object exposing `put`
        receives the result of the computation
    df_orig : DataFrame
        DataFrame containing at least columns 'latitude' and 'longitude'
    df_dest : DataFrame
        DataFrame containing at least columns 'latitude' and 'longitude'
    Returns
    -------
    None
        a length-2 object array [distances, elapsed_seconds] is put on `q`,
        where `distances` is a DataFrame indexed by df_orig.index with one
        column per label of df_dest.index
    """
    start=time.time()
    distances=_haversine_matrix(
        df_orig['latitude'].to_numpy(dtype=float),
        df_orig['longitude'].to_numpy(dtype=float),
        df_dest['latitude'].to_numpy(dtype=float),
        df_dest['longitude'].to_numpy(dtype=float),
    )
    res=pd.DataFrame(distances,index=df_orig.index,columns=df_dest.index)
    Letime=time.time()-start

    # Fill the object array slot by slot so NumPy never tries to interpret the
    # DataFrame as a nested sequence when building the payload.
    payload=np.empty(2, dtype=object)
    payload[0]=res
    payload[1]=Letime
    q.put(payload)


def Distance_MultiProcess(df_origin :pd.DataFrame, df_destination :pd.DataFrame, nProc =2):
    """
    Function which splits the data and organize the processor according to the number nProc

    The rows of df_origin are cut into nProc contiguous, balanced slices; each
    slice is handed to its own 'spawn' process running CalculDistance_proc
    against the whole df_destination, and the partial tables are stacked back
    in the original row order.

    Parameters
    ----------
    df_origin : DataFrame
        DataFrame containing at least columns 'latitude' and 'longitude'
    df_destination : DataFrame
        DataFrame containing at least columns 'latitude' and 'longitude'
    nProc : int
        number of worker processes to start (must be >= 1)
    Returns
    -------
    array of 2 objects
        (df_final : DataFrame, exec_time: list)
        df_final stacks the per-process tables; with more than one process its
        row index is reset to 0..N-1, with a single process the worker's table
        is returned as is. exec_time holds one duration (seconds) per process.
    Raises
    ------
    ValueError
        if nProc is smaller than 1
    """
    if nProc < 1:
        raise ValueError("nProc must be a positive integer, got {!r}".format(nProc))

    ctx = mp.get_context('spawn')
    N = df_origin.shape[0]
    # Balanced boundaries: slice sizes differ by at most one row, instead of
    # piling the whole remainder onto the last process.
    bounds = [N * k // nProc for k in range(nProc + 1)]

    process = []
    q_list = []
    for p in range(nProc):
        q = ctx.Queue()
        q_list.append(q)
        logging.info("Main    : create and start process %d.", p)
        x = ctx.Process(target=CalculDistance_proc,
                        args=(q, df_origin.iloc[bounds[p]:bounds[p + 1], :], df_destination))
        process.append(x)
        logging.info("start" + str(p))
        x.start()

    # Drain every queue BEFORE joining: a child that still has items buffered
    # in its queue does not terminate until they have been consumed.
    Sorti = []
    exec_time = []
    for q in q_list:
        temp = q.get()
        Sorti.append(temp[0])
        exec_time.append(temp[1])
        logging.info(str(temp[1]))

    for p, proc in enumerate(process):
        logging.info("Main    : before joining process %d.", p)
        proc.join()
        proc.close()
        logging.info("Main    : process %d done", p)

    # One concat over all parts instead of growing the frame part by part.
    if len(Sorti) == 1:
        df_final = Sorti[0]
    else:
        df_final = pd.concat(Sorti, ignore_index=True)

    # Fill the object array slot by slot: np.array([df, list], dtype=object)
    # tries to build a 2-D array when the frame has as many rows as the list
    # has items, and fails.
    out = np.empty(2, dtype=object)
    out[0] = df_final
    out[1] = exec_time
    return out

In [None]:
if __name__ == '__main__':
    Type_Lancement='ecole' # 'ecole' (schools) or 'gare' (train stations)

    # NOTE: every path below is an absolute, machine-specific location and has
    # to be adapted before running this script elsewhere.

    # Listings: only the approximate coordinates are needed, renamed to the
    # column names expected by the distance functions.
    df_orig = pd.read_csv('C:/Users/NovaCloudUser/ML_ENSAE/X_train.csv', sep=',')
    df_orig = df_orig[['approximate_latitude','approximate_longitude']].rename(
        columns={'approximate_latitude': 'latitude', 'approximate_longitude': 'longitude'})

    # Destinations. nDiv is the number of output files written below.
    if Type_Lancement == "gare":
        nDiv = 5
        df_dest = pd.read_csv('C:/Users/NovaCloudUser/ML_ENSAE/gare_France.csv', sep=',')
    else: # schools
        nDiv = 50
        df_dest = pd.read_csv(
            'D:/Téléchargements/fr-en-adresse-et-geolocalisation-etablissements-premier-et-second-degre.csv', sep=';')
        df_dest = df_dest[['Latitude', 'Longitude']].dropna()
        df_dest.columns = ['latitude', 'longitude']

    # Row boundaries of the nDiv divisions of the listings.
    N = df_orig.shape[0]
    index_a = [N // nDiv * i for i in range(nDiv)]
    index_b = [N // nDiv * (i + 1) for i in range(nDiv)]
    # The last division always runs to the end of the data. The previous test
    # (`N // nDiv != 0`) left this bound at 0 when N < nDiv, which silently
    # dropped every row.
    index_b[-1] = N

    for i in range(nDiv):
        start = time.time()
        res = Distance_MultiProcess(df_orig.iloc[index_a[i]:index_b[i]], df_dest, nProc=4)
        print("Temps d'exécution pour {} (Division {}): {} secondes".format(
            Type_Lancement, i, round(time.time() - start, 2)))
        name = 'distance_'+Type_Lancement+str(i)+'.csv'
        res[0].to_csv('D:/Téléchargements/'+name)
        print(res[1])
        del res # release the distance table before computing the next division


### 2. Ouverture des fichiers de distances pour les écoles et les gares, calcul des features, agrégation des résultats et export pour exploitation.


In [28]:
# Train stations: for each listing, the distance to the nearest station and the
# number of stations within 1, 3, 5, 10 and 20 km.
results=[]
for num in range(5):
    path='D:/Téléchargements/distance_gare'+str(num)+'.csv'
    df=pd.read_csv(path)
    # The first CSV column is skipped: it is presumably the row index written
    # by to_csv when the distance file was exported, not a distance.
    distances=df.iloc[:,1:]
    result=pd.DataFrame(index=df.index)
    # Row-wise reductions (axis=1) avoid transposing the whole matrix.
    result['Min']=distances.min(axis=1)
    for i in [1,3,5,10,20]:
        name_col='nb_gare_'+str(i)+'kms'
        result[name_col]=(distances<i).sum(axis=1)
    results.append(result)
# One concat at the end instead of growing `final` file after file.
final=pd.concat(results,ignore_index=True)
final.to_csv('D:/Téléchargements/DistanceGare.csv')
# Last expression of the cell so that the table is actually displayed.
final

Unnamed: 0,Min,nb_gare_1kms,nb_gare_3kms,nb_gare_5kms,nb_gare_10kms,nb_gare_20kms
0,2.062215,0,2,4,9,24
1,1.235074,0,1,5,13,44
2,13.951685,0,0,0,0,5
3,2.813301,0,1,1,3,6
4,1.602086,0,4,8,22,51
...,...,...,...,...,...,...
37363,2.156128,0,1,2,5,13
37364,0.423113,9,102,235,615,986
37365,18.953534,0,0,0,0,1
37366,2.001041,0,3,9,16,72


In [37]:
# Schools: for each listing, the distance to the nearest school and the number
# of schools within 1, 3, 5, 10 and 20 km.
results=[]
for num in range(50):
    print(num)  # progress indicator: one line per distance file
    path='D:/Téléchargements/distance_ecole'+str(num)+'.csv'
    df=pd.read_csv(path)
    # The first CSV column is skipped: it is presumably the row index written
    # by to_csv when the distance file was exported, not a distance.
    distances=df.iloc[:,1:]
    result=pd.DataFrame(index=df.index)
    # Row-wise reductions (axis=1) avoid transposing the whole matrix.
    result['Min']=distances.min(axis=1)
    for i in [1,3,5,10,20]:
        name_col='nb_ecole_'+str(i)+'kms'
        result[name_col]=(distances<i).sum(axis=1)
    results.append(result)
# One concat at the end instead of growing `final` file after file.
final=pd.concat(results,ignore_index=True)
final.to_csv('D:/Téléchargements/DistanceEcole.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [38]:
# Display the aggregated school features held in `final` (one row per listing).
final

Unnamed: 0,Min,nb_ecole_1kms,nb_ecole_3kms,nb_ecole_5kms,nb_ecole_10kms,nb_ecole_20kms
0,1.049957,0,17,53,227,653
1,0.180002,11,56,153,644,1155
2,0.667630,1,3,10,17,97
3,0.120038,2,2,5,28,142
4,0.980819,1,107,329,807,1141
...,...,...,...,...,...,...
37363,0.189290,12,57,94,146,241
37364,0.230631,53,500,962,2802,5716
37365,0.176353,4,7,7,15,24
37366,0.169916,5,30,58,160,804
