In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

In [2]:
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas warnings (e.g. SettingWithCopyWarning, FutureWarning) that can point
# at real problems in the cells below. Consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Raw inputs. The first three files use the default comma separator; the file
# with the additional station coordinates is semicolon-separated.
df_plan_metro, df_position_gps, df_passagers = (
    pd.read_csv(csv_name)
    for csv_name in (
        'plan du métro.csv',
        'position gps des stations de métro.csv',
        'passagers.csv',
    )
)
df_sup_stations = pd.read_csv('position gps manquantes.csv', sep=';')

# Data preparation
We create the following datasets:
- station_coords : positions GPS des gares
- trajets : trajets des passagers qui vont d'une gare A à une gare B, avec les coordonnées GPS des deux gares.

## Create a dataframe of stations gps coordinates

In [4]:
# Split the "lat,lon" text of the GPS column into two columns, drop the
# original column and keep a single row per station name (the first one).
# latitude/longitude are still strings here; they are cast to float later.
gps_parts = df_position_gps['GPS'].str.split(',')

position_gps = (
    df_position_gps
    .assign(latitude=gps_parts.str[0], longitude=gps_parts.str[1])
    .drop(columns='GPS')
    .drop_duplicates(subset="Station", keep="first")
)

position_gps.head()

Unnamed: 0,Station,latitude,longitude
0,Boucicaut,48.84109405724891,2.287945766487378
1,Bourse,48.86865415183625,2.3413760511435733
2,Bir-Hakeim,48.85394260621378,2.2893345327788075
3,Argentine,48.87533684211653,2.290127508824701
4,Daumesnil,48.83954964556868,2.3957032894363937


In [5]:
# Sanity check: (rows, columns) of position_gps after de-duplication on Station.
position_gps.shape

(261, 3)

In [6]:
# Column dtypes: latitude/longitude come from a string split, so they are
# still `object` at this point.
position_gps.dtypes

Station      object
latitude     object
longitude    object
dtype: object

## Create a dataframe with passenger route information

In [7]:
# Show that some stations have no GPS coordinates: compare the number of
# distinct stations that have coordinates with the number of distinct
# destination stations in the passenger data.
n_stations_with_coords = position_gps['Station'].nunique()
n_stations_total = df_passagers['vers'].nunique()
print('On a les coord gps de ' + str(n_stations_with_coords) + ' stations')
print('Or il y a en tout ' + str(n_stations_total) + ' stations')

On a les coord gps de 261 stations
Or il y a en tout 307 stations


In [8]:
# Append the manually supplied coordinates to the known ones.
# ignore_index=True gives station_coords a fresh 0..n-1 index, which the rest
# of the notebook uses as the station identifier.
sup_stations = df_sup_stations.drop(columns='ligne')
station_coords = (
    pd.concat([position_gps, sup_stations], ignore_index=True)
    .astype({'latitude': float, 'longitude': float})
)
station_coords.head()

Unnamed: 0,Station,latitude,longitude
0,Boucicaut,48.841094,2.287946
1,Bourse,48.868654,2.341376
2,Bir-Hakeim,48.853943,2.289335
3,Argentine,48.875337,2.290128
4,Daumesnil,48.83955,2.395703


In [9]:
# Add the GPS coordinates of both end stations to every passenger trip.
# Each left merge brings in the Station/latitude/longitude columns; the
# coordinates are renamed for the relevant end and the duplicated key dropped.
trajets = (
    df_passagers
    .merge(station_coords, left_on="de", right_on="Station", how="left")
    .rename(columns={'latitude': 'latitude_depart', 'longitude': 'longitude_depart'})
    .drop(columns=["Station"])
    .merge(station_coords, left_on="vers", right_on="Station", how="left")
    .rename(columns={'latitude': 'latitude_arrivee', 'longitude': 'longitude_arrivee'})
    .drop(columns=["Station"])
    .rename(columns={'de': 'station_depart', 'vers': 'station_arrivee'})
)

trajets.head()

Unnamed: 0,station_depart,station_arrivee,nombre,latitude_depart,longitude_depart,latitude_arrivee,longitude_arrivee
0,La Défense,Esplanade de La Défense,7,48.892187,2.237018,48.888138,2.249793
1,La Défense,Pont de Neuilly,5,48.892187,2.237018,48.884708,2.260515
2,La Défense,Les Sablons,6,48.892187,2.237018,48.881016,2.272239
3,La Défense,Porte Maillot,6,48.892187,2.237018,48.877938,2.283914
4,La Défense,Argentine,5,48.892187,2.237018,48.875337,2.290128


In [10]:
# Check for missing values: count the trips that have at least one NaN
# (e.g. a station whose coordinates could not be merged in).
has_missing = trajets.isna().any(axis='columns')
lignes_nan = trajets.loc[has_missing]
len(lignes_nan)

0

### Add lines to station coords

In [11]:
def get_index(station_name):
  """Return the index label of the first station_coords row named `station_name`.

  station_coords is built with ignore_index=True, so this label is also the
  row position used as the station identifier elsewhere in the notebook.
  Raises IndexError when no station has that name.
  """
  name_matches = station_coords['Station'] == station_name
  matching_rows = station_coords.loc[name_matches]
  return matching_rows.index[0]

def get_station_name(index):
  """Return the station name stored at row position `index` of station_coords."""
  name_column = 0  # first column of station_coords holds the station name
  return station_coords.iloc[index, name_column]

In [12]:
# Network dimensions reused by the lists and matrices built further down.
num_stations = len(station_coords)
# Number of line slots; line labels are converted to integers by
# convert_metro_line (presumably lines 1-14 — confirm against the data).
num_lines = 14
print(f"There is {num_stations} stations in the metro network.")

There is 307 stations in the metro network.


In [13]:
# Custom conversion of a metro line label to its integer number
def convert_metro_line(val):
    """Convert a line label such as '7' or '7bis' to an int.

    A 'bis' marker is removed before the conversion, so a 'bis' branch is
    mapped to the same number as its parent line ('7bis' -> 7).

    val: str, line label
    return: int, line number
    """
    label = val.replace('bis', '').strip() if 'bis' in val else val
    return int(label)

In [14]:
# Build the (line, station) membership table from both ends of every link in
# df_plan_metro: the first two columns describe the origin, the next two the
# destination.
metro_de = (
    df_plan_metro
    .iloc[:, 0:2]
    .rename(columns={'de Ligne': 'Ligne', 'de Station': 'Station'})
)

metro_vers = (
    df_plan_metro
    .iloc[:, 2:4]
    .rename(columns={'vers Ligne': 'Ligne', 'vers Station': 'Station'})
)

# NOTE(review): duplicates are removed on the raw labels, i.e. before 'bis' is
# stripped, so a station listed on both 'N' and 'Nbis' keeps two rows that end
# up with the same integer line.
metro_line = (
    pd.concat([metro_de, metro_vers], axis=0)
    .drop_duplicates(subset=['Ligne', 'Station'], keep='first', ignore_index=True)
)
metro_line['Ligne'] = metro_line['Ligne'].apply(convert_metro_line)
print(metro_line)

     Ligne                         Station
0        1      Charles de Gaulle - Étoile
1        1           Franklin D. Roosevelt
2        1     Champs-Élysées - Clemenceau
3        1                        Concorde
4        1  Palais-Royal - Musée du Louvre
..     ...                             ...
407      2             Porte de Versailles
408      1                  Les Courtilles
409      1        Basilique de Saint-Denis
410      8    Saint-Denis - Porte de Paris
411      6           Châtillon - Montrouge

[412 rows x 2 columns]


In [15]:
# station_line_list[k] is the list of line numbers serving the station whose
# index in station_coords is k.
station_line_list = [[] for _ in range(num_stations)]
for line, station in zip(metro_line.iloc[:, 0], metro_line.iloc[:, 1]):
    lines_of_station = station_line_list[get_index(station)]
    # metro_line is de-duplicated on the raw labels, before 'bis' is folded
    # into the parent line number, so the same (line, station) pair can show
    # up twice here; record each line only once per station.
    if line not in lines_of_station:
        lines_of_station.append(line)

In [16]:
# Attach to every station the list of lines serving it. station_line_list is
# positionally aligned with station_coords (it was filled through get_index).
# A left merge with metro_line was considered instead, but it would produce
# one row per (station, line) pair rather than one list per station.
station_coords['lines'] = station_line_list
station_coords.head()

Unnamed: 0,Station,latitude,longitude,lines
0,Boucicaut,48.841094,2.287946,[8]
1,Bourse,48.868654,2.341376,[3]
2,Bir-Hakeim,48.853943,2.289335,[6]
3,Argentine,48.875337,2.290128,[1]
4,Daumesnil,48.83955,2.395703,"[6, 8]"


In [17]:
# station_line_matrix[s, l - 1] is 1 when station s is served by line l
# (lines are numbered from 1, columns from 0), else 0.
station_line_matrix = np.zeros((num_stations, num_lines))
for line, station in zip(metro_line.iloc[:, 0], metro_line.iloc[:, 1]):
    # Assignment, not comparison: the previous `== 1` evaluated a boolean and
    # discarded it, leaving the matrix filled with zeros.
    station_line_matrix[get_index(station), line - 1] = 1

# Trouver le trajet optimal entre deux gares

À ce stade, on a :
- station_coords : dataframe (station, latitude, longitude, lines)
- trajets : dataframe (station de départ, station d'arrivée, nombre de passagers, coordonnées GPS des deux stations)
- df_plan_metro : dataframe (connexions entre gares et correspondances) ; déjà utilisé pour associer les lignes aux stations, il sert ci-dessous à construire la « matrice d'adjacence »

### Add line information
L'objectif est de créer edge_index, ainsi qu'un np.array de taille nb_stations × nb_lignes contenant un 1 si la station appartient à la ligne et un 0 sinon.

Metro line:

In [18]:
def get_station_line_of(station_index):
  """Look up the lines serving a station.

  station_index: int, position of the station in station_coords
  return: list(int), the entry of station_line_list for that station
          (the stored list itself, not a copy)
  """
  lines_of_station = station_line_list[station_index]
  return lines_of_station

In [19]:
def distance(station_index1, station_index2):
  """Straight-line distance between two stations, in coordinate units.

  The result is the Euclidean norm of the (latitude, longitude) difference,
  i.e. a value in degrees, not metres.
  NOTE(review): a degree of longitude and a degree of latitude do not cover
  the same ground distance, so this is only an approximation of proximity.

  station_index1, station_index2: int, row positions in station_coords
  return: float
  """
  delta_lat = station_coords.iloc[station_index1, 1] - station_coords.iloc[station_index2, 1]
  delta_lon = station_coords.iloc[station_index1, 2] - station_coords.iloc[station_index2, 2]
  return sqrt(delta_lat**2 + delta_lon**2)

Distance matrix:

In [20]:
# distance_matrix[i, j] is the Euclidean distance, in (latitude, longitude)
# degrees, between stations i and j — the same quantity as distance(i, j).
# It is computed with a single broadcast instead of num_stations**2 Python
# calls that each perform four .iloc lookups.
_coords = station_coords.iloc[:num_stations, [1, 2]].to_numpy(dtype=float)
_deltas = _coords[:, np.newaxis, :] - _coords[np.newaxis, :, :]
distance_matrix = np.sqrt((_deltas ** 2).sum(axis=-1))

Adjacency matrix:

In [21]:
# Keep only the links that join two different stations; rows whose two ends
# are the same station are left out of the adjacency matrix.
# .copy() makes the result an independent frame. The former re-assignment of
# 'de Ligne' wrote identical values back into a filtered slice and has been
# removed.
station_single_line = df_plan_metro[df_plan_metro['de Station'] != df_plan_metro['vers Station']].copy()

m = station_single_line.shape[0]  # number of station-to-station links

# One row/column per station. The matrix was previously sized by the number
# of links (m), which only worked as long as m >= num_stations.
adjency_matrix = np.zeros((num_stations, num_stations))
for station_depart, station_arrivee in zip(station_single_line.iloc[:, 1], station_single_line.iloc[:, 3]):
  index_depart = get_index(station_depart)
  index_arrivee = get_index(station_arrivee)
  # Links are treated as usable in both directions.
  adjency_matrix[index_depart, index_arrivee] = 1
  adjency_matrix[index_arrivee, index_depart] = 1

In [22]:
def shared_line(station1,i):
    """Return the first line of `station1` that also serves station `i`.

    Lines are tried in the order they appear in station_line_list[station1].
    Returns None when the two stations have no line in common.
    """
    lines_of_i = station_line_list[i]
    for candidate in station_line_list[station1]:
        if candidate in lines_of_i:
            return candidate
    return None

In [34]:
def get_adjacent_stations(station1, traveled_distance, station2, route, line_change_penalty=0.1):
  """List the stations directly connected to `station1` with their search cost.

  station1: int, index of the station being expanded
  traveled_distance: float, distance traveled to reach station1
  station2: int, index of the arrival station
  route: DataFrame with columns ['station', 'line'], the stations visited to
         reach station1 (last row = station1); must not be empty
  line_change_penalty: float, extra distance added when the neighbour is not
         on the line currently used. Per the original author's estimate, two
         consecutive stations are about 0.01 apart (about 1 minute) and a line
         change takes about 10 minutes, hence the default of 0.1.

  return: DataFrame with one row per neighbour and the columns
          ['station', 'traveled_distance', 'distance_to_station2', 'cout', 'route'],
          where cout = traveled_distance + distance_to_station2 and route is
          a copy of `route` extended with the neighbour. The frame is empty
          (same columns) when station1 has no neighbour.
  """
  # Same schema as the frontier frame used by the search. The misspelled,
  # never-filled 'sation_name' column is no longer created.
  columns = ['station', 'traveled_distance', 'distance_to_station2', 'cout', 'route']

  # Line used to arrive at station1 (second column of the last route row).
  current_line = route.iloc[-1, 1]

  candidates = []
  for i in range(num_stations):
    # Only the stations adjacent to station1 are considered
    if adjency_matrix[station1, i] == 0:
      continue

    # Distances from station i to the target, and traveled so far through i
    distance_to_station2 = float(distance_matrix[i, station2])
    new_traveled_distance = float(traveled_distance + distance_matrix[station1, i])

    if current_line in get_station_line_of(i):
      # Station i is on the current line: no change needed
      new_line = current_line
    else:
      # A line change is needed: switch to a line shared by both stations
      # (None if shared_line finds none) and add the penalty
      new_line = shared_line(station1, i)
      new_traveled_distance += line_change_penalty

    # Extend a copy of the route with station i
    route_i = route.copy()
    route_i.loc[len(route_i)] = [i, new_line]

    candidates.append(pd.DataFrame({'station': [i],
                                    'traveled_distance': [new_traveled_distance],
                                    'distance_to_station2': [distance_to_station2],
                                    'cout': [new_traveled_distance + distance_to_station2],
                                    'route': [route_i]
                                    }))

  if not candidates:
    return pd.DataFrame(columns=columns)

  # Concatenate once instead of growing a DataFrame inside the loop
  return pd.concat(candidates, ignore_index=True)

In [35]:
def trouver_trajets(station1, station2, traveled_distance, route, next_stations):
  """Search a route from `station1` to `station2`.

  Best-first search: at every step the neighbours of the current station are
  added to the stations still to visit, and the one with the lowest 'cout'
  (traveled distance + straight-line distance to station2) is expanded next.

  station1: int, departure station index
  station2: int, arrival station index
  traveled_distance: float, distance traveled to reach station1
  route: DataFrame with columns ['station', 'line'], stations visited to
         reach station1 (last row = station1)
  next_stations: DataFrame, stations still to visit, with the columns
         ['station', 'traveled_distance', 'distance_to_station2', 'cout', 'route']

  return: DataFrame with columns ['station', 'line'] where 'station' holds
          station names (indices are converted with get_station_name)
  raises: ValueError when no station is left to visit before station2 is
          reached

  NOTE(review): stations that were already expanded are not remembered, so a
  station can be queued and expanded again later.
  """
  # Departure and arrival are the same station: the route is already complete.
  if station1 == station2:
    trivial_route = route.copy()
    trivial_route['station'] = trivial_route['station'].apply(get_station_name)
    return trivial_route

  current_station = station1
  current_distance = traveled_distance
  current_route = route
  frontier = next_stations

  # Iterative form of the former tail recursion: one Python frame regardless
  # of how many stations get expanded (no RecursionError on long searches).
  while True:
    adjacent_stations = get_adjacent_stations(current_station, current_distance, station2, current_route)

    # Add the adjacent stations to the stations to visit, cheapest first,
    # keeping only the cheapest entry for each station
    frontier = (
        pd.concat([frontier, adjacent_stations], ignore_index=True)
        .sort_values(by='cout', ascending=True, ignore_index=True)
        .drop_duplicates('station', keep="first")
    )

    if frontier.empty:
      raise ValueError(
          f"No route found from station {station1} to station {station2}: "
          "no station left to visit."
      )

    # The cheapest candidate becomes the station to expand next
    current_station = frontier['station'].iloc[0]
    current_distance = frontier['traveled_distance'].iloc[0]
    current_route = frontier['route'].iloc[0]

    # Remove it from the stations to visit; 'station' values are unique after
    # drop_duplicates, so this is exactly the first row
    frontier = frontier.iloc[1:]

    if current_station == station2:
      current_route['station'] = current_route['station'].apply(get_station_name)
      return current_route

In [36]:
# Empty "stations to visit" frame handed to trouver_trajets as the starting
# frontier; its columns match the ones filled by get_adjacent_stations.
next_stations_init = pd.DataFrame(columns=['station', 'traveled_distance','distance_to_station2', 'cout','route'])
# NOTE(review): `route` is not used by any of the cells visible here; the
# searches below build their own one-row starting route.
route = pd.DataFrame(columns=['station', 'line'])
# NOTE(review): these two indices are overwritten by the next cell before
# being used.
station_index1 = get_index('Bérault')
station_index2 = get_index('Champs-Élysées - Clemenceau')

In [26]:
# Example search between two stations.
# The starting route has a single row: the departure station, tagged with the
# first line listed for it.
# NOTE(review): the saved output of this cell ends at 'Trocadéro' rather than
# at 'Jaurès'; it looks like it comes from an earlier run — re-run to refresh.
station_index1 = get_index('Boulogne - Pont de Saint-Cloud')
station_index2 = get_index('Jaurès')

trouver_trajets(
    station_index1,
    station_index2,
    0,
    pd.DataFrame(
        {'station': station_index1, 'line': get_station_line_of(station_index1)[0]},
        index=[0],
    ),
    next_stations_init,
)

Unnamed: 0,station,line
0,Boulogne - Pont de Saint-Cloud,10
1,Boulogne - Jean Jaurès,10
2,Porte d'Auteuil,10
3,Michel-Ange - Molitor,10
4,Michel-Ange - Auteuil,10
5,Jasmin,9
6,Ranelagh,9
7,La Muette,9
8,Rue de la Pompe,9
9,Trocadéro,9


In [28]:
def predict_route(station1_name,station2_name):
    """Search a route between two stations given by name.

    station1_name: str, departure station name
    station2_name: str, arrival station name
    return: the route DataFrame returned by trouver_trajets

    The search starts with an empty list of stations to visit and a one-row
    route holding the departure station, tagged with the first line listed
    for that station.
    """
    departure_index = get_index(station1_name)
    arrival_index = get_index(station2_name)

    empty_frontier = pd.DataFrame(columns=['station', 'traveled_distance','distance_to_station2', 'cout','route'])
    starting_route = pd.DataFrame(
        {'station': departure_index, 'line': get_station_line_of(departure_index)[0]},
        index=[0],
    )
    return trouver_trajets(departure_index, arrival_index, 0, starting_route, empty_frontier)

In [29]:
# Example: route between two stations that requires one line change.
predict_route('Bérault','Daumesnil')

Unnamed: 0,station,line
0,Bérault,1
1,Saint-Mandé,1
2,Porte de Vincennes,1
3,Nation,1
4,Picpus,6
5,Bel-Air,6
6,Daumesnil,6
