In [None]:
!pip install geocoder

In [None]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import geocoder
import folium

In [None]:
# Load the Renfe trips CSV into a DataFrame.
renfe_csv_path = "../input/spanish-high-speed-rail-system-ticket-pricing/thegurus-opendata-renfe-trips.csv"
renfe = pd.read_csv(renfe_csv_path)

# 1) Data cleaning:

### 1.1) Selecting columns:

In [None]:
# Keep only the columns used in the rest of the notebook.
kept_columns = [
    "origin", "destination", "departure", "duration",
    "vehicle_type", "vehicle_class", "price", "fare",
]
renfe = renfe.filter(items=kept_columns)

### 1.2) Removing NaN and zero values:

In [None]:
# Drop every row that contains at least one missing value. The result is rebound
# instead of using inplace=True, so a frame derived from an earlier selection is
# never mutated.
renfe = renfe.dropna()

In [None]:
# Discard the rows whose price is exactly zero.
has_nonzero_price = renfe['price'].ne(0)
renfe = renfe[has_nonzero_price]

### 1.3) Grouping data by the same type of travel:

The average price is returned for each group; hence, we transform the time-series data with millions of rows into just thousands of rows, with one row per distinct kind of trip.

In [None]:
# Collapse rows that share all of the key columns below into a single row
# holding their mean price.
trip_keys = [
    'origin', 'destination', 'departure', 'duration',
    'vehicle_type', 'vehicle_class', 'fare',
]
renfe = renfe.groupby(trip_keys, as_index=False)['price'].mean()

### 1.4) Removing outliers:

In [None]:
# Box plot of the price column, used to look for extreme values.
sns.boxplot(data=renfe, x='price')

According to the box plot, there seem to be some outliers; however, we should check this more closely.

In [None]:
# Percentage of rows priced above 250 EUR. The mean of a boolean mask is the
# fraction of True values, and it is computed vectorised instead of iterating
# over the Series with the builtin sum().
(renfe['price'] > 250).mean() * 100

It appears that only 0.0023% of the rows have a price greater than 250 EUR; hence, we will delete the rows whose price is greater than 250 EUR.

In [None]:
# Remove only the rows priced above 250 EUR. The comparison is inclusive so that
# rows priced at exactly 250 EUR are kept (a strict "<" would discard them too).
renfe = renfe[renfe['price'] <= 250]

# 2) Map of routes:

In [None]:
# From here on only the two route endpoints are needed.
route_columns = ["origin", "destination"]
renfe = renfe.filter(items=route_columns)

Below we can see how frequent the different trips are. The most frequent ones appear to be between the big cities, such as Madrid and Barcelona.

In [None]:
# Five origin/destination pairs with the highest row count.
(
    renfe
    .filter(items=['origin', 'destination'])
    .groupby(['origin', 'destination'], as_index=False)
    .size()
    .sort_values(by=['size'], ascending=False)
    .head(5)
)

In [None]:
# Count the rows for each origin/destination pair, most frequent pair first.
route = (
    renfe
    .filter(['origin', 'destination'])
    .groupby(['origin', 'destination'], as_index=False)
    .size()
    .sort_values(['size'], ascending=False)
)
origin = route['origin'].tolist()
destination = route['destination'].tolist()

# Every city that appears at either end of a route (origins first, in route order).
# Taking the origins alone would leave a destination-only city without
# coordinates when the coordinate table is built later.
or_dest_unique = pd.concat([route['origin'], route['destination']]).unique().tolist()

The below code extracts the coordinates of the different origin and destination cities

In [None]:
# Geocode each city once. `coord` is a flat list of repeating
# (name, osm 'y', osm 'x') triples, i.e. name, latitude, longitude.
coord = []
for city in or_dest_unique:
    g = geocoder.osm(city + ",  Spain")
    # Without this guard a failed lookup surfaces as an opaque TypeError on
    # g.osm[...]; fail with a message that names the city instead.
    if not g.ok:
        raise ValueError(f"Could not geocode {city!r}")
    coord.extend((str(city), g.osm['y'], g.osm['x']))

The code below creates four columns according to the origin and destination:
- Origin: origin_lng (longitude) and origin_lat (latitude)
- Destination: destination_lng (longitude) and destination_lat (latitude)

In [None]:
# `coord` is a flat list of repeating (name, latitude, longitude) triples.
# Index it once by city name instead of scanning the list for every route.
coord_by_city = {}
for k in range(0, len(coord) - 2, 3):
    # setdefault keeps the first triple found for a name, as list.index() did.
    coord_by_city.setdefault(coord[k], (coord[k + 1], coord[k + 2]))

# Fail early, naming the cities, rather than with a row-length mismatch.
missing = sorted({city for city in origin + destination if city not in coord_by_city})
if missing:
    raise KeyError(f"No coordinates available for: {missing}")

# Build every row first and create the frame once instead of growing it row by row.
# Each point is stored latitude first, so the column labels follow that order
# (the positional layout of the four columns is unchanged).
df = pd.DataFrame(
    [(*coord_by_city[o], *coord_by_city[d]) for o, d in zip(origin, destination)],
    columns=['origin_lat', 'origin_lng', 'destination_lat', 'destination_lng'],
)
df = df.apply(pd.to_numeric)

In [None]:
# Preview the first five rows of the coordinate table.
df.head(n=5)

After that we can create the map with the help of the package folium

In [None]:
# Point the map is initially centred on.
centroid_lat = 40.208
centroid_lon = -3.713
m = folium.Map([centroid_lat, centroid_lon], zoom_start=5)

# The first two columns of df hold the origin point and the last two the
# destination point; they are read positionally.
for i in range(len(df)):
    origin_point = df.iloc[i, :2].tolist()
    destination_point = df.iloc[i, 2:].tolist()

    # Blue marker placed on the destination's own coordinates, so that its
    # popup (the destination name) sits on the right city.
    folium.CircleMarker(destination_point,
                        radius=15,
                        fill_color="blue",
                       ).add_child(folium.Popup(str(destination[i]))).add_to(m)

    # Red marker on the origin city.
    folium.CircleMarker(origin_point,
                        radius=15,
                        fill_color="red",
                       ).add_child(folium.Popup(str(origin[i]))).add_to(m)

    # Line joining the two ends of the route.
    folium.PolyLine([destination_point, origin_point],
                    color="red").add_to(m)

m

The map is interactive: you can click the circles to see the name of each city, and zoom in or out.

As we can see in the map above, the Spanish railway network has a radial shape centred on the capital, Madrid. In addition, a darker city indicates that many connections (as origin or destination) meet there. The cities with the most connections appear to be Madrid and Barcelona.