In [79]:
import pandas as pd

airport_cities = pd.read_csv("../datasets/AG_airport_cities_cleaned.csv")
train_routes  = pd.read_csv('../datasets/train_routes.csv')
train_cities = pd.read_csv("../datasets/train_cities.csv")
flight_times = pd.read_csv("../datasets/flight_times_extended.csv")
flight_times = flight_times.iloc[:,1:]
train_routes = train_routes.iloc[:,1:]

Map old names of the cities. Same for trains will be below developped

In [80]:
# airport cities + routes [should be matched with a iata code - need for identifier ]
airport_cities_mapping = airport_cities[["original_city","new_cleaned_city"]].copy()
airport_cities_mapping.rename(columns={"new_cleaned_city":"city_new"}, inplace=True )
airport_cities_mapping

Unnamed: 0,original_city,city_new
0,Andenes,Andenes
1,Moscow,Moscow
2,Prague,Prague
3,Stavropol,Stavropol
4,Craiova,Craiova
...,...,...
299,Krasnoyarsk,Krasnoyarsk
300,Syktyvkar,Syktyvkar
301,Pescara,Pescara
302,Bragança,Braganca


In [81]:
train_cities["city_new"] = train_cities["city"].apply(lambda x: x.split("/")[0].strip())
train_cities["city_new"] = train_cities["city_new"].apply(lambda x: x.split("(")[0].strip())

#character replacements
replacements = {
    "København":"Copenhagen",
    "Warszawa":"Warsaw",
    "Wien":"Vienna",
    "ü": "u",
    "Ł" : "L",
    "ł" : "l",
    "ä" : "a",
    "ö" : "o",
    "å" : "a",
    "ú" : "u",
    "ń" : "n",
    "ş" : "s",
    "é" : "e",
    "ú" : "u",
    "è":"e",
    "ø":"o",
    "ţ":"t",
    "õ":"o",
    "ś":"s",
    "í":"i"
}

for kchar, vchar in replacements.items():
    train_cities["city_new"] = train_cities["city_new"].apply(lambda x: x.replace(kchar, vchar)) 
# train_cities["city_new"] = train_cities["city_new"].apply(lambda x: x.replace(""))
train_cities = train_cities.loc[ ~(train_cities["city_new"] ==  "s-Hertogenbosch")]
train_cities_set =  set(train_cities["city_new"])
airport_cities_set = set(airport_cities_mapping["city_new"])
matches = train_cities_set & airport_cities_set

# Matchings after cleaning
matchings = pd.DataFrame( [{"Matches": len(matches),
                            "Airports only": len(airport_cities_set - train_cities_set),
                            "Train cities only":len(train_cities_set - airport_cities_set)
                }])
matchings

Unnamed: 0,Matches,Airports only,Train cities only
0,84,218,577


The cities which match (are in both set) will be used to subset train routes:

In [91]:
# find flight routes for Matches
flight_times_arrival = flight_times.merge(airport_cities_mapping, right_on="original_city", left_on= "arrival city")
flight_times_arrival.drop(["arrival city", "original_city"], axis=1, inplace=True)
flight_times_arrival.rename(columns={"city_new":"city2"}, inplace=True)

flight_times_departure = flight_times_arrival.merge(airport_cities_mapping, right_on="original_city", left_on= "departure city")
flight_times_departure.drop(["departure city", "original_city"], axis=1, inplace=True)
flight_times_departure.rename(columns={"city_new":"city1"}, inplace=True)

flight_times_new = flight_times_departure.loc[flight_times_departure["city1"].isin(matches) & flight_times_departure["city2"].isin(matches)]
# flight_times_new.to_csv("../datasets/flight_times_cleaned.csv",index=False)
flight_times_new

Unnamed: 0,route,time,departure airport,arrival airport,city2,city1
1,"{'ABZ', 'AMS'}",95.0,ABZ,AMS,Amsterdam,Aberdeen
19,"{'LTN', 'AMS'}",70.0,LTN,AMS,Amsterdam,Luton
21,"{'LTN', 'BCN'}",125.0,LTN,BCN,Barcelona,Luton
22,"{'LTN', 'BFS'}",75.0,LTN,BFS,Belfast,Luton
23,"{'LTN', 'EDI'}",75.0,LTN,EDI,Edinburgh,Luton
...,...,...,...,...,...,...
1403,"{'MMX', 'ARN'}",70.0,MMX,ARN,Stockholm,Malmo
1404,"{'MMX', 'BMA'}",80.0,MMX,BMA,Stockholm,Malmo
1443,"{'STR', 'SKG'}",135.0,STR,SKG,Thessaloniki,Stuttgart
1451,"{'TUF', 'STN'}",85.0,TUF,STN,London,Tours


In [89]:
# train routes for matches
train_routes_city1 = train_routes.merge(train_cities, right_on="city", left_on= "city_1")
train_routes_city1.drop(["city", "city_1"], axis=1, inplace=True)
train_routes_city1.rename(columns={"city_new":"city1"}, inplace=True)

train_routes_city2 = train_routes_city1.merge(train_cities, right_on="city", left_on= "city_2")
train_routes_city2.drop(["city", "city_2"], axis=1, inplace=True)
train_routes_city2.rename(columns={"city_new":"city2"}, inplace=True)

train_routes_new = train_routes_city2.loc[ train_routes_city2["city1"].isin(matches) & train_routes_city2["city2"].isin(matches)]
# train_routes_new.to_csv("../datasets/train_routes_cleaned.csv",index=False)
train_routes_new

Unnamed: 0,time_avg[h],time_opti[h],time_opti[min],time_avg[min],city1,city2
7,1.54,0.86,51.6,92.4,Paris,Tours
92,1.52,1.11,66.6,91.2,London,Birmingham
97,1.82,1.48,88.8,109.2,London,Bristol
98,1.66,1.38,82.8,99.6,Birmingham,Bristol
113,1.45,1.14,68.4,87.0,London,Southampton
115,0.79,0.44,26.4,47.4,Bournemouth,Southampton
126,0.6,0.35,21.0,36.0,London,Luton
846,1.99,1.43,85.8,119.4,Madrid,Zaragoza
847,2.05,1.47,88.2,123.0,Barcelona,Zaragoza
952,1.19,0.81,48.6,71.4,Bologna,Parma


In [92]:
# math routes
train_routes_new["route_new"] =  train_routes_new.apply(lambda row: str( set([row["city1"], row["city2"]]) ), axis=1)
flight_times_new["route_new"] =  flight_times_new.apply(lambda row: str( set([row["city1"], row["city2"]]) ), axis=1)
flight_times_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_routes_new["route_new"] =  train_routes_new.apply(lambda row: str( set([row["city1"], row["city2"]]) ), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flight_times_new["route_new"] =  flight_times_new.apply(lambda row: str( set([row["city1"], row["city2"]]) ), axis=1)


Unnamed: 0,route,time,departure airport,arrival airport,city2,city1,route_new
1,"{'ABZ', 'AMS'}",95.0,ABZ,AMS,Amsterdam,Aberdeen,"{'Amsterdam', 'Aberdeen'}"
19,"{'LTN', 'AMS'}",70.0,LTN,AMS,Amsterdam,Luton,"{'Luton', 'Amsterdam'}"
21,"{'LTN', 'BCN'}",125.0,LTN,BCN,Barcelona,Luton,"{'Luton', 'Barcelona'}"
22,"{'LTN', 'BFS'}",75.0,LTN,BFS,Belfast,Luton,"{'Luton', 'Belfast'}"
23,"{'LTN', 'EDI'}",75.0,LTN,EDI,Edinburgh,Luton,"{'Luton', 'Edinburgh'}"
...,...,...,...,...,...,...,...
1403,"{'MMX', 'ARN'}",70.0,MMX,ARN,Stockholm,Malmo,"{'Stockholm', 'Malmo'}"
1404,"{'MMX', 'BMA'}",80.0,MMX,BMA,Stockholm,Malmo,"{'Stockholm', 'Malmo'}"
1443,"{'STR', 'SKG'}",135.0,STR,SKG,Thessaloniki,Stuttgart,"{'Thessaloniki', 'Stuttgart'}"
1451,"{'TUF', 'STN'}",85.0,TUF,STN,London,Tours,"{'London', 'Tours'}"


In [93]:
flight_times_new.merge(train_routes_new, on="route_new")

Unnamed: 0,route,time,departure airport,arrival airport,city2_x,city1_x,route_new,time_avg[h],time_opti[h],time_opti[min],time_avg[min],city1_y,city2_y
0,"{'DUS', 'FRA'}",50.0,DUS,FRA,Frankfurt am Main,Dusseldorf,"{'Frankfurt am Main', 'Dusseldorf'}",1.91,1.42,85.2,114.6,Frankfurt am Main,Dusseldorf
1,"{'FRA', 'DUS'}",50.0,FRA,DUS,Dusseldorf,Frankfurt am Main,"{'Frankfurt am Main', 'Dusseldorf'}",1.91,1.42,85.2,114.6,Frankfurt am Main,Dusseldorf
2,"{'FRA', 'STR'}",50.0,FRA,STR,Stuttgart,Frankfurt am Main,"{'Stuttgart', 'Frankfurt am Main'}",1.69,1.29,77.4,101.4,Frankfurt am Main,Stuttgart
