In [None]:
import json
import os
from datetime import datetime, timezone, timedelta

import pandas as pd
import requests

### Divide the dataframe into 5 parts

Split the file of required routes into 5 smaller files (up to 300 rows each) so that the data can be extracted through the Google Distance Matrix API in batches.

In [41]:
# Load the routes table that the following cell splits into smaller files.
routes_csv_path = "datasets/new_geo_flights.csv"
updated_city_names_df = pd.read_csv(routes_csv_path)

In [42]:
# Maximum number of rows written to each chunk file
chunk_size = 300

# Calculate the number of files needed (ceiling division, so a final
# partial chunk still gets its own file)
num_files = (len(updated_city_names_df) + chunk_size - 1) // chunk_size

# Split the DataFrame into consecutive chunks of at most `chunk_size` rows
df_chunks = [
    updated_city_names_df.iloc[start:start + chunk_size]
    for start in range(0, len(updated_city_names_df), chunk_size)
]

# Sanity check: the slicing above must agree with the computed file count
assert len(df_chunks) == num_files, "chunk count does not match num_files"

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('datasets/flight_times3', exist_ok=True)

# Save each chunk to a separate file (file names are numbered from 1)
for file_number, chunk in enumerate(df_chunks, start=1):
    chunk.to_csv(f'datasets/flight_times3/flight_times_{file_number}.csv', index=False)

### Distance Matrix API

Extract train travel data (distance and duration) with the Google Distance Matrix API.

In [47]:
# Method for calling the API
def call_matrix_api(origins, destination, departure_time, mode='transit', api_key=None, timeout=30):
    """Query the Google Distance Matrix API and return the decoded JSON body.

    Parameters
    ----------
    origins : str
        Sent unchanged as the ``origins`` request parameter.
    destination : str
        Sent unchanged as the ``destinations`` request parameter.
    departure_time : int
        Sent unchanged as the ``departure_time`` request parameter.
    mode : str, default 'transit'
        Travel mode. When it is 'transit', the request is additionally
        restricted to trains via ``transit_mode=train``.
    api_key : str, optional
        API key to use. When omitted, the key is read from the
        ``GOOGLE_MAPS_API_KEY`` environment variable so that no credential
        has to be stored in the notebook.
    timeout : float, default 30
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    dict
        The parsed JSON response.

    Raises
    ------
    RuntimeError
        If no API key is supplied and the environment variable is not set.
    requests.HTTPError
        If the server answers with an HTTP error status.
    """
    url = 'https://maps.googleapis.com/maps/api/distancematrix/json'

    # Never hard-code the credential: take it from the caller or the environment.
    key = api_key or os.environ.get('GOOGLE_MAPS_API_KEY')
    if not key:
        raise RuntimeError(
            'No API key available: pass api_key=... or set the '
            'GOOGLE_MAPS_API_KEY environment variable.'
        )

    params = {
        'key': key,
        'origins': origins,
        'destinations': destination,
        'mode': mode,
        'departure_time': departure_time,
    }
    # transit_mode is only sent together with mode=transit.
    if mode == 'transit':
        params['transit_mode'] = 'train'

    # A timeout keeps a stalled connection from hanging the whole extraction loop.
    req = requests.get(url=url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    req.raise_for_status()
    return req.json()

# Convert '23.01.2024 08:00' to a UNIX timestamp
# NOTE(review): the wall-clock time is interpreted as UTC here, not as local
# time at the departure city — confirm this is intended.
departure_time_str = '23.01.2024 08:00'
departure_time_obj = datetime.strptime(departure_time_str, '%d.%m.%Y %H:%M')
departure_time_obj_utc = departure_time_obj.replace(tzinfo=timezone.utc)
departure_time_unix = int(departure_time_obj_utc.timestamp())

# Which chunk file to process in this run. The input and output paths are both
# derived from this single value so they cannot get out of sync.
chunk_number = 5

# Read the location data
df = pd.read_csv(f'datasets/flight_times3/flight_times_{chunk_number}.csv')

# Get the list of destinations (ordered by how often each one occurs)
destinations = df['arrival city'].value_counts().index

# One [origin, destination, distance, duration] record per API element;
# -1 marks a value that could not be obtained.
results = []

for destination in destinations:
    # Leave only rows with one destination
    df_temp = df[df['arrival city'] == destination]

    for origin in df_temp['departure city']:
        # Call the API for transit information (train mode)
        api_response = call_matrix_api(origin, destination, mode='transit', departure_time=departure_time_unix)

        # The whole request failed: record the pair with sentinel values
        if api_response['status'] != 'OK':
            results.append([origin, destination, -1, -1])
            continue

        # Extract transit information. The loop variable is deliberately not
        # called `row`, to avoid confusion with the DataFrame rows above.
        for matrix_row in api_response['rows']:
            for element in matrix_row['elements']:
                if element['status'] == 'OK':
                    transit_distance = element.get('distance', {}).get('value', -1)
                    transit_time_seconds = element.get('duration', {}).get('value', -1)
                    results.append([origin, destination, transit_distance, transit_time_seconds])
                else:
                    # This particular origin/destination pair has no result
                    results.append([origin, destination, -1, -1])

# Create a dataframe out of the results list
results_df = pd.DataFrame(results, columns=['departure city', 'arrival city', 'transit_distance [m]', 'transit_time [s]'])

# Export to CSV
results_df.to_csv(f'datasets/train_distances3/train_distances_{chunk_number}.csv', index=False)


### Combine the data back into one file

Combine the resulting `train_distances` files into a single file.

In [49]:
# How many per-chunk result files there are to merge
num_files = 5

# Load every per-chunk result file, in order from 1 to num_files
dfs = [
    pd.read_csv(f'datasets/train_distances3/train_distances_{file_number}.csv')
    for file_number in range(1, num_files + 1)
]

# Stack the chunks on top of each other with a fresh, continuous row index
combined_df = pd.concat(dfs, ignore_index=True)

# Write the merged table next to the per-chunk files
combined_df.to_csv('datasets/train_distances3/combined_train_distances3.csv', index=False)