# Calculate Straight-Line Distance to the Nearest Train Station

In [1]:
import requests
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time

In [2]:
# Load the metro train station records; the LATITUDE / LONGITUDE columns are
# what the nearest-station search below reads.
train = pd.read_csv(
    "../data/landing/datasource-VIC_Govt_PTV-VIC_Govt_DELWP_datavic_PTV_METRO_TRAIN_STATION.csv"
)
train.head()

Unnamed: 0,FID,OBJECTID,STOP_ID,STOP_NAME,LATITUDE,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,SHAPE,SE_ANNO_CAD_DATA
0,PTV_METRO_TRAIN_STATION.fid-3c3b5ed6_18344d972...,146,19970,Royal Park Railway Station (Parkville),-37.7812,144.9523,1,Upfield,POINT (-37.78119297 144.95230116),
1,PTV_METRO_TRAIN_STATION.fid-3c3b5ed6_18344d972...,147,19971,Flemington Bridge Railway Station (North Melbo...,-37.7881,144.9393,1,Upfield,POINT (-37.78813998 144.93932316),
2,PTV_METRO_TRAIN_STATION.fid-3c3b5ed6_18344d972...,148,19972,Macaulay Railway Station (North Melbourne),-37.7943,144.9362,1,Upfield,POINT (-37.794267 144.93616596),
3,PTV_METRO_TRAIN_STATION.fid-3c3b5ed6_18344d972...,149,19973,North Melbourne Railway Station (West Melbourne),-37.8074,144.9426,1,"Flemington,Sunbury,Upfield,Werribee,Williamsto...",POINT (-37.80741897 144.94257),
4,PTV_METRO_TRAIN_STATION.fid-3c3b5ed6_18344d972...,150,19974,Clifton Hill Railway Station (Clifton Hill),-37.7887,144.9954,1,"Mernda,Hurstbridge",POINT (-37.78865703 144.99541692),


In [3]:
def distance(lat1, lon1, lat2, lon2):
    """
    Approximate the straight-line distance between two latitude/longitude points.

    The two points are projected onto a flat plane: one degree of latitude is
    treated as 111 km, and one degree of longitude as 111 km scaled by the
    cosine of the mean latitude of the two points.

    Parameters:
    - lat1, lon1: Latitude and longitude of the first location, in degrees.
    - lat2, lon2: Latitude and longitude of the second location, in degrees.

    Returns:
    - float: The approximate distance between the two locations in kilometers.
    """

    km_per_degree = 111

    # Meridians converge towards the poles, so the east-west span of a degree
    # shrinks with cos(latitude); the mean latitude of the pair is used.
    mean_lat_rad = np.radians((lat1 + lat2) / 2.0)

    north_south_km = (lat2 - lat1) * km_per_degree
    east_west_km = (lon2 - lon1) * km_per_degree * np.cos(mean_lat_rad)

    # Hypotenuse of the north-south / east-west right triangle
    return np.sqrt(north_south_km**2 + east_west_km**2)


In [4]:
def find_nearest_train(house_lat, house_lon, train):

    """
    Find the straight-line distance from a house to its nearest train station.

    Parameters:
    - house_lat: Latitude of the house.
    - house_lon: Longitude of the house.
    - train: DataFrame containing train station locations with 'LATITUDE' and 'LONGITUDE' columns.

    Returns:
    - float: The minimum distance in kilometers to the nearest train station.
    """

    # distance() is composed of element-wise NumPy operations, so handing it the
    # whole station columns yields every house-to-station distance in one call
    # instead of running a Python lambda once per station row.
    distances = distance(house_lat, house_lon, train['LATITUDE'], train['LONGITUDE'])

    # Return the closest distance
    return distances.min()

In [5]:
# Load the house records; their 'Latitude' and 'Longitude' columns are read
# when the nearest-station distance is computed.
df = pd.read_csv(
    "../data/raw/domain_outliers_removed.csv"
)

In [6]:
# For every house row, look up the distance (km) to its closest train station
# and store it in a new 'NearestTrainDistance' column.
df['NearestTrainDistance'] = df.apply(
    lambda house: find_nearest_train(house['Latitude'], house['Longitude'], train),
    axis=1,
)

# Write the augmented table to CSV. With to_csv's defaults the DataFrame index
# is written as well, as an unnamed leading column.
df.to_csv("../data/raw/merge_requirement/Direct_train.csv")