In [None]:
import numpy as np
import pandas as pd
import json

In [None]:
# Load the raw taxi sample; later cells read the columns
# lon, lat, time, taxi_id and is_passenger from this frame.
df = pd.read_csv("sample_taxi.csv")
df

The latitude range of Shenzhen:
22°27'N ~ 22°52'N
The longitude range of Shenzhen:
113°46'E ~ 114°37'E

In [None]:
# Records whose coordinates fall outside lon 73-135 / lat 3-50.
# NOTE(review): this box is much wider than the Shenzhen range quoted in the
# markdown above -- confirm that the coarse bounds are intentional.
df_illegal_lon = df[~(df["lon"].between(73, 135) & df["lat"].between(3, 50))]
df_illegal_lon

In [None]:
def time_to_sec(time_str):
    """Convert a colon-separated ``H:M:S`` string into a number of seconds.

    Args:
        time_str: Text made of exactly three integer fields separated by ``:``.

    Returns:
        ``hours * 3600 + minutes * 60 + seconds`` as an int.

    Raises:
        ValueError: If the string does not split into exactly three fields
            or a field cannot be parsed as an integer.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':'))
    return (hours * 60 + minutes) * 60 + seconds

In [None]:
# Keep only rows inside the lon 73-135 / lat 3-50 box. The explicit .copy()
# makes the result an independent frame, so adding a column below does not
# write into a selection of the original (pandas SettingWithCopyWarning).
df = df[df["lat"].between(3, 50) & df["lon"].between(73, 135)].copy()
# Seconds derived from the 'time' strings via time_to_sec.
df['total_seconds'] = df['time'].apply(time_to_sec)
df

In [None]:
# Order records by taxi and then by time, so that neighbouring rows of the
# same taxi are consecutive observations (the shift()-based checks in the
# following cells compare each row with its neighbours).
df_sorted = df.sort_values(['taxi_id', 'total_seconds'])
df_sorted

清除载客状态与前后时间点均不同的数据（e.g. 1110111 -> 111111）

In [None]:
# A row is an isolated state flip when its is_passenger value differs from
# both the previous row and the next row ...
passenger_cond1 = df_sorted['is_passenger'].ne(df_sorted['is_passenger'].shift(1))
passenger_cond2 = df_sorted['is_passenger'].ne(df_sorted['is_passenger'].shift(-1))
# ... and both of those neighbours belong to the same taxi.
passenger_cond3 = df_sorted['taxi_id'].eq(df_sorted['taxi_id'].shift(1))
passenger_cond4 = df_sorted['taxi_id'].eq(df_sorted['taxi_id'].shift(-1))

# Flag the anomalous rows, then keep everything else
passenger_cond = passenger_cond3 & passenger_cond4 & passenger_cond1 & passenger_cond2
df_sorted = df_sorted.loc[~passenger_cond]
df_sorted

In [None]:
from math import radians, sin, cos, sqrt, atan2

# Speed-plausibility check between two records, using the haversine distance
def calculate_distance_reasonable(id1, lat1, lon1, time1, id2, lat2, lon2, time2, max_speed_kmh=120):
    """Return whether the speed implied by two records is plausible.

    The great-circle (haversine) distance between the two positions is divided
    by the elapsed time and compared against ``max_speed_kmh``.

    Args:
        id1, id2: Identifiers of the two records; if they differ the records
            are not compared and the pair counts as reasonable.
        lat1, lon1, lat2, lon2: Coordinates in degrees (converted with
            ``radians``).
        time1, time2: Timestamps in seconds; only their absolute difference
            is used.
        max_speed_kmh: Highest speed, in km/h, still considered reasonable.
            Defaults to 120, the value that used to be hard-coded.

    Returns:
        bool: True when the ids differ or the implied speed does not exceed
        ``max_speed_kmh``; False otherwise, including when no time elapsed.
    """
    if id1 != id2:
        return True

    time_delta = abs(time2 - time1)
    if time_delta == 0:
        # No elapsed time means no defined speed. Dividing would raise
        # ZeroDivisionError for Python numbers and yield inf/nan (which
        # compare False) for NumPy floats; report "not reasonable" explicitly
        # so both input kinds behave the same.
        return False

    R = 6371.0  # Earth's radius in kilometers

    lat1_rad = radians(float(lat1))
    lon1_rad = radians(float(lon1))
    lat2_rad = radians(float(lat2))
    lon2_rad = radians(float(lon2))

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1, which would make sqrt(1 - a)
    # fail; min(a, 1.0) keeps NaN inputs propagating as NaN.
    a = min(a, 1.0)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c  # kilometers

    # km per second -> km per hour, then compare with the limit
    return bool(distance / time_delta * 3600 <= max_speed_kmh)

In [None]:
# Flag, for every record, whether the hop from the previous row (in
# taxi_id / total_seconds order) implies a plausible speed.
# df_sorted was produced by a boolean-mask selection, so take an explicit copy
# before adding a column to it (avoids pandas' SettingWithCopyWarning).
df_sorted = df_sorted.copy()
df_sorted_shift = df_sorted.shift(1)
# Walk the current and previous-row columns in lockstep instead of doing
# per-row .loc lookups inside DataFrame.apply; both frames share the same
# row order because the shifted frame keeps the original index.
df_sorted['reasonable'] = [
    calculate_distance_reasonable(*pair)
    for pair in zip(
        df_sorted['taxi_id'], df_sorted['lat'], df_sorted['lon'], df_sorted['total_seconds'],
        df_sorted_shift['taxi_id'], df_sorted_shift['lat'], df_sorted_shift['lon'], df_sorted_shift['total_seconds'],
    )
]

In [None]:
# Keep only the records whose 'reasonable' flag is True, i.e. those that
# passed the speed check against the previous row.
cleaned_df = df_sorted.loc[df_sorted['reasonable']]
cleaned_df

In [None]:
# From here on `df` refers to the cleaned frame; also persist it to disk.
df = cleaned_df
df.to_csv("./output.csv")

In [None]:
# Split the cleaned records by the is_passenger flag (1 vs 0)
df_with_passenger = df.loc[df["is_passenger"].eq(1)]
df_without_passenger = df.loc[df["is_passenger"].eq(0)]

In [None]:
# Order each subset chronologically by its 'total_seconds' column, so that the
# per-taxi paths built later list their points in time order.
df_sorted_with_passenger = df_with_passenger.sort_values('total_seconds')
df_sorted_without_passenger = df_without_passenger.sort_values('total_seconds')

In [None]:
# Preview the time-ordered records that have is_passenger == 1
df_sorted_with_passenger

In [None]:
# Group each subset by 'taxi_id' and build one trip record (dict) per taxi
def _trips_by_taxi(frame):
    """Collapse ``frame`` into a 1-D object array holding one dict per taxi.

    Each dict carries the taxi id, the ``[lon, lat]`` path, the is_passenger
    value of the group's first row and the matching ``total_seconds`` list,
    all in the row order of ``frame``.

    The groups are iterated directly instead of going through
    ``groupby(...).apply``: apply's access to the grouping column is
    deprecated in recent pandas, and on an empty frame its ``.values`` is not
    one-dimensional, which would break the later ``np.hstack``.
    """
    records = [
        {
            'taxi_id': group['taxi_id'].iloc[0],
            'path': group[['lon', 'lat']].values.tolist(),
            'passenger': group['is_passenger'].iloc[0],
            'timestamps': group['total_seconds'].values.tolist(),
        }
        for _, group in frame.groupby('taxi_id')
    ]
    # dtype=object keeps the dicts as elements; an empty list gives shape (0,)
    return np.array(records, dtype=object)


grouped_data_with_passenger = _trips_by_taxi(df_sorted_with_passenger)

grouped_data_without_passenger = _trips_by_taxi(df_sorted_without_passenger)

In [None]:
# Join the two per-taxi record arrays into a single array for export
grouped_data = np.hstack((grouped_data_with_passenger, grouped_data_without_passenger))

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into built-in types."""

    def default(self, obj):
        """Convert ``obj`` to something the json module can serialize.

        NumPy arrays become (nested) lists; NumPy integer, floating and
        boolean scalars become ``int``, ``float`` and ``bool``. Anything else
        is passed to the base class, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        conversions = ((np.integer, int), (np.floating, float), (np.bool_, bool))
        for numpy_type, builtin_type in conversions:
            if isinstance(obj, numpy_type):
                return builtin_type(obj)
        return super().default(obj)


# Define the output file path
output_file = 'taxi_trips.json'

# Serialize grouped_data to a JSON string; NpEncoder handles the NumPy
# array and scalar values that the default encoder would reject
json_data = json.dumps(grouped_data, cls=NpEncoder)

# Write the JSON data to the output file
with open(output_file, 'w') as file:
    file.write(json_data)

# Print a confirmation message
print(f"Data successfully written to '{output_file}'.")