In [None]:
import numpy as np
import pandas as pd
import json

In [None]:
# Load the sample taxi records from CSV. Later cells read the columns
# 'lon', 'lat', 'time', 'is_passenger' and 'taxi_id' from this frame.
df = pd.read_csv("sample_taxi.csv")
# Bare last expression: show the loaded frame as the cell output.
df

The latitude range of Shenzhen:
22°27'N ~ 22°52'N
The longitude range of Shenzhen:
113°46'E ~ 114°37'E

In [None]:
# Rows rejected by the coordinate sanity check: a row lands here unless BOTH
# its longitude is within [73, 135] and its latitude is within [3, 50].
# NOTE(review): these bounds are far wider than a single city; presumably a
# coarse country-level bounding box -- confirm the intended range.
df_illegal_lon = df[~(df["lon"].between(73, 135) & df["lat"].between(3, 50))]
df_illegal_lon

In [None]:
def time_to_sec(time_str):
    """Convert an 'H:M:S' string into a total number of seconds.

    Parameters
    ----------
    time_str : str
        Three integer fields separated by ':' (hours, minutes, seconds).

    Returns
    -------
    int
        hours * 3600 + minutes * 60 + seconds.

    Raises
    ------
    ValueError
        If the string does not split into exactly three integer fields.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':'))
    # Horner form of hours * 3600 + minutes * 60 + seconds.
    return (hours * 60 + minutes) * 60 + seconds

In [None]:
# Keep only rows whose coordinates pass the sanity check (lat within [3, 50],
# lon within [73, 135]) and add 'total_seconds', the 'time' string converted
# to a number of seconds by time_to_sec.
#
# The column is created with .assign on the filtered frame rather than by
# writing through .loc afterwards: assigning into the result of a boolean
# selection is what triggers pandas' SettingWithCopyWarning.
df = (
    df[df["lat"].between(3, 50) & df["lon"].between(73, 135)]
    .assign(total_seconds=lambda frame: frame['time'].apply(time_to_sec))
)
df

In [None]:
# Split the cleaned records by the 'is_passenger' flag (1 vs 0).
df_with_passenger = df.loc[df["is_passenger"].eq(1)]
df_without_passenger = df.loc[df["is_passenger"].eq(0)]

In [None]:
# Order each subset by 'total_seconds' so the per-taxi paths built later
# list their points in ascending time order.
df_sorted_with_passenger = df_with_passenger.sort_values(by='total_seconds')
df_sorted_without_passenger = df_without_passenger.sort_values(by='total_seconds')

In [None]:
# Group each sorted frame by 'taxi_id' and convert it to an array of dictionaries
def _trips_by_taxi(frame):
    """Build one trip dictionary per taxi from a frame of GPS records.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'taxi_id', 'lon', 'lat', 'is_passenger' and
        'total_seconds'. Row order within each taxi is preserved, so the frame
        should already be sorted the way the path is meant to be read.

    Returns
    -------
    numpy.ndarray
        1-D object array with one dict per taxi (in sorted 'taxi_id' order),
        each holding 'taxi_id', 'path' (list of [lon, lat] pairs),
        'passenger' (the first row's 'is_passenger' value) and 'timestamps'
        (list of 'total_seconds' values). Empty input gives an empty array.
    """
    # Iterate over the groups instead of using groupby(...).apply(...): the
    # sub-frames yielded by iteration always keep every column, whereas
    # reading the grouping column inside apply() is deprecated in pandas 2.2
    # and the column is excluded from the sub-frame in later versions.
    trips = [
        {
            'taxi_id': group['taxi_id'].iloc[0],
            'path': group[['lon', 'lat']].values.tolist(),
            'passenger': group['is_passenger'].iloc[0],
            'timestamps': group['total_seconds'].values.tolist(),
        }
        for _, group in frame.groupby('taxi_id')
    ]
    # dtype=object keeps each dict as a single array element.
    return np.array(trips, dtype=object)


grouped_data_with_passenger = _trips_by_taxi(df_sorted_with_passenger)

grouped_data_without_passenger = _trips_by_taxi(df_sorted_without_passenger)

In [None]:
# Concatenate the two per-taxi arrays into one: with-passenger entries first,
# followed by without-passenger entries.
grouped_data = np.hstack([
    grouped_data_with_passenger,
    grouped_data_without_passenger,
])

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    # (NumPy type, converter) pairs, checked in this order by default().
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
        (np.bool_, bool),
    )

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy integers, floats, arrays and booleans are converted to ``int``,
        ``float``, ``list`` and ``bool`` respectively. Any other object is
        passed to ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)


# Path of the JSON file to write (relative to the current working directory)
output_file = 'taxi_trips.json'

# Serialize grouped_data to a JSON string; NpEncoder converts the NumPy
# array and NumPy scalar values that the standard encoder rejects.
# Serializing before the file is opened means an encoding error leaves no
# partially written file behind.
json_data = json.dumps(grouped_data, cls=NpEncoder)

# Write the JSON string to the output file (overwrites any existing file)
with open(output_file, 'w') as file:
    file.write(json_data)

# Print a confirmation message
print(f"Data successfully written to '{output_file}'.")