### Utrecht in the Netherlands
The dataset spans from 2017-01-01 to 2017-02-28.

In [None]:
import pandas as pd

# Location of the tab-separated journeys file
file_path = 'datasets/journeys.tsv'

# Load the headerless TSV and discard its final column in one chain
df_utrecht = (
    pd.read_csv(file_path, sep='\t', header=None)
    .pipe(lambda frame: frame.drop(columns=frame.columns[-1]))
)

# Name the five remaining columns
df_utrecht.columns = [
    "card_id",
    "start_time",
    "end_time",
    "origin",
    "destination",
]

# Datetime parsing of the two time columns is currently disabled:
# df_utrecht['start_time'] = pd.to_datetime(df_utrecht['start_time'])
# df_utrecht['end_time'] = pd.to_datetime(df_utrecht['end_time'])

In [None]:
# Preview the first ten rows of the Utrecht data
df_utrecht.head(n=10)

#### Number of Cardholders

In [None]:
# Report how many distinct card IDs appear in the Utrecht data
print("Number of cardholders:", df_utrecht.card_id.nunique())

In [None]:
# Number of rows per card_id (value_counts orders by descending count)
utrecht_card_id_count = df_utrecht.card_id.value_counts()
utrecht_card_id_count

In [None]:
import matplotlib.pyplot as plt

# Create a 6x4 inch figure with a single axes to draw on
fig_utrecht, ax_utrecht = plt.subplots(figsize=(6, 4))

# Histogram of the per-card row counts
utrecht_card_id_count.plot(
    kind='hist',
    ax=ax_utrecht,
    color='skyblue',
    edgecolor='black',
)

# Axis labels and title
ax_utrecht.set_xlabel('Travel Frequency', fontsize=12)
ax_utrecht.set_ylabel('Cardholders', fontsize=12)
ax_utrecht.set_title(
    'Distribution of Travel Frequency of Bus Passengers in Utrecht',
    fontsize=12,
)

# Force scientific notation on the y-axis tick labels
ax_utrecht.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

# Optional grid lines (disabled)
# ax_utrecht.grid(True, linestyle='--', alpha=0.5)

# Write the figure to the figs/ folder
fig_utrecht.savefig('figs/utrecht_travel_freq.png')

# Render the plot
plt.show()


### Shenzhen Metro
The data spans from 2014-06-09 to 2014-06-13

In [None]:
import pandas as pd
import glob

# Folder holding the raw Shenzhen metro card-swipe CSV files
folder_path = 'raw data/DT_GJC_地铁站刷卡数据/'

# Collect every .csv file in the folder. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep the row order of the
# combined dataframe reproducible between runs and machines.
csv_files = sorted(glob.glob(folder_path + '*.csv'))

# Fail early with a clear message rather than letting pd.concat raise its
# generic "No objects to concatenate" error when nothing matched.
if not csv_files:
    raise FileNotFoundError("No .csv files found in: " + folder_path)

# Initialize an empty list to store the per-file dataframes
dfs_SZ = []

# Read each headerless, comma-separated, GBK-encoded file into a dataframe
for file in csv_files:
    df_SZ = pd.read_csv(file, encoding='gbk', header=None, sep=',')
    dfs_SZ.append(df_SZ)

# Stack the per-file dataframes into one, renumbering the index from 0
combined_df_SZ = pd.concat(dfs_SZ, ignore_index=True)

# Name the eight columns
combined_df_SZ.columns = [
    "card_id",
    "fee",
    "actual_fee",
    "time",
    "route",
    "station",
    "train_id",
    "status",
]

# Convert the "time" column to pandas timestamps
combined_df_SZ['time'] = pd.to_datetime(combined_df_SZ['time'])


#### Number of total transactions

In [None]:
# Show only the rows whose status is '进站' (station entry)
combined_df_SZ.loc[combined_df_SZ['status'] == '进站']

In [None]:
# Report how many distinct card IDs appear across all Shenzhen rows
print("Number of cardholders:", combined_df_SZ.card_id.nunique())

In [None]:
# Keep the '进站' (station entry) rows only, then count rows per card_id
card_id_count_SZ = (
    combined_df_SZ.loc[combined_df_SZ['status'] == '进站', 'card_id']
    .value_counts()
)
card_id_count_SZ

In [None]:
# Plot how often each Shenzhen metro card appears among the entry rows.
# Relies on `plt` (matplotlib.pyplot) imported in an earlier cell and on
# `card_id_count_SZ` computed above.

# Set the figure size
plt.figure(figsize=(6, 4))

# Plot the histogram with 20 bins
card_id_count_SZ.plot(kind='hist', color='pink', edgecolor='black', bins=20)

# Set the labels and title. This is metro data, so the title says "Metro"
# rather than the "Bus" wording carried over from the Utrecht plot.
plt.xlabel('Travel Frequency', fontsize=12)
plt.ylabel('Cardholders', fontsize=12)
plt.title('Distribution of Travel Frequency of Metro Passengers in Shenzhen', fontsize=12)

# Set y-axis tick labels in scientific notation
plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))

# Save the figure to the figs/ folder
plt.savefig('figs/Shenzhen_travel_freq.png')

# Show the plot
plt.show()
