In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
df = pd.read_excel('data/PASSENGER_RECORD.xlsx')

In [3]:
df.columns

Index(['PR_ID', 'PPID', 'TRAIN_TYPE', 'TRAIN_CODE', 'BOARD_DATE', 'BOARD_TIME',
       'ARRIVAL_DATE', 'ARRIVAL_TIME', 'START_STA', 'ARRIVAL_STA',
       'TRAVEL_TIME', 'TRAVEL_LENGTH', 'SEAT_TYPE', 'COACH_NO', 'SEAT_NO',
       'BUYYER_PID'],
      dtype='object')

车厢号、座位号、座位类别参考意义不大，直接drop

In [4]:
df = df.drop(['TRAIN_TYPE','TRAIN_CODE','COACH_NO','SEAT_NO','SEAT_TYPE'], axis=1)
df.columns

Index(['PR_ID', 'PPID', 'BOARD_DATE', 'BOARD_TIME', 'ARRIVAL_DATE',
       'ARRIVAL_TIME', 'START_STA', 'ARRIVAL_STA', 'TRAVEL_TIME',
       'TRAVEL_LENGTH', 'BUYYER_PID'],
      dtype='object')

BOARD_DATE、BOARD_TIME日期和时间是分离的，把它们合并到同一列上，ARRIVAL同理

In [5]:
df['BOARD_TIME'] = df['BOARD_DATE'] + df['BOARD_TIME'].apply(str).apply(pd.Timedelta)
df = df.drop(['BOARD_DATE'], axis=1)
df['BOARD_TIME']

0        2012-03-08 19:04:00
1        2012-03-12 08:01:00
2        2012-03-12 18:42:00
3        2012-03-01 08:03:00
4        2012-03-01 20:13:00
                 ...        
985754   2012-05-04 03:21:00
985755   2012-05-21 17:07:00
985756   2012-05-26 18:37:00
985757   2012-06-19 12:44:00
985758   2012-06-04 11:59:00
Name: BOARD_TIME, Length: 985759, dtype: datetime64[ns]

In [6]:
# ARRIVAL虽然看上去和BOARD一样，但实际上读取到的类型是datetime.time，所以做一个特殊的parser来处理，不能直接apply(str)
parser = lambda time : time.strftime('%H:%M:%S')
df['ARRIVAL_TIME'] = df['ARRIVAL_DATE'] + df['ARRIVAL_TIME'].apply(parser).apply(pd.Timedelta)
df = df.drop(['ARRIVAL_DATE'], axis=1)
df['ARRIVAL_TIME']

0        2012-03-08 20:04:00
1        2012-03-12 09:01:00
2        2012-03-12 19:42:00
3        2012-03-01 09:03:00
4        2012-03-01 21:13:00
                 ...        
985754   2012-05-04 03:21:00
985755   2012-05-21 19:07:00
985756   2012-05-27 02:37:00
985757   2012-06-19 12:44:00
985758   2012-06-04 17:59:00
Name: ARRIVAL_TIME, Length: 985759, dtype: datetime64[ns]

In [7]:
df.groupby('PPID')['TRAVEL_LENGTH'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
PPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
110103195807011008,56.0,670.714286,746.941198,0.0,89.00,268.5,1209.75,2343.0
110105196708252992,50.0,662.000000,397.694068,40.0,269.50,738.0,964.00,1241.0
110108195508280000,45.0,580.288889,408.452268,0.0,300.00,499.0,796.00,1408.0
110222199103150000,426.0,175.469484,132.810773,19.0,154.00,154.0,154.00,2420.0
110224198701275008,457.0,226.719912,267.501755,20.0,154.00,154.0,154.00,1691.0
...,...,...,...,...,...,...,...,...
65312419920615332X,43.0,281.395349,273.697708,0.0,6.00,266.0,426.00,1428.0
65412119900226066X,614.0,132.724756,141.314434,0.0,65.00,123.0,140.00,1537.0
65412319911204248X,56.0,331.625000,237.186659,3.0,138.75,299.0,472.75,946.0
65420119940522003X,41.0,403.975610,359.110323,14.0,117.00,277.0,694.00,1132.0


In [8]:
df['BOARD_TIME'].max() - df['BOARD_TIME'].min()

Timedelta('361 days 23:57:00')

数据为11076名乘客一年内的购票记录

In [10]:
bins = np.linspace(df['TRAVEL_LENGTH'].min(), df['TRAVEL_LENGTH'].max(), 15)
travel_length_cat = pd.cut(df['TRAVEL_LENGTH'], bins)
travel_length_cat.value_counts().plot(kind='bar')

(0.0, 231.357]          612835
(231.357, 462.714]      150576
(462.714, 694.071]       80667
(694.071, 925.429]       66871
(925.429, 1156.786]      36337
(1156.786, 1388.143]     16334
(1388.143, 1619.5]        6944
(1619.5, 1850.857]        4255
(2082.214, 2313.571]      2428
(1850.857, 2082.214]      1960
(3007.643, 3239.0]         577
(2313.571, 2544.929]       526
(2776.286, 3007.643]       203
(2544.929, 2776.286]       132
Name: TRAVEL_LENGTH, dtype: int64

- 出发站、到达站的填充：可以用列车编号、到达站（出发站）、运行时间完全相同的数据来填充
- 百度地图API转经纬度
- DBSCAN Isolation Forest

- 乘车时间间隔（设置阈值划分长途和短途）
- 乘车频率（可以按月统计）
- Num of boarding stations（可以看最常去的所占的比例）
- 夜间乘车次数
- 座位种类