In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import xlrd
import time

In [2]:
# # start processing
# df = pd.read_excel('./data/PASSENGER_RECORD.xlsx')
# convert .xlsx file to .csv file for easy loading. Simply use MS Excel 'save as' function. 
df = pd.read_csv('./data/PASSENGER_RECORD.csv')

In [3]:
df.columns

Index(['PR_ID', 'PPID', 'TRAIN_TYPE', 'TRAIN_CODE', 'BOARD_DATE', 'BOARD_TIME',
       'ARRIVAL_DATE', 'ARRIVAL_TIME', 'START_STA', 'ARRIVAL_STA',
       'TRAVEL_TIME', 'TRAVEL_LENGTH', 'SEAT_TYPE', 'COACH_NO', 'SEAT_NO',
       'BUYYER_PID'],
      dtype='object')

In [4]:
# ['PR_ID', 'PPID'1, 'TRAIN_TYPE'1, 'TRAIN_CODE'1, 'BOARD_DATE', 'BOARD_TIME',
#        'ARRIVAL_DATE', 'ARRIVAL_TIME', 'START_STA'1, 'ARRIVAL_STA'1,
#        'TRAVEL_TIME', 'TRAVEL_LENGTH', 'SEAT_TYPE'1, 'COACH_NO', 'SEAT_NO',
#        'BUYYER_PID'1]

# Simple token features we use in our project are:
1. Passenger. PPID
    - Format: a integer indicates passenger id
2. Buyyer. BUYYER_PID
    - Format: a integer indicates buyyer id
3. Start Stations.  START_STA
    - Format: a integer indicates No. of Start Station
4. Arrival Stations.  ARRIVAL_STA
    - Format: a integer indicates No. of Arrival Station
5. Train Code. TRAIN_CODE
    - Format: a integer
6. Train Type. TRAIN_TYPE
    - Format: a integer
7. Seat type. SEAT_TYPE
    - Format: a integer 


In [5]:
simple_feature2idx = {'PPID':{},
                      'BUYYER_PID':{}, 
                      'START_STA':{}, 
                      'ARRIVAL_STA':{}, 
                      'TRAIN_CODE':{}, 
                      'TRAIN_TYPE':{}, 
                      'SEAT_TYPE':{}}

对每一列数据都建立一个value->idx的索引，重复值的索引是唯一的

In [6]:
# set look-up table
for feature in simple_feature2idx.keys():
    print("processing: {}".format(feature))
    for value in df[feature].apply(str).tolist():
        if value not in simple_feature2idx[feature]:
            simple_feature2idx[feature][value] = len(simple_feature2idx[feature])

processing: PPID
processing: BUYYER_PID
processing: START_STA
processing: ARRIVAL_STA
processing: TRAIN_CODE
processing: TRAIN_TYPE
processing: SEAT_TYPE


把df中的value全部转为index，用simple_feature_vectors存放

In [7]:
def tokenize_feature(all_features, feature2idx):
    index_list = [feature2idx[feat] for feat in all_features] 
    feature_vector = np.array(index_list)
    print(feature_vector.shape)
    return feature_vector

In [8]:
# set look-up table
simple_feature_vectors = {}
for feature in simple_feature2idx.keys():
    simple_feature_vectors[feature] = tokenize_feature(df[feature].apply(str).tolist(), simple_feature2idx[feature])

(985759,)
(985759,)
(985759,)
(985759,)
(985759,)
(985759,)
(985759,)


# Simple digit features we use in our project are:
1. Tranvel Time. TRAVEL_TIME
    - Format: a integer represent hours   
1. Tranvel Length. TRAVEL_LENGTH
    - Format: a integer represent distances   
3. Board Date.  BOARD_DATE
    - Format: a integer represent num of days after 2012/01/01 when boarding. 比如2012年1月2号就是1，2月15日就是30+15=45
4. Board Time. BOARD_TIME
    - Format: a interger represent num minutes from 0:0:0. 比如当日4点15分就是4*60+15=255
5. Arrival Date.  ARRIVAL_DATE
    - Format: a integer represent num of days after 2012/01/01 when boarding. 比如2012年1月2号就是1，2月15日就是30+15=45
6. Arrival Time.  ARRIVAL_TIME
    - Format: a interger represent num minutes from 0:0:0. 比如当日4点15分就是4*60+15=255
7. Seat Number.  SEAT_NO
    - Format: a integer 
8. Coach Number.  COACH_NO
    - Format: a integer 


In [9]:
digit_feature2idx = {'TRAVEL_TIME':{},
                     'TRAVEL_LENGTH':{},
                     'BOARD_DATE':{}, 
                     'BOARD_TIME':{}, 
                     'ARRIVAL_DATE':{}, 
                     'ARRIVAL_TIME':{}, 
                     'SEAT_NO':{}, 
                     'COACH_NO':{}}

处理数值型特征，travel_length, seat_no, coach_no这3个特征直接转成int即可

In [10]:
digit_feature_vectors = {}
for feature in digit_feature2idx.keys():
    if feature in ['TRAVEL_LENGTH', 'SEAT_NO', 'COACH_NO']:
        index_list = df[feature].apply(int).tolist()
        feature_vector = np.array(index_list)
        print("{}\t{}".format(feature, feature_vector.shape))
        digit_feature_vectors[feature] = feature_vector

TRAVEL_LENGTH	(985759,)
SEAT_NO	(985759,)
COACH_NO	(985759,)


时间数据，特殊处理，因为都是2012年一年内的数据，那么用2012-01-01 00:00:00作为一个初始值，将出发时间与到达时间都与这个初始时间作差


In [11]:
board_times = df['BOARD_DATE'].apply(pd.Timestamp) + df['BOARD_TIME'].apply(pd.Timedelta)
# parser = lambda time : time.strftime('%H:%M:%S')
# arrival_times = df['ARRIVAL_DATE'] + df['ARRIVAL_TIME'].apply(parser).apply(pd.Timedelta)  # 如果上面读取的是csv就不用搞parser
arrival_times = df['ARRIVAL_DATE'].apply(pd.Timestamp) + df['ARRIVAL_TIME'].apply(pd.Timedelta)
initial_timestamp = pd.Timestamp('2012-01-01T00')  # Timestamp('2012-01-01 00:00:00')

feature_vector = np.array((board_times - initial_timestamp).astype('timedelta64[D]').tolist())
print("{}\t{}".format('BOARD_DATE', feature_vector.shape))
digit_feature_vectors['BOARD_DATE'] = feature_vector

feature_vector = np.array([h*60+m for h, m in zip(board_times.dt.hour.tolist(), board_times.dt.minute.tolist())])
print("{}\t{}".format('BOARD_TIME', feature_vector.shape))
digit_feature_vectors['BOARD_TIME'] = feature_vector

feature_vector = np.array((arrival_times - initial_timestamp).astype('timedelta64[D]').tolist())
print("{}\t{}".format('ARRIVAL_DATE', feature_vector.shape))
digit_feature_vectors['ARRIVAL_DATE'] = feature_vector

feature_vector = np.array([h*60+m for h, m in zip(arrival_times.dt.hour.tolist(), arrival_times.dt.minute.tolist())])
print("{}\t{}".format('ARRIVAL_TIME', feature_vector.shape))
digit_feature_vectors['ARRIVAL_TIME'] = feature_vector


BOARD_DATE	(985759,)
BOARD_TIME	(985759,)
ARRIVAL_DATE	(985759,)
ARRIVAL_TIME	(985759,)


In [12]:
len(simple_feature2idx['PPID'])

1903

In [13]:
from collections import Counter

In [14]:
month2days = {0: -1, 1: 30, 2: 59, 3: 90, 4: 120, 5: 151, 6: 181, 7:212, 8:243, 9:273, 10:304, 11:334, 12:365}
days2month = {idx2:idx1 for idx1, idx2 in month2days.items()}

0 PPID

1 BUYYER_PID

2 START_STA

3 ARRIVAL_STA

4 TRAIN_CODE

5 TRAIN_TYPE

6 SEAT_TYPE


7 TRAVEL_LENGTH

8 SEAT_NO

9 COACH_NO

10 BOARD_DATE

11 BOARD_TIME

12 ARRIVAL_DATE

13 ARRIVAL_TIME

# Advanced features we use in our project are 
6. Buyyer is the same with Passenger? BUYYER_PID/PPID  new: SAME_PERSON  
    - Format: a integer 1 or 0

# 讨论后需要的features，以ppid为单位
2. 总乘车次数（按月统计？）NUM_TRIP
1. 最高频率线路占比：通勤（统计每个id的所有站，top2占比？） ROUTE_FREQ
4. 夜间乘车次数（比如定义23:00-5:00属于夜间）NIGHT_TRIP
3. 短时间出行次数（time<3？） SHORT_TRIP
5. 一线城市比例（始发站or终点站）BIG_CITY_FREQ
6. 站票/硬座比例  HARD_SEAT

In [15]:
advanced_feature_vectors = {'NUM_TRIP':[], 'ROUTE_FREQ':[], 'SHORT_TRIP':[], 'NIGHT_TRIP':[], 'BIG_CITY_FREQ':[], 'HARD_SEAT':[]}

In [16]:
all_feature_vectors = np.stack(list(simple_feature_vectors.values()) + 
                               list(digit_feature_vectors.values()), axis=0)
print(all_feature_vectors.shape)

(14, 985759)


In [17]:
passenger_rows_index = {ppid: np.where(simple_feature_vectors['PPID'] == simple_feature2idx['PPID'][ppid]) for ppid in simple_feature2idx['PPID']}

In [18]:
passenger_rows_index_small = {ppid:passenger_rows_index[ppid] for ppid in list(passenger_rows_index.keys())[:5]}

In [19]:
all_final_features = []
for ppid, index in passenger_rows_index.items():
# for ppid, index in passenger_rows_index_small.items():
    index = index[0]
    features = np.zeros(22)
    vectors = all_feature_vectors[:,index]
    num_trip_all = np.array(vectors.shape[1])   # 0
    features[0] = num_trip_all
#     print(features)
    
    # 总乘车次数（按月统计？）NUM_TRIP
    num_trip_monthly = np.zeros(12)   # 1-12
    for month in range(1, 13):
        min_days = month2days[month-1]
        max_days = month2days[month]
        month_trip_index = np.where((max_days>=vectors[10,:]) == (vectors[10,:]>=min_days))[0]
        num_trip_monthly[month-1] = month_trip_index.shape[0]
    features[1:13] = num_trip_monthly
#     print(features)

    # 最高频率线路占比：通勤（统计每个id的所有站，top2占比？） ROUTE_FREQ
    all_routes = [(vectors.astype(np.int)[2, i], vectors.astype(np.int)[3, i]) for i in range(num_trip_all)]
    route_counter = Counter(all_routes)
    num_routes = np.array(len(all_routes))  # 13
    num_most_freq_route = np.array(route_counter.most_common(1)[0][1])  # 14
    try:
        num_second_freq_route = np.array(route_counter.most_common(2)[1][1])  # 15
    except:
        num_second_freq_route = 0
    portion_most_freq_route = np.array(route_counter.most_common(1)[0][1]/num_trip_all)  # 16
    portion_second_freq_route = np.array(num_second_freq_route/num_trip_all)  # 17
    features[13] = num_routes
    features[14] = num_most_freq_route
    features[15] = num_second_freq_route
    features[16] = portion_most_freq_route
    features[17] = portion_second_freq_route
#     print(features)

    # 夜间乘车次数（比如定义23:00-5:00属于夜间）NIGHT_TRIP
    start_night_index = np.where( (vectors[11,:]>23*60) == (vectors[11,:]<5*60))[0]
    num_start_night_index = np.array(start_night_index.shape[0])  # 18
    end_night_index = np.where( (vectors[13,:]>23*60) == (vectors[13,:]<5*60))[0]
    num_end_night_index = np.array(end_night_index.shape[0])  # 19
    features[18] = num_start_night_index
    features[19] = num_end_night_index
#     print(features)

    # 短时间出行次数（time<3？） SHORT_TRIP
    trip_hours = (vectors[13,:] - vectors[11,:])/60 + (vectors[12,:] - vectors[10,:])*24  
    num_short_trip = np.where(trip_hours<3)[0].shape[0]  # 20
    features[20] = num_short_trip
#     print(features)
    
    # 站票/硬座比例 HARD_SEAT
    num_hardseat = np.where(vectors[6,:]==simple_feature2idx['SEAT_TYPE']['YZ'])[0].shape[0]
    portion_hardseat = num_hardseat / num_trip_all  # 21
    features[21] = portion_hardseat
    
    # TODO 一线城市比例（始发站or终点站）BIG_CITY_FREQ
    
    all_final_features.append(features)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [20]:
all_final_features = np.stack(all_final_features, axis=0)
print(all_final_features.shape)

(1903, 22)


In [21]:
np.save('./feature_vector_v1.npy', all_final_features)