# Taxi_Task_Data_Preprocessing
* input.
    - train.csv
    - test.csv
* output.
    - train_preprocessing.csv
    - test_preprocessing.csv
* description.
    - We clean the data and generate features from the raw data.

In [3]:
import pandas as pd
import datetime
import ast
import time

from joblib import load, dump

import math
from sklearn.cluster import KMeans

# Read training/testing data

In [2]:
# Load the raw training set; the trailing expression makes the notebook display its shape.
train_data = pd.read_csv('train.csv')
train_data.shape

(1710670, 9)

In [3]:
# Load the raw testing set; the trailing expression makes the notebook display its shape.
test_data = pd.read_csv('test.csv')
test_data.shape

(320, 9)

# Clean

Remove the records with missing data (MISSING_DATA is True) or an empty POLYLINE.

In [2]:
def clean_missing_true(df):
    '''
    param. df:dataframe
    return. df:dataframe
    description.
        keep only the rows whose MISSING_DATA value equals False,
        then renumber the index from 0
    '''
    not_missing = df['MISSING_DATA'].eq(False)
    kept_rows = df.loc[not_missing]
    return kept_rows.reset_index(drop=True)
def clean_empty_polyline(df):
    '''
    param. df:dataframe
    return. df:dataframe
    description.
        drop the rows whose POLYLINE is exactly the string '[]',
        then renumber the index from 0
    '''
    has_points = df['POLYLINE'].ne('[]')
    kept_rows = df.loc[has_points]
    return kept_rows.reset_index(drop=True)

def TaxiData_Cleaning(df,mode='train'):
    '''
    param. df:dataframe
    param. mode:str, 'test' refuses to clean; any other value cleans
    return. df:dataframe (None in test mode)
    description.
        remove the rows flagged by MISSING_DATA and the rows with an
        empty POLYLINE, and print how many records were deleted
    '''
    # The testing data is never filtered: report it and bail out early.
    if mode == 'test':
        print('Can not clean the testing data.')
        return None

    shape_before = df.shape
    print('before first clean, shape:',shape_before)

    cleaned = clean_empty_polyline(clean_missing_true(df))
    removed = shape_before[0]-cleaned.shape[0]

    print('after cleaning')
    print('\tdelete',removed,'records')
    print('\tnow, the shape of dataset:',cleaned.shape)
    return cleaned

# EDA: some categorical features

In [7]:
# Count how many trips carry each CALL_TYPE value.
train_data.CALL_TYPE.value_counts()

B    817881
C    528019
A    364770
Name: CALL_TYPE, dtype: int64

In [14]:
# Count how many trips carry each ORIGIN_STAND value
# (value_counts skips NaN by default, so trips without a stand are not listed).
train_data.ORIGIN_STAND.value_counts()

15.0    80187
57.0    50740
9.0     34767
33.0    34107
23.0    33269
        ...  
41.0      506
43.0      493
8.0       378
5.0        53
48.0        7
Name: ORIGIN_STAND, Length: 63, dtype: int64

In [6]:
# Count how many trips carry each DAY_TYPE value.
train_data.DAY_TYPE.value_counts()

A    1710670
Name: DAY_TYPE, dtype: int64

In [7]:
# Count the MISSING_DATA flags; the True rows are the ones clean_missing_true removes.
train_data.MISSING_DATA.value_counts()

False    1710660
True          10
Name: MISSING_DATA, dtype: int64

In [18]:
# Count how many trips each TAXI_ID made.
train_data.TAXI_ID.value_counts()

20000483    6704
20000621    6549
20000424    6459
20000089    6421
20000307    6272
            ... 
20000312     491
20000248     345
20000585     270
20000079      90
20000170       7
Name: TAXI_ID, Length: 439, dtype: int64

# Preprocessing Func

- timestamp to datetime
- get start and end point from polyline

## timestamp 2 datetime

In [3]:
def timestamp2dt(df):
    '''
    param. df:dataframe (needs a 'TIMESTAMP' column holding Unix seconds)
    return. df:dataframe (the same object, columns added in place)
    description.
        df['TIMESTAMP'] -> df['data_time'] (timezone-aware, UTC),
                           df['year'],df['month'],df['day'],
                           df['hour'],df['min'],df['weekday']
        e.g.
            1372636858 -> 2013-07-01 00:00:58+00:00 -> 2013, 7, 1, 0, 0, 0 (monday)
        df['weekday']: 0 for Monday, 6 for Sunday
        df['TIMESTAMP'] itself is converted to float

        The conversion is vectorised, so an empty dataframe yields empty
        columns instead of failing on the .dt accessor.
    '''
    df["TIMESTAMP"] = df["TIMESTAMP"].astype(float)
    # unit="s": the values are seconds since the epoch; utc=True keeps the
    # result timezone-aware in UTC.
    df["data_time"] = pd.to_datetime(df["TIMESTAMP"], unit="s", utc=True)

    moment = df["data_time"].dt
    df["year"] = moment.year
    df["month"] = moment.month
    df["day"] = moment.day
    df["hour"] = moment.hour
    df["min"] = moment.minute
    df["weekday"] = moment.weekday # 0 for Monday, 6 for Sunday

    return df

## weekday group

In [4]:
def weekday_group_gen(df):
    '''
    param. df:dataframe (needs a 'weekday' column, 0 for Monday ... 6 for Sunday)
    return. df:dataframe (the same object, 'weekday_group' added in place)
    description.
        Monday - Friday (0-4) -> group 1 (off-peak)
        Saturday, Sunday (5-6) -> group 0 (peak)
        a weekday outside 0-6 raises KeyError
    '''
    # key: weekday number, value: group
    group_of_day = {day: (1 if day < 5 else 0) for day in range(7)}

    def lookup(day):
        return group_of_day[day]

    df['weekday_group'] = df['weekday'].apply(lookup)
    return df

## hour group

In [1]:
# def hr2group_dict_gen():
#     df_tmp = pd.read_csv('train_preprocessing.csv')
#     avg_period = dict()
#     for i in range(24):
#         avg_period[i] = (sum(df_tmp[df_tmp.hour == i].period)/len(df_tmp[df_tmp.hour == i]))

#     hour2group = dict()
#     curr_base = 612.553176503733
#     curr_group_count = 0
#     for k,v in dict(sorted(avg_period.items(), key=lambda item: item[1])).items():
#         if(v<curr_base+50):
#             hour2group[k] = curr_group_count
#         else:
#             curr_base = v
#             curr_group_count += 1
#             hour2group[k] = curr_group_count
#     return hour2group

In [6]:
def hour_group_gen(df):
    '''
    param. df:dataframe (needs an 'hour' column with values 0-23)
    return. df:dataframe (the same object, 'hour_group' added in place)
    description.
        df['hour'] -> df['hour_group'] through a fixed hour-to-group table
        an hour that is not in the table raises KeyError

        The values are read by position, not by index label, so the
        function also works when the index holds repeated labels.
        NOTE(review): the table looks like the output of the commented-out
        hr2group_dict_gen helper (hours grouped by average period) - confirm.
    '''
    hour2group = {
        # group 0
        21: 0, 22: 0, 23: 0, 0: 0, 1: 0, 2: 0, 3: 0,
        # group 1
        4: 1, 5: 1, 6: 1, 20: 1,
        # group 2
        7: 2, 9: 2, 10: 2, 11: 2, 12: 2, 13: 2, 19: 2,
        # group 3
        8: 3, 14: 3, 15: 3, 18: 3,
        # group 4
        16: 4, 17: 4,
    }
    df['hour_group'] = [hour2group[hour] for hour in df['hour']]
    return df

## extract_start_end_point

In [7]:
def extract_start_end_point(df,mode='train',test_EndPoint_source = 'y_test_end_point.csv'):
    '''
    param. df:dataframe (needs a 'POLYLINE' column of strings such as '[[lon,lat],[lon,lat],...]')
    param. mode:str, 'test' selects the test branch; any other value is handled as train
    param. test_EndPoint_source:str or None (test mode only)
        csv path that provides the columns 'LONGITUDE','LATITUDE';
        'not_used_end_point' or None skips loading the end points
    return. df:dataframe (the same object, columns added in place)
    description.
        train mode:
            df['POLYLINE'] -> df['start_lon'], df['start_lat'],
                              df['mid_lon'], df['mid_lat'],   (point at index len//2)
                              df['end_lon'], df['end_lat'],
                              df['period']
            period = (num of points in polyline - 1) * 15
        test mode:
            df['POLYLINE'] -> df['start_lon'], df['start_lat']
            df['end_lon'], df['end_lat'] <- csv given by test_EndPoint_source

        A polyline that cannot be parsed is reported and every value
        generated for that row is set to 0. All values of a row are computed
        before anything is stored, so a half-valid polyline can no longer
        leave the columns with different lengths.
    '''
    # Errors a malformed / empty / non-string POLYLINE can raise while parsing.
    parse_errors = (ValueError, SyntaxError, TypeError, IndexError, KeyError)

    if mode=='test':
        start_points = []
        for i, raw_polyline in df['POLYLINE'].items():
            try:
                first = ast.literal_eval(raw_polyline)[0]
                start = (first[0], first[1])
            except parse_errors:
                print('fail to extract start and end lon,lat: ',i)
                start = (0, 0)
            start_points.append(start)

        df['start_lon'] = [point[0] for point in start_points]
        df['start_lat'] = [point[1] for point in start_points]

        # read end point predicted by task1
        if test_EndPoint_source == 'not_used_end_point':
            print('not_used_end_point')
        elif test_EndPoint_source is not None:
            try:
                df[['end_lon','end_lat']] = pd.read_csv(test_EndPoint_source)[['LONGITUDE','LATITUDE']]
            except (OSError, KeyError, ValueError):
                # best effort: report the problem and go on without end points
                print('Fail to read TestingData.EndPoint source:',test_EndPoint_source)
                print('Check the file:')
                print('\t1. exist?')
                print('\t2. Has the column name: [\'LONGITUDE\',\'LATITUDE\'] ?')

    else:
        # one tuple per row: (start_lon, start_lat, mid_lon, mid_lat, end_lon, end_lat, period)
        rows = []
        for i, raw_polyline in df['POLYLINE'].items():
            try:
                point_list = ast.literal_eval(raw_polyline)
                start_point = point_list[0]
                mid_point = point_list[len(point_list) // 2]
                end_point = point_list[-1]
                row = (start_point[0], start_point[1],
                       mid_point[0], mid_point[1],
                       end_point[0], end_point[1],
                       (len(point_list)-1) * 15)
            except parse_errors:
                print('fail to extract start and end lon,lat: ',i)
                row = (0, 0, 0, 0, 0, 0, 0)
            rows.append(row)

        column_names = ['start_lon', 'start_lat', 'mid_lon', 'mid_lat',
                        'end_lon', 'end_lat', 'period']
        for position, column in enumerate(column_names):
            df[column] = [row[position] for row in rows]

    return df

## Get the distance between the start and end points, and drop the rows whose distance <= threshold (default 1)

In [8]:
def lon_lat_distance(latA, lonA, latB, lonB):
    '''
    param. (lat,lon) of A point, then (lat,lon) of B point, in degrees
    return. distance:float
    description.
        compute the distance between 2 points with the formula provided by kaggle competition
        (haversine form; the result is scaled by 6371, presumably the
        Earth radius in km - confirm against the competition page)
    '''
    # differences and latitudes converted from degrees to radians
    delta_lat = abs(latA-latB)*math.pi/180
    delta_lon = abs(lonA-lonB)*math.pi/180
    phi_a = latA*math.pi/180
    phi_b = latB*math.pi/180

    sin_half_lat = math.sin(delta_lat/2)
    sin_half_lon = math.sin(delta_lon/2)
    a = sin_half_lat*sin_half_lat+math.cos(phi_a)*math.cos(phi_b)*sin_half_lon*sin_half_lon

    central_angle = 2*math.atan2(math.sqrt(a),math.sqrt(1-a))
    return 6371*central_angle

def get_distance(df, mode='train',distance_threshold=1):
    '''
    param. df:dataframe
    param. mode:str, only 'train' filters the rows afterwards
    param. distance_threshold:number passed to clean_short_distance (train mode only)
    return. df:dataframe
    description.
        df['distance'] <- df['start_lat'], df['start_lon'], df['end_lat'], df['end_lon']

        Best effort: when the distance can not be generated (e.g. the end
        point columns do not exist) a message is printed and df is left
        without a 'distance' column.
        The coordinates are read by position, not by index label, so an
        index with repeated labels is handled too.
        In train mode the rows with distance <= distance_threshold are dropped.
    '''
    try:
        coordinates = zip(df['start_lat'], df['start_lon'], df['end_lat'], df['end_lon'])
        df['distance'] = [lon_lat_distance(lat_a, lon_a, lat_b, lon_b)
                          for lat_a, lon_a, lat_b, lon_b in coordinates]
    except (KeyError, TypeError, ValueError):
        # missing column, non-numeric value or math domain error
        print('fail to gen distance')
    if mode=='train':
        return clean_short_distance(df,threshold=distance_threshold)
    return df

def clean_short_distance(df,threshold=1):
    '''
    param. df:dataframe (needs a 'distance' column)
    param. threshold:number
    return. df:dataframe
    description.
        keep only the rows whose distance is strictly greater than
        threshold, then renumber the index from 0
    '''
    long_enough = df['distance'].gt(threshold)
    kept_rows = df.loc[long_enough]
    return kept_rows.reset_index(drop=True)

## combine feature CALL_TYPE and ORIGIN_STAND

In [9]:
def combine_call_type_origin_stand(df):
    '''
    param. df:dataframe (needs the columns 'CALL_TYPE' and 'ORIGIN_STAND')
    return. df:dataframe (the same object, 'CALL_TYPE_STAND' added in place)
    description.
        df['CALL_TYPE_STAND'] <- df['CALL_TYPE'] + str(df['ORIGIN_STAND'])
        e.g. 'B7.0' <- 'B' + 7.0   (ORIGIN_STAND stored as float)
             'B7'   <- 'B' + 7     (ORIGIN_STAND stored as int)
             'C'    <- 'C' + NaN   (no stand: only the CALL_TYPE is kept)

        we try it since we suppose the feature 'CALL_TYPE' and 'ORIGIN_STAND' could imply some infomation about start point
        e.g. start from specific station or hotspot

        A missing stand is detected with pd.isna instead of looking for the
        character '0' in the combined string, so stand ids are kept whatever
        the dtype of the column is.
    '''
    df['CALL_TYPE_STAND'] = [
        call_type if pd.isna(stand) else f'{call_type}{stand}'
        for call_type, stand in zip(df['CALL_TYPE'], df['ORIGIN_STAND'])
    ]
    return df

## Point Clustering

### point_cluster

In [10]:
def point_cluster(df,n_clusters=70,mode='train',test_end_cluster_method = 'KMEANS'):
    '''
    param. df:dataframe
    param. n_clusters:int, number of KMEANS clusters (also part of the file names)
    param. mode:str, 'train' or 'test'; any other value returns df untouched
    param. test_end_cluster_method:str, 'KMEANS' or 'RF' (test mode only)
    return. df:dataframe (the same object, columns added in place)
    description.
        df['StartCluster']  <- df['start_lon'], df['start_lat']
        df['EndCluster']  <- df['end_lon'], df['end_lat']

        points -> KMEANS -> every point belong to a cluster

        in train mode:
            fit KMEANS on every end point followed by every start point,
            save the model to 'PointClusterModel<n_clusters>.joblib' and
            the clustered points to 'ResultOfCluster.csv'
        in test mode:
            StartCluster.
                load the trained KMEANS model to predict a cluster for each start point
            EndCluster.
                1. test_end_cluster_method == 'KMEANS'
                    load the trained KMEANS model to predict a cluster for each end point
                2. test_end_cluster_method == 'RF'
                    load the EndCluster predicted by RF model from 'TestEndCluster<n_clusters>.csv'
                    trace the code 'EndClusterModel.ipynb' if you wanna know the detail about this case
                any other value leaves df without an EndCluster column
    '''
    model_filename = 'PointClusterModel' + str(n_clusters) + '.joblib'
    start_columns = ['start_lon','start_lat']
    end_columns = ['end_lon','end_lat']

    if mode == 'train':
        # every point used for training: end points first, then start points
        all_points = pd.DataFrame({
            'lon': df['end_lon'].tolist() + df['start_lon'].tolist(),
            'lat': df['end_lat'].tolist() + df['start_lat'].tolist(),
        })

        cluster_model = KMeans(n_clusters=n_clusters,init='k-means++',random_state=87)
        cluster_model.fit(all_points.to_numpy())

        # convert start/end point to cluster
        df['StartCluster'] = cluster_model.predict(df[start_columns].to_numpy())
        df['EndCluster'] = cluster_model.predict(df[end_columns].to_numpy())

        # save the KMEANS model for testing
        dump(cluster_model,model_filename)

        # save the result of kmeans
        all_points['Cluster'] = cluster_model.predict(all_points.to_numpy())
        all_points.to_csv('ResultOfCluster.csv',index=False)

    elif mode == 'test':
        cluster_model = load(model_filename)
        df['StartCluster'] = cluster_model.predict(df[start_columns].to_numpy())

        if test_end_cluster_method == 'KMEANS':
            # case1. same model as StartCluster
            df['EndCluster'] = cluster_model.predict(df[end_columns].to_numpy())
        elif test_end_cluster_method == 'RF':
            # case2. EndCluster predicted by the classifier
            predicted = pd.read_csv('TestEndCluster' + str(n_clusters) + '.csv')
            df['EndCluster'] = predicted['EndCluster']

    return df

### generate_EndClusterTrainData

In [11]:
def generate_EndClusterTrainData(k):
    '''
    param. k:int, number of clusters
    return. None
    description.
        train.csv -> clean -> start/end point -> KMEANS with k clusters
                  -> datetime features -> CALL_TYPE_STAND
        and save the columns CALL_TYPE_STAND, month, hour, weekday,
        StartCluster, EndCluster to 'EndClusterTrainData<k>.csv'
        (point_cluster in train mode also writes its own model / csv files)
    '''
    exported_columns = ['CALL_TYPE_STAND','month','hour','weekday','StartCluster','EndCluster']
    output_name = 'EndClusterTrainData'+str(k)+'.csv'

    frame = TaxiData_Cleaning(pd.read_csv('train.csv'), mode='train')
    frame = extract_start_end_point(frame,mode='train')
    frame = point_cluster(frame,n_clusters=k,mode='train')
    frame = combine_call_type_origin_stand(timestamp2dt(frame))

    frame[exported_columns].to_csv(output_name,index=False)

## Summary of Preprocessing

In [12]:
def TaxiData_Preprocessing(df,mode='train',distance_threshold=1,k_cluster=50,test_EndPoint_source = 'y_test_end_point.csv',test_end_cluster_method='KMEANS'):
    '''
    param. df:dataframe
    param. mode:str, forwarded to the steps that behave differently for 'train' and 'test'
    param. distance_threshold:number forwarded to get_distance
    param. k_cluster:int, number of clusters forwarded to point_cluster
    param. test_EndPoint_source:str forwarded to extract_start_end_point
    param. test_end_cluster_method:str forwarded to point_cluster
    return. df:dataframe with every generated column (no column is dropped here)
    description.
        run the whole feature pipeline in this order:
            timestamp2dt -> hour_group_gen -> extract_start_end_point
            -> get_distance -> combine_call_type_origin_stand -> point_cluster
    '''
    with_time = hour_group_gen(timestamp2dt(df))
    with_points = extract_start_end_point(
        with_time,
        mode=mode,
        test_EndPoint_source=test_EndPoint_source,
    )
    with_distance = get_distance(
        with_points,
        mode=mode,
        distance_threshold=distance_threshold,
    )
    with_stand = combine_call_type_origin_stand(with_distance)
    return point_cluster(
        with_stand,
        n_clusters=k_cluster,
        mode=mode,
        test_end_cluster_method=test_end_cluster_method,
    )

In [13]:
def SaveDataset(df,filename,feature_list = None):
    '''
    param. df:dataframe
    param. filename:str, path of the csv file to write
    param. feature_list:iterable of column names to save, in that order;
        None (default) saves the standard feature columns listed below
    return. None
    description.
        save the selected columns of df to filename (without the index)
        and print a confirmation
    '''
    if feature_list is None:
        # default kept out of the signature so that no mutable object is
        # shared between calls
        feature_list = ['TAXI_ID','CALL_TYPE_STAND','CALL_TYPE','ORIGIN_STAND',
                        'month','hour','weekday',
                        'start_lon','start_lat','end_lon','end_lat',
                        'StartCluster', 'EndCluster','distance']
    # list(...) lets callers pass a tuple or any other iterable of names
    df[list(feature_list)].to_csv(filename,index=False)
    print('Successfully Save ',filename)

# Training Set Filter Method

## angle

In [14]:
def Cal_Angle(df,remain_angle):
    '''
    param. df:dataframe (needs start_lat/lon, mid_lat/lon, end_lat/lon)
    param. remain_angle:number, the largest angle (in degrees) that is kept
    return. df:dataframe, only the rows with angle <= remain_angle
            (the 'angle' column is also added to the given df; the index is not reset)
    description.
        for every row build the triangle start - mid - end with
            l1 = distance(start, end)
            l2 = distance(start, mid)
            l3 = distance(mid, end)
        and compute the angle at the start point between the directions
        to the end point and to the mid point (law of cosines).
        a row where l1*l2 == 0 gets the angle 1000

        The cosine is rounded to 3 decimals and then clamped to [-1, 1]
        so acos can not fail; a failing row used to receive the angle of
        the previous row.
    '''
    angles = []
    points = zip(df['start_lat'], df['start_lon'],
                 df['mid_lat'], df['mid_lon'],
                 df['end_lat'], df['end_lon'])
    for start_lat, start_lon, mid_lat, mid_lon, end_lat, end_lon in points:
        l1 = lon_lat_distance(start_lat, start_lon, end_lat, end_lon)
        l2 = lon_lat_distance(start_lat, start_lon, mid_lat, mid_lon)
        l3 = lon_lat_distance(mid_lat, mid_lon, end_lat, end_lon)
        if l1*l2 != 0:
            cos_angle = round((l1**2+l2**2-l3**2)/(2*l1*l2), 3)
            # argument order keeps a NaN cosine as NaN (the row is then dropped below)
            cos_angle = min(max(cos_angle, -1.0), 1.0)
            angle = math.degrees(math.acos(cos_angle))
        else:
            angle = 1000
        angles.append(angle)
    df['angle'] = angles
    df = df[df.angle<=remain_angle]
    return df

# Preprocess & Save

## 1.a Task1 -> endpoint of testing data

### train_1a

In [16]:
# Reload the raw training data and remove the records flagged as missing
# or holding an empty POLYLINE.
train_data = pd.read_csv('train.csv')
train_data = TaxiData_Cleaning(train_data, mode='train')

before first clean, shape: (1710670, 9)
after cleaning
	delete 5911 records
	now, the shape of dataset: (1704759, 9)


In [50]:
# Run the whole feature pipeline in train mode: KMeans with 50 clusters is
# fitted (and saved), and trips whose start-end distance is <= 1 are dropped.
train_data = TaxiData_Preprocessing(
    train_data,
    mode='train',
    k_cluster=50,
    distance_threshold=1)

In [51]:
# Save the preprocessed training set for task 1.a
# (previous file name kept below for reference).
# train_data.to_csv('train_preprocessing.csv',index=False)
train_data.to_csv('train_1a.csv',index=False)

In [71]:
# Read the saved file back to inspect what was written.
pd.read_csv('train_1a.csv')

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,data_time,...,start_lat,mid_lon,mid_lat,end_lon,end_lat,period,distance,CALL_TYPE_STAND,StartCluster,EndCluster
0,1372636858620000589,C,,,20000589,1.372637e+09,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",2013-07-01 00:00:58+00:00,...,41.141412,-8.629110,41.151213,-8.630838,41.154489,330,1.776808,C,3,21
1,1372637303620000596,B,,7.0,20000596,1.372637e+09,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",2013-07-01 00:08:23+00:00,...,41.159826,-8.663112,41.163687,-8.665740,41.170671,270,2.480360,B7.0,27,1
2,1372636854620000520,C,,,20000520,1.372637e+09,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",2013-07-01 00:00:54+00:00,...,41.151951,-8.588205,41.148963,-8.607996,41.142915,630,2.965199,C,17,42
3,1372637091620000337,C,,,20000337,1.372637e+09,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",2013-07-01 00:04:51+00:00,...,41.180490,-8.671374,41.175180,-8.687268,41.178087,420,3.464589,C,1,9
4,1372636965620000231,C,,,20000231,1.372637e+09,A,False,"[[-8.615502,41.140674],[-8.614854,41.140926],[...",2013-07-01 00:02:45+00:00,...,41.140674,-8.579133,41.144238,-8.578224,41.160717,375,3.835220,C,3,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458718,1404155105620000121,B,,9.0,20000121,1.404155e+09,A,False,"[[-8.606385,41.144742],[-8.606466,41.144742],[...",2014-06-30 19:05:05+00:00,...,41.144742,-8.593299,41.177214,-8.670150,41.236866,1050,11.550011,B9.0,42,8
1458719,1404171463620000698,C,,,20000698,1.404171e+09,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-...",2014-06-30 23:37:43+00:00,...,41.146020,-8.611902,41.155749,-8.611344,41.171013,465,2.780690,C,28,45
1458720,1404171367620000670,C,,,20000670,1.404171e+09,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[...",2014-06-30 23:36:07+00:00,...,41.140845,-8.613108,41.148234,-8.627454,41.158755,435,2.463359,C,28,21
1458721,1404141826620000248,B,,12.0,20000248,1.404142e+09,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-...",2014-06-30 15:23:46+00:00,...,41.154885,-8.614476,41.162184,-8.587026,41.173524,915,4.203449,B12.0,21,44


### test_1a

In [54]:
# Preprocess the testing set: the end points come from 'y_test_end_point.csv'
# and the clusters from the KMeans model saved by the training run with the
# same k_cluster. distance_threshold has no effect in test mode (no row is dropped).
test_data = pd.read_csv('test.csv')
test_data = TaxiData_Preprocessing(
    df = test_data,
    mode='test',
    k_cluster=50,
    distance_threshold=1,
    test_EndPoint_source = 'y_test_end_point.csv',
)

In [55]:
# Save the preprocessed testing set for task 1.a.
test_data.to_csv('test_1a.csv',index=False)

## 1.b RF clf -> EndCluster of testing

### train_1b

In [56]:
# Rebuild the training set from the raw file: cleaning, then the whole
# feature pipeline in train mode with 50 clusters and distance threshold 1.
train_data = pd.read_csv('train.csv')
train_data = TaxiData_Cleaning(train_data, mode='train')
train_data = TaxiData_Preprocessing(
    train_data,
    mode='train',
    k_cluster=50,
    distance_threshold=1)

before first clean, shape: (1710670, 9)
after cleaning
	delete 5911 records
	now, the shape of dataset: (1704759, 9)


In [59]:
# Save the preprocessed training set for task 1.b.
train_data.to_csv('train_1b.csv',index=False)

### test_1b

In [67]:
# Preprocess the testing set without end points: 'not_used_end_point' skips
# loading them, so get_distance can not compute the distance and reports
# 'fail to gen distance'. EndCluster is read from the RF prediction file
# 'TestEndCluster50.csv' instead of being predicted by KMeans.
test_data = pd.read_csv('test.csv')
test_data = TaxiData_Preprocessing(
    df = test_data,
    mode='test',
    k_cluster=50,
    distance_threshold=1,
    test_EndPoint_source = 'not_used_end_point',
    test_end_cluster_method='RF'
)

not_used_end_point
fail to gen distance


In [70]:
# Save the preprocessed testing set for task 1.b.
test_data.to_csv('test_1b.csv',index=False)

## 2.a using the 1.a datasets plus weekday_group

In [73]:
# Start from the datasets saved for task 1.a.
train_data = pd.read_csv('train_1a.csv')
test_data = pd.read_csv('test_1a.csv')

In [75]:
# Add the weekday_group column (1 for Monday-Friday, 0 for Saturday/Sunday) to both sets.
train_data = weekday_group_gen(train_data)
test_data = weekday_group_gen(test_data)

In [79]:
# Save the datasets for task 2.a.
train_data.to_csv('train_2a.csv',index=False)
test_data.to_csv('test_2a.csv',index=False)

## cluster: 2500

In [15]:
# Rebuild the training set from the raw file, this time with 2500 KMeans clusters
# (the fitted model is saved as 'PointClusterModel2500.joblib').
train_data = pd.read_csv('train.csv')
train_data = TaxiData_Cleaning(train_data, mode='train')

train_data = TaxiData_Preprocessing(
    train_data,
    mode='train',
    k_cluster=2500,
    distance_threshold=1,
)

before first clean, shape: (1710670, 9)
after cleaning
	delete 5911 records
	now, the shape of dataset: (1704759, 9)


In [16]:
# First save of the 2500-cluster training set, with every column.
train_data.to_csv('train_cluster2500.csv',index=False)

In [20]:
# Add weekday_group; hour_group was already produced by TaxiData_Preprocessing
# and is simply computed again here. The last line lists the resulting columns.
train_data = weekday_group_gen(train_data)
train_data = hour_group_gen(train_data)
train_data.columns

Index(['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID',
       'TIMESTAMP', 'DAY_TYPE', 'MISSING_DATA', 'POLYLINE', 'data_time',
       'year', 'month', 'day', 'hour', 'min', 'weekday', 'hour_group',
       'start_lon', 'start_lat', 'mid_lon', 'mid_lat', 'end_lon', 'end_lat',
       'period', 'distance', 'CALL_TYPE_STAND', 'StartCluster', 'EndCluster',
       'weekday_group'],
      dtype='object')

In [26]:
# Overwrite 'train_cluster2500.csv' with the selected columns only
# (MISSING_DATA, POLYLINE and data_time are left out).
train_data[['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID','TIMESTAMP', 'DAY_TYPE',
       'year', 'month', 'day', 'hour', 'min', 'weekday', 'hour_group',
       'start_lon', 'start_lat', 'mid_lon', 'mid_lat', 'end_lon', 'end_lat',
       'period', 'distance', 'CALL_TYPE_STAND', 'StartCluster', 'EndCluster',
       'weekday_group']].to_csv('train_cluster2500.csv',index=False)

In [17]:
# Preprocess the testing set with the 2500-cluster KMeans model saved by the
# training run; the end points come from 'y_test_end_point.csv'.
test_data = pd.read_csv('test.csv')
test_data = TaxiData_Preprocessing(
    df = test_data,
    mode='test',
    k_cluster=2500,
    distance_threshold=1,
    test_EndPoint_source = 'y_test_end_point.csv',
)

In [18]:
# First save of the 2500-cluster testing set, with every column.
test_data.to_csv('test_cluster2500.csv',index=False)

In [22]:
# Add weekday_group; hour_group was already produced by TaxiData_Preprocessing
# and is simply computed again here.
test_data = weekday_group_gen(test_data)
test_data = hour_group_gen(test_data)

In [25]:
# Overwrite 'test_cluster2500.csv' with the selected columns only; compared with
# the training file there is no mid point and no period column.
test_data[['TRIP_ID', 'CALL_TYPE', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID','TIMESTAMP', 'DAY_TYPE',
       'year', 'month', 'day', 'hour', 'min', 'weekday', 'hour_group',
       'start_lon', 'start_lat', 'end_lon', 'end_lat'
       , 'distance', 'CALL_TYPE_STAND', 'StartCluster', 'EndCluster',
       'weekday_group']].to_csv('test_cluster2500.csv',index=False)

## angle filter -> training

In [19]:
# Rebuild the training set with the default pipeline settings
# (k_cluster=50, distance_threshold=1), then keep only the trips whose
# angle computed by Cal_Angle is <= 60 degrees.
train_data = pd.read_csv('train.csv')
train_data = TaxiData_Cleaning(train_data, mode='train')
train_data = TaxiData_Preprocessing(train_data, mode='train')
train_data = Cal_Angle(train_data,60)

before first clean, shape: (1710670, 9)
after cleaning
	delete 5911 records
	now, the shape of dataset: (1704759, 9)


In [22]:
# Save the angle-filtered training set: the default feature columns plus 'period'.
SaveDataset(
    df = train_data,
    filename = 'train_preprocessing_angle_filter.csv',
    feature_list = ['TAXI_ID','CALL_TYPE_STAND','CALL_TYPE','ORIGIN_STAND',
                    'month','hour','weekday',
                    'start_lon','start_lat','end_lon','end_lat',
                    'StartCluster', 'EndCluster','distance','period']
)

Successfully Save  train_preprocessing_angle_filter.csv


## test

In [40]:
# Reload the raw testing set; the trailing expression displays it.
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,T1,B,,15.0,20000542,1408039037,A,False,"[[-8.585676,41.148522],[-8.585712,41.148639],[..."
1,T2,B,,57.0,20000108,1408038611,A,False,"[[-8.610876,41.14557],[-8.610858,41.145579],[-..."
2,T3,B,,15.0,20000370,1408038568,A,False,"[[-8.585739,41.148558],[-8.58573,41.148828],[-..."
3,T4,B,,53.0,20000492,1408039090,A,False,"[[-8.613963,41.141169],[-8.614125,41.141124],[..."
4,T5,B,,18.0,20000621,1408039177,A,False,"[[-8.619903,41.148036],[-8.619894,41.148036]]"
...,...,...,...,...,...,...,...,...,...
315,T323,A,70885.0,,20000430,1419171485,A,False,"[[-8.570196,41.159484],[-8.570187,41.158962],[..."
316,T324,B,,53.0,20000020,1419170802,A,False,"[[-8.613873,41.141232],[-8.613882,41.141241],[..."
317,T325,C,,,20000207,1419172121,A,False,"[[-8.6481,41.152536],[-8.647461,41.15241],[-8...."
318,T326,A,76232.0,,20000667,1419171980,A,False,"[[-8.571699,41.156073],[-8.570583,41.155929],[..."


In [41]:
# Preprocess the testing set with the 50-cluster model; the remaining arguments
# keep their defaults (end points from 'y_test_end_point.csv', EndCluster by KMEANS).
test_data = TaxiData_Preprocessing(test_data,mode='test',k_cluster=50)

In [43]:
# Save the testing set with the same feature columns as the training file,
# except 'period' (not generated in test mode).
SaveDataset(
    df = test_data,
    filename = 'test_preprocessing_angle_filter.csv',
    feature_list = ['TAXI_ID','CALL_TYPE_STAND','CALL_TYPE','ORIGIN_STAND',
                    'month','hour','weekday',
                    'start_lon','start_lat','end_lon','end_lat',
                    'StartCluster', 'EndCluster','distance']
)

Successfully Save  test_preprocessing_angle_filter.csv


## Save current dataset

In [11]:
# Display the training dataframe currently held in memory.
train_data

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,DAY_TYPE,MISSING_DATA,POLYLINE,year,month,...,min,weekday,start_lon,start_lat,end_lon,end_lat,period,distance,CALL_TYPE_STAND,Cluster
0,1372636858620000589,C,,,20000589,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",2013,7,...,0,0,-8.618643,41.141412,-8.630838,41.154489,330,1.776808,C,52
1,1372637303620000596,B,,7.0,20000596,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",2013,7,...,8,0,-8.639847,41.159826,-8.665740,41.170671,270,2.480360,B7.0,23
2,1372636854620000520,C,,,20000520,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",2013,7,...,0,0,-8.574678,41.151951,-8.607996,41.142915,630,2.965199,C,11
3,1372637091620000337,C,,,20000337,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",2013,7,...,4,0,-8.645994,41.180490,-8.687268,41.178087,420,3.464589,C,15
4,1372636965620000231,C,,,20000231,A,False,"[[-8.615502,41.140674],[-8.614854,41.140926],[...",2013,7,...,2,0,-8.615502,41.140674,-8.578224,41.160717,375,3.835220,C,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458718,1404155105620000121,B,,9.0,20000121,A,False,"[[-8.606385,41.144742],[-8.606466,41.144742],[...",2014,6,...,5,0,-8.606385,41.144742,-8.670150,41.236866,1050,11.550011,B9.0,5
1458719,1404171463620000698,C,,,20000698,A,False,"[[-8.612469,41.14602],[-8.612487,41.145993],[-...",2014,6,...,37,0,-8.612469,41.146020,-8.611344,41.171013,465,2.780690,C,54
1458720,1404171367620000670,C,,,20000670,A,False,"[[-8.610138,41.140845],[-8.610174,41.140935],[...",2014,6,...,36,0,-8.610138,41.140845,-8.627454,41.158755,435,2.463359,C,52
1458721,1404141826620000248,B,,12.0,20000248,A,False,"[[-8.630712,41.154885],[-8.63073,41.154813],[-...",2014,6,...,23,0,-8.630712,41.154885,-8.587026,41.173524,915,4.203449,B12.0,63


In [12]:
# Tag the snapshot with the current local date as zero-padded MMDD
# (e.g. '0111' for 11 January, '1101' for 1 November); without the padding
# both of these dates produced the same tag '111'.
now = datetime.datetime.now().strftime('%m%d')
snapshot_name = 'train_data_' + now + '.csv'
train_data.to_csv(snapshot_name ,index=False)
print('save the dataframe -> csv :',snapshot_name)

save the dataframe -> csv : train_data_1213.csv
