In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import pickle

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.DataFrame(columns=['DAYOFSERVICE', 'STARTSTOP', 'LINEID', 'HOUR', 'ENDSTOP', 'JOURNEYTIME', 'PLANNED_JOURNEYTIME', 'STOP_TIME'])
df

Unnamed: 0,DAYOFSERVICE,STARTSTOP,LINEID,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME


In [3]:
for month_num in range(1, 13):
    print(f'Now concat month{month_num}')
    df1 = pd.read_csv(f'leavetimes_premodeling_month_{month_num}.csv')
    df1.drop(['TRIPID', 'MONTH', 'DAY'], axis = 1, inplace = True)
    row = df1.shape[0]
    print(f'month{month_num} have ' + str(row) + ' rows')
    df = pd.concat([df, df1], ignore_index=True)

Now concat month1
month1 have 9049608 rows
Now concat month2
month2 have 8003315 rows
Now concat month3
month3 have 7678954 rows
Now concat month4
month4 have 8431443 rows
Now concat month5
month5 have 8813183 rows
Now concat month6
month6 have 8374976 rows
Now concat month7
month7 have 8638830 rows
Now concat month8
month8 have 8585436 rows
Now concat month9
month9 have 8194853 rows
Now concat month10
month10 have 8443207 rows
Now concat month11
month11 have 8220643 rows
Now concat month12
month12 have 6987331 rows


In [4]:
df.shape

(99421779, 8)

In [5]:
df.head()

Unnamed: 0,DAYOFSERVICE,STARTSTOP,LINEID,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,7347,16_1,8,3669,99.0,80.0,0.0
1,2018-01-01,3669,16_1,8,7349,54.0,58.0,0.0
2,2018-01-01,7349,16_1,8,1631,73.0,68.0,0.0
3,2018-01-01,1631,16_1,8,1632,6.0,15.0,0.0
4,2018-01-01,1632,16_1,8,5053,13.0,25.0,0.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99421779 entries, 0 to 99421778
Data columns (total 8 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   DAYOFSERVICE         object
 1   STARTSTOP            object
 2   LINEID               object
 3   HOUR                 object
 4   ENDSTOP              object
 5   JOURNEYTIME          object
 6   PLANNED_JOURNEYTIME  object
 7   STOP_TIME            object
dtypes: object(8)
memory usage: 5.9+ GB


In [7]:
df.to_csv("leavetimes_premodeling_final.csv", index=False)

### Line

In [9]:
lineids = list(df['LINEID'].unique())

In [10]:
len(lineids)

252

In [11]:
for lineid in lineids:
    dirname = lineid
    os.mkdir(f'Lines/{dirname}')

In [11]:
for lineid in lineids:
    print(lineid)
    
    save_data = df[df['LINEID'] == lineid]
    file_name = f'Lines/{lineid}/{lineid}.csv'
    
    save_data.to_csv(file_name, index = False)
    
    df.drop(save_data.index, inplace = True)
    print(save_data.shape[0], df.shape[0])

16_1
1498516 97923263
16_2
1050614 96872649
16C_2
11415 96861234
40_1
1806445 95054789
40_2
2017032 93037757
25B_2
376157 92661600
25B_1
483695 92177905
25A_1
499413 91678492
25A_2
468896 91209596
15_1
1675108 89534488
15_2
1730374 87804114
47_1
166872 87637242
47_2
166925 87470317
33_2
498552 86971765
33_1
508606 86463159
33A_2
85208 86377951
145_1
1809218 84568733
145_2
1768302 82800431
37_2
480453 82319978
37_1
827293 81492685
16C_1
28444 81464241
46A_2
1883767 79580474
46A_1
1963947 77616527
41C_2
669918 76946609
41_1
717018 76229591
41_2
773396 75456195
41C_1
723817 74732378
83_1
944799 73787579
83_2
940453 72847126
83A_2
278798 72568328
83A_1
256722 72311606
42_2
663715 71647891
42_1
615344 71032547
84_1
435692 70596855
84_2
401746 70195109
120_1
392721 69802388
120_2
365209 69437179
11_2
789733 68647446
11_1
720819 67926627
56A_1
248780 67677847
56A_2
247584 67430263
18_1
503933 66926330
18_2
653945 66272385
27A_1
365707 65906678
27A_2
340878 65565800
17A_1
821098 64744702
17A_2