In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import os
import pandas as pd
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import pickle

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
lineid = '39A_2'

## Read

In [3]:
df = pd.read_csv(f'../Final_DB/Lines/{lineid}/{lineid}.csv')
df.head()

Unnamed: 0,DAYOFSERVICE,STARTSTOP,LINEID,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME
0,2018-01-01,7158,39A_2,10,7048,11.0,43.0,0.0
1,2018-01-01,7048,39A_2,10,7159,60.0,73.0,0.0
2,2018-01-01,7159,39A_2,10,7388,9.0,20.0,0.0
3,2018-01-01,7388,39A_2,10,7017,80.0,104.0,0.0
4,2018-01-01,7017,39A_2,10,7018,71.0,52.0,19.0


In [4]:
df['DAYOFSERVICE'] = pd.to_datetime(df.DAYOFSERVICE)

In [5]:
df['MONTH'] = df['DAYOFSERVICE'].dt.month_name()
df['DAY'] = df['DAYOFSERVICE'].dt.day_name()

## remove outliers

In [6]:
start_end_pairs = df.groupby(['STARTSTOP', 'ENDSTOP']).size().reset_index().rename(columns = {0 : 'COUNT'})

In [7]:
start_end_pairs = start_end_pairs.sort_values(by=['COUNT'], ascending=False)

In [8]:
start_end_pairs.reset_index(inplace=True, drop=True)
start_end_pairs

Unnamed: 0,STARTSTOP,ENDSTOP,COUNT
0,1703,1905,24225
1,1698,1699,24225
2,1699,1700,24224
3,1702,1703,24224
4,1701,1702,24223
...,...,...,...
65,748,749,23345
66,315,313,23170
67,313,335,22450
68,7158,7048,21280


### ONROAD_TIME <= 0

In [9]:
df['ONROAD_TIME'] = df['JOURNEYTIME'] - df['STOP_TIME']

In [10]:
df.drop(df[df['ONROAD_TIME'] <= 0].index, inplace = True)

df[df['ONROAD_TIME'] <= 0]

Unnamed: 0,DAYOFSERVICE,STARTSTOP,LINEID,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME,MONTH,DAY,ONROAD_TIME


### outliers

In [11]:
for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
    
    data = df.loc[(df['STARTSTOP'] == start_end_pairs.loc[i, 'STARTSTOP']) & (df['ENDSTOP'] == start_end_pairs.loc[i, 'ENDSTOP'])]
   
    # clean the data
    outliers = data[(data['STOP_TIME'] > np.percentile(data['STOP_TIME'], 98)) | 
                    (data['STOP_TIME'] < np.percentile(data['STOP_TIME'], 1)) |
                    (data['ONROAD_TIME'] > np.percentile(data['ONROAD_TIME'], 98)) |
                    (data['ONROAD_TIME'] < np.percentile(data['ONROAD_TIME'], 1))]

    df.drop(outliers.index, inplace = True)

## Rush Hour

In [12]:
df['HOUR_BIN'] = pd.cut(x = df['HOUR'], bins = [0, 8, 20, 23], labels = [0, 1, 2])
df['RUSH_HOUR'] = 0

In [13]:
df.loc[df['HOUR_BIN'] == 1, 'RUSH_HOUR'] = 1

In [14]:
df.drop(['HOUR_BIN'], axis = 1, inplace = True)

## Weekday

In [15]:
df['WEEKDAY'] = 0

In [16]:
df.loc[((df['DAY'] == 'Saturday') | (df['DAY'] == 'Sunday')), 'WEEKDAY'] = 1

### outliers

In [17]:
for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
    
    data = df.loc[(df['STARTSTOP'] == start_end_pairs.loc[i, 'STARTSTOP']) & (df['ENDSTOP'] == start_end_pairs.loc[i, 'ENDSTOP'])]
    
    # remove outliers
   
    outliers = data[(data.loc[data['RUSH_HOUR'] != 1, 'ONROAD_TIME'] > np.percentile(data.loc[data['RUSH_HOUR'] == 1, 'ONROAD_TIME'], 99)) | 
                    (data.loc[data['RUSH_HOUR'] != 1, 'STOP_TIME'] >  np.percentile(data.loc[data['RUSH_HOUR'] == 1, 'STOP_TIME'], 99)) | 
                    (data.loc[data['RUSH_HOUR'] == 1, 'ONROAD_TIME'] < np.percentile(data.loc[data['RUSH_HOUR'] != 1, 'ONROAD_TIME'], 1)) | 
                    (data.loc[data['RUSH_HOUR'] == 1, 'STOP_TIME'] <  np.percentile(data.loc[data['RUSH_HOUR'] != 1, 'STOP_TIME'], 1))]
    
    df.drop(outliers.index, inplace = True)

In [18]:
for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
    
    data = df.loc[(df['STARTSTOP'] == start_end_pairs.loc[i, 'STARTSTOP']) & (df['ENDSTOP'] == start_end_pairs.loc[i, 'ENDSTOP'])]
    
    # remove outliers
    outliers = data[(data.loc[data['WEEKDAY'] != 1, 'ONROAD_TIME'] > np.percentile(data.loc[data['WEEKDAY'] == 1, 'ONROAD_TIME'], 99)) | 
                    (data.loc[data['WEEKDAY'] != 1, 'STOP_TIME'] >  np.percentile(data.loc[data['WEEKDAY'] == 1, 'STOP_TIME'], 99)) | 
                    (data.loc[data['WEEKDAY'] == 1, 'ONROAD_TIME'] < np.percentile(data.loc[data['WEEKDAY'] != 1, 'ONROAD_TIME'], 1)) | 
                    (data.loc[data['WEEKDAY'] == 1, 'STOP_TIME'] <  np.percentile(data.loc[data['WEEKDAY'] != 1, 'STOP_TIME'], 1))]
    
    df.drop(outliers.index, inplace = True)

### ONROAD_TIME + STOP_TIME, PLANNED_JOURNEYTIME 

In [19]:
for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
    
    data = df.loc[(df['STARTSTOP'] == start_end_pairs.loc[i, 'STARTSTOP']) & (df['ENDSTOP'] == start_end_pairs.loc[i, 'ENDSTOP'])]
    
    # onroad_time is supposed to be very close to planned_journeytime
    outliers = data[((data['ONROAD_TIME'] + data['STOP_TIME'].max()) < data['PLANNED_JOURNEYTIME']) | 
                    ((data['ONROAD_TIME'].max() + data['STOP_TIME']) < data['PLANNED_JOURNEYTIME']) |
                    ((data['ONROAD_TIME'].min() + data['STOP_TIME']) > data['PLANNED_JOURNEYTIME'])]
    
    df.drop(outliers.index, inplace = True)

### zscore < 3

In [20]:
for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
    
    data = df.loc[(df['STARTSTOP'] == start_end_pairs.loc[i, 'STARTSTOP']) & (df['ENDSTOP'] == start_end_pairs.loc[i, 'ENDSTOP'])]
    
    # remove outliers
    data = data[(np.abs(stats.zscore(data['ONROAD_TIME'])) > 3) | 
                (np.abs(stats.zscore(data['STOP_TIME'])) > 3)]
    
    df.drop(data.index, inplace = True)

In [21]:
df['JOURNEYTIME'] = df['STOP_TIME'] + df['ONROAD_TIME']
df[['JOURNEYTIME', 'PLANNED_JOURNEYTIME', 'STOP_TIME', 'ONROAD_TIME']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
JOURNEYTIME,1082754.0,68.576718,66.43539,7.0,34.0,52.0,79.0,686.0
PLANNED_JOURNEYTIME,1082754.0,66.018797,61.827394,9.0,37.0,52.0,72.0,646.0
STOP_TIME,1082754.0,10.052651,15.162581,0.0,0.0,8.0,15.0,193.0
ONROAD_TIME,1082754.0,58.524067,56.400629,7.0,27.0,43.0,70.0,499.0


# Models

### merge with weather

In [22]:
# read weather
weather = pd.read_csv('../Final_DB/weather_cleaned.csv')
weather.head()

Unnamed: 0,dt,temp,visibility,dew_point,feels_like,pressure,humidity,wind_speed,wind_deg,wind_gust,rain_1h,snow_1h,clouds_all,weather_main
0,2018-01-01 00:00:00,4.39,9999.0,2.26,-1.93,990,86,12.9,240,18.0,0.0,0.0,40,Rain
1,2018-01-01 01:00:00,4.39,9999.0,2.26,-1.68,990,86,11.8,240,0.0,0.0,0.0,75,Rain
2,2018-01-01 02:00:00,5.39,9999.0,2.4,-0.47,990,81,12.4,240,18.5,0.0,0.0,40,Clouds
3,2018-01-01 03:00:00,5.39,9999.0,2.4,-0.47,990,81,12.4,240,0.0,0.0,0.0,40,Clouds
4,2018-01-01 04:00:00,5.39,9999.0,2.4,-0.33,989,81,11.8,240,0.0,0.0,0.0,40,Clouds


In [23]:
weather.drop(['visibility', 'dew_point', 'feels_like', 'pressure', 'humidity', 'wind_deg', 'wind_gust', 'clouds_all' ], 
             axis = 1, inplace = True)

weather.head()

Unnamed: 0,dt,temp,wind_speed,rain_1h,snow_1h,weather_main
0,2018-01-01 00:00:00,4.39,12.9,0.0,0.0,Rain
1,2018-01-01 01:00:00,4.39,11.8,0.0,0.0,Rain
2,2018-01-01 02:00:00,5.39,12.4,0.0,0.0,Clouds
3,2018-01-01 03:00:00,5.39,12.4,0.0,0.0,Clouds
4,2018-01-01 04:00:00,5.39,11.8,0.0,0.0,Clouds


In [24]:
weather['dt'] = pd.to_datetime(weather.dt)

weather.dtypes

dt              datetime64[ns]
temp                   float64
wind_speed             float64
rain_1h                float64
snow_1h                float64
weather_main            object
dtype: object

In [25]:
df['dt'] = df['DAYOFSERVICE'] + pd.to_timedelta(df['HOUR'], unit = 'H')

df.head()

Unnamed: 0,DAYOFSERVICE,STARTSTOP,LINEID,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME,MONTH,DAY,ONROAD_TIME,RUSH_HOUR,WEEKDAY,dt
1,2018-01-01,7048,39A_2,10,7159,60.0,73.0,0.0,January,Monday,60.0,1,0,2018-01-01 10:00:00
2,2018-01-01,7159,39A_2,10,7388,9.0,20.0,0.0,January,Monday,9.0,1,0,2018-01-01 10:00:00
6,2018-01-01,7030,39A_2,10,7021,18.0,31.0,0.0,January,Monday,18.0,1,0,2018-01-01 10:00:00
7,2018-01-01,7021,39A_2,10,6111,42.0,58.0,0.0,January,Monday,42.0,1,0,2018-01-01 10:00:00
9,2018-01-01,6112,39A_2,10,6113,76.0,81.0,0.0,January,Monday,76.0,1,0,2018-01-01 10:00:00


In [26]:
df = df.merge(weather, on = ['dt'])

# drop 'WEATHER_TIME' and 'dt'
df = df.drop(['DAYOFSERVICE', 'dt'], axis = 1)

df.head()

Unnamed: 0,STARTSTOP,LINEID,HOUR,ENDSTOP,JOURNEYTIME,PLANNED_JOURNEYTIME,STOP_TIME,MONTH,DAY,ONROAD_TIME,RUSH_HOUR,WEEKDAY,temp,wind_speed,rain_1h,snow_1h,weather_main
0,7048,39A_2,10,7159,60.0,73.0,0.0,January,Monday,60.0,1,0,5.39,9.8,0.0,0.0,Clouds
1,7159,39A_2,10,7388,9.0,20.0,0.0,January,Monday,9.0,1,0,5.39,9.8,0.0,0.0,Clouds
2,7030,39A_2,10,7021,18.0,31.0,0.0,January,Monday,18.0,1,0,5.39,9.8,0.0,0.0,Clouds
3,7021,39A_2,10,6111,42.0,58.0,0.0,January,Monday,42.0,1,0,5.39,9.8,0.0,0.0,Clouds
4,6112,39A_2,10,6113,76.0,81.0,0.0,January,Monday,76.0,1,0,5.39,9.8,0.0,0.0,Clouds


In [27]:
df = df.drop(['LINEID', 'PLANNED_JOURNEYTIME', 'RUSH_HOUR', 'WEEKDAY', 'weather_main'], axis = 1)

In [28]:
df = pd.get_dummies(df, columns = ['MONTH', 'DAY', 'HOUR'], drop_first = True)
df.head()

Unnamed: 0,STARTSTOP,ENDSTOP,JOURNEYTIME,STOP_TIME,ONROAD_TIME,temp,wind_speed,rain_1h,snow_1h,MONTH_August,...,HOUR_14,HOUR_15,HOUR_16,HOUR_17,HOUR_18,HOUR_19,HOUR_20,HOUR_21,HOUR_22,HOUR_23
0,7048,7159,60.0,0.0,60.0,5.39,9.8,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,7159,7388,9.0,0.0,9.0,5.39,9.8,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,7030,7021,18.0,0.0,18.0,5.39,9.8,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,7021,6111,42.0,0.0,42.0,5.39,9.8,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,6112,6113,76.0,0.0,76.0,5.39,9.8,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


### journey time

In [29]:
# separate data for each start stop
X_trainDict = {}
y_trainDict = {}
X_testDict ={}
y_testDict = {}

# store data for start stops which have few data to train a model
remove = {}

for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
    print(i, pair)
    
    data = df.loc[(df['STARTSTOP'] == start_end_pairs.loc[i, 'STARTSTOP']) & (df['ENDSTOP'] == start_end_pairs.loc[i, 'ENDSTOP'])]
    
    # check start stops which have few data to train a model
    if data.shape[0] < 100:
        print('----------------------------')
        print(f'the data of {pair} too small')
        print('----------------------------')
        remove[pair] = i
        continue
    
    # drop unnecessary columns
    data.drop(columns=['STARTSTOP', 'ENDSTOP'], inplace=True)    
    
    # target feature is 'JOURNEYTIME'
    y = data['JOURNEYTIME']
    X = data.drop(['JOURNEYTIME', 'ONROAD_TIME', 'STOP_TIME'], axis = 1)
    
    # Split the dataset into two datasets: 70% training and 30% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,  test_size=0.3)
    
    X_trainDict[pair] = X_train
    X_testDict[pair] = X_test
    y_trainDict[pair] = y_train
    y_testDict[pair] = y_test
    
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

0 1703_1905
1 1698_1699
2 1699_1700
3 1702_1703
4 1701_1702
5 1905_1906
6 1697_1698
7 1700_1701
8 1696_1697
9 1847_1696
10 7166_1847
11 1867_4489
12 1850_1867
13 1866_1850
14 1865_1866
15 1864_1865
16 1862_1863
17 1863_1864
18 1861_1862
19 4492_1861
20 1860_4492
21 1859_1860
22 1845_7166
23 1479_315
24 1478_1479
25 1858_1859
26 7453_1478
27 6112_6113
28 1906_1907
29 6111_6112
30 1715_7453
31 1898_1858
32 6113_1898
33 7021_6111
34 7030_7021
35 7018_7030
36 7017_7018
37 7388_7017
38 1907_1908
39 4489_4747
40 4747_1845
41 7159_7388
42 1908_1909
43 1713_1714
44 1909_1713
45 1714_1715
46 763_767
47 760_761
48 759_760
49 761_762
50 762_763
51 758_759
52 757_758
53 756_757
54 755_756
55 349_404
56 754_755
57 753_754
58 752_753
59 404_747
60 335_349
61 751_752
62 750_751
63 747_748
64 749_750
65 748_749
66 315_313
67 313_335
68 7158_7048
69 7048_7159


In [30]:
remove_pairs = pd.DataFrame(columns=['STARTSTOP', 'ENDSTOP'])

In [31]:
for i in remove.values():  
    df1 = start_end_pairs.loc[i, ['STARTSTOP', 'ENDSTOP']]
    df1 = pd.DataFrame(df1).T
    remove_pairs = pd.concat([remove_pairs, df1], ignore_index=True)
    
    start_end_pairs = start_end_pairs.drop(i)
    
remove_pairs

Unnamed: 0,STARTSTOP,ENDSTOP


In [32]:
if len(remove) > 0:
    start_end_pairs = start_end_pairs.reset_index()

### random forest

In [33]:
# store models for each start stop
models_RF = {}

# train model for each start stop
for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
    print(i, pair)
    
    # got data for a certain start stop
    X_train = X_trainDict[pair]
    X_test = X_testDict[pair]
    y_train = y_trainDict[pair]
    y_test = y_testDict[pair]
    
    # train models
    rfc = RandomForestRegressor(n_estimators=64, max_features='auto', oob_score=True, random_state=1)
    rfc.fit(X_train, y_train)
    models_RF[pair] = rfc
    
    # pickle files
    filename = f'pickles_weather/{lineid}_{pair}_RF.sav'
    
    pickle.dump(rfc, open(filename, 'wb'))

0 1703_1905
1 1698_1699
2 1699_1700
3 1702_1703
4 1701_1702
5 1905_1906
6 1697_1698
7 1700_1701
8 1696_1697
9 1847_1696
10 7166_1847
11 1867_4489
12 1850_1867
13 1866_1850
14 1865_1866
15 1864_1865
16 1862_1863
17 1863_1864
18 1861_1862
19 4492_1861
20 1860_4492
21 1859_1860
22 1845_7166
23 1479_315
24 1478_1479
25 1858_1859
26 7453_1478
27 6112_6113
28 1906_1907
29 6111_6112
30 1715_7453
31 1898_1858
32 6113_1898
33 7021_6111
34 7030_7021
35 7018_7030
36 7017_7018
37 7388_7017
38 1907_1908
39 4489_4747
40 4747_1845
41 7159_7388
42 1908_1909
43 1713_1714
44 1909_1713
45 1714_1715
46 763_767
47 760_761
48 759_760
49 761_762
50 762_763
51 758_759
52 757_758
53 756_757
54 755_756
55 349_404
56 754_755
57 753_754
58 752_753
59 404_747
60 335_349
61 751_752
62 750_751
63 747_748
64 749_750
65 748_749
66 315_313
67 313_335
68 7158_7048
69 7048_7159


In [34]:
mae = {}
mape = {}
mse = {}
rmse = {}
r2 = {}

for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
   
    X_train = X_trainDict[pair]
    y_train = y_trainDict[pair]
    rfc = models_RF[pair]
        
    rfc_predictions = rfc.predict(X_train).round()
    
    mae[pair] = metrics.mean_absolute_error(y_train, rfc_predictions)
    mape[pair] = metrics.mean_absolute_percentage_error(y_train, rfc_predictions)
    mse[pair] = metrics.mean_squared_error(y_train, rfc_predictions)
    rmse[pair] = metrics.mean_squared_error(y_train, rfc_predictions)**(0.5)
    r2[pair] = metrics.r2_score(y_train, rfc_predictions)

data = {'stop':list(mae.keys()), 'mae':list(mae.values()), 'mape':list(mape.values()), 
        'mse':list(mse.values()), 'rmse':list(rmse.values()), 'r2':list(r2.values())}

models_train_RF = pd.DataFrame.from_dict(data)
models_train_RF

Unnamed: 0,stop,mae,mape,mse,rmse,r2
0,1703_1905,6.925993,0.178680,78.623146,8.866969,0.559017
1,1698_1699,2.639627,0.119152,13.692514,3.700340,0.849479
2,1699_1700,7.756292,0.178569,95.407541,9.767678,0.598316
3,1702_1703,10.046863,0.285410,170.500827,13.057597,0.490883
4,1701_1702,4.830879,0.203637,39.191043,6.260275,0.608935
...,...,...,...,...,...,...
65,748_749,10.548960,0.184807,185.508553,13.620152,0.523814
66,315_313,11.790412,0.162659,260.128205,16.128490,0.685816
67,313_335,5.769900,0.043060,59.690299,7.725950,0.814111
68,7158_7048,3.502795,0.076088,22.506412,4.744092,0.746747


In [35]:
mae = {}
mape = {}
mse = {}
rmse = {}
r2 = {}

for i in range(start_end_pairs.shape[0]):
    pair = str(start_end_pairs.loc[i, 'STARTSTOP']) + '_' + str(start_end_pairs.loc[i, 'ENDSTOP'])
   
    X_test = X_testDict[pair]
    y_test = y_testDict[pair]
    rfc = models_RF[pair]
        
    rfc_predictions = rfc.predict(X_test)
    
    mae[pair] = metrics.mean_absolute_error(y_test, rfc_predictions)
    mape[pair] = metrics.mean_absolute_percentage_error(y_test, rfc_predictions)
    mse[pair] = metrics.mean_squared_error(y_test, rfc_predictions)
    rmse[pair] = metrics.mean_squared_error(y_test, rfc_predictions)**(0.5)
    r2[pair] = metrics.r2_score(y_test, rfc_predictions)

data = {'stop':list(mae.keys()), 'mae':list(mae.values()), 'mape':list(mape.values()), 
        'mse':list(mse.values()), 'rmse':list(rmse.values()), 'r2':list(r2.values())}

models_test_RF = pd.DataFrame.from_dict(data)
models_test_RF

Unnamed: 0,stop,mae,mape,mse,rmse,r2
0,1703_1905,10.583845,0.272415,172.636902,13.139136,0.037238
1,1698_1699,4.820014,0.224070,41.202570,6.418923,0.560408
2,1699_1700,12.821023,0.301370,243.660374,15.609624,-0.028075
3,1702_1703,15.165233,0.437641,368.568985,19.198151,-0.084793
4,1701_1702,7.699467,0.328199,90.456668,9.510871,0.099263
...,...,...,...,...,...,...
65,748_749,15.811215,0.275105,398.194132,19.954802,0.006145
66,315_313,23.071995,0.331750,842.018182,29.017550,-0.079745
67,313_335,12.666594,0.095557,259.316497,16.103307,0.220523
68,7158_7048,7.865801,0.170788,101.339672,10.066761,-0.101105


In [36]:
models_train_RF[['mape', 'r2']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mape,70.0,0.171549,0.07696,0.038138,0.116062,0.168422,0.206172,0.395142
r2,70.0,0.586795,0.113133,0.404316,0.501679,0.57856,0.649737,0.866061


In [37]:
models_test_RF[['mape', 'r2']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mape,70.0,0.275285,0.118354,0.091126,0.185742,0.268234,0.328322,0.614738
r2,70.0,0.028431,0.174893,-0.251089,-0.087755,-0.011841,0.120247,0.560408


In [38]:
models_train_RF['lineid'] = lineid
models_test_RF['lineid'] = lineid

In [39]:
models_train_RF.to_csv(f'result/{lineid}_train_weather.csv', index = False)
models_test_RF.to_csv(f'result/{lineid}_test_weather.csv', index = False)

In [40]:
remove_pairs.to_csv(f'result/{lineid}_remove_weather.csv', index = False)