In [1]:
!pip install catboost



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier

### Extracting data

In [160]:
train_df = pd.read_csv('data/flight_delays_train.csv')
test_df = pd.read_csv('data/flight_delays_test.csv')

In [161]:
idx_split = train_df.shape[0]
full_df = pd.concat([train_df.drop('dep_delayed_15min', axis=1), test_df])

### Data preprocessing

In [162]:
full_df['Month'] = full_df['Month'].apply(lambda month: month[2:]).astype('int64')
full_df['DayofMonth'] = full_df['DayofMonth'].apply(lambda day: day[2:]).astype('int64')
full_df['DayOfWeek'] = full_df['DayOfWeek'].apply(lambda day: day[2:]).astype('int64')

### Feature engineering

In [163]:
new_df = full_df
#new_df['Route'] = new_df['Origin'] + '-' + new_df['Dest']
new_df['DepHour'] = new_df['DepTime'] // 100

new_df['morning'] = ((new_df['DepHour'] >= 7) & (new_df['DepHour'] < 12)).astype('int64')
new_df['day'] = ((new_df['DepHour'] >= 12) & (new_df['DepHour'] < 17)).astype('int64')
new_df['evening'] = ((new_df['DepHour'] >= 17) & (new_df['DepHour'] < 24)).astype('int64')
new_df['night'] = ((new_df['DepHour'] >= 0) & (new_df['DepHour'] < 7)).astype('int64')

new_df['winter'] = ((new_df['Month'] >= 12) | (new_df['Month'] <= 2)).astype('int64')
new_df['spring'] = ((new_df['Month'] >= 3) & (new_df['Month'] <= 5)).astype('int64')
new_df['summer'] = ((new_df['Month'] >= 6) & (new_df['Month'] <= 8)).astype('int64')
new_df['autumn'] = ((new_df['Month'] >= 9) & (new_df['Month'] <= 11)).astype('int64')
#new_df = new_df.drop(['DepTime'], axis=1)
full_df = new_df

In [164]:
X_train = full_df[:idx_split]
X_test = full_df[idx_split:]
y_train = train_df['dep_delayed_15min'].map({'Y' : 1, 'N' : 0})

full_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,DepHour,morning,day,evening,night,winter,spring,summer,autumn
0,8,21,7,1934,AA,ATL,DFW,732,19,0,0,1,0,0,0,1,0
1,4,20,3,1548,US,PIT,MCO,834,15,0,1,0,0,0,1,0,0
2,9,2,5,1422,XE,RDU,CLE,416,14,0,1,0,0,0,0,0,1
3,11,25,6,1015,OO,DEN,MEM,872,10,1,0,0,0,0,0,0,1
4,10,7,6,1828,WN,MDW,OMA,423,18,0,0,1,0,0,0,0,1


### Split for checking our model on holdout dataset

In [165]:
X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train, test_size=0.3, random_state=17)

### Create our model and test it. 

In [169]:
model = CatBoostClassifier(n_estimators=1000)

cat_features = [6, 4, 5, 8]

model.fit(X_train_part, y_train_part, cat_features)
cat_pred = model.predict_proba(X_valid)[:, 1]

In [168]:
roc_auc_score(y_valid, cat_pred)

0.7389732026358325

In [105]:
#best: 0.7596488990192972

In [132]:
pd.concat([pd.DataFrame(model.feature_importances_), pd.DataFrame(full_df.columns)], axis=1)

Unnamed: 0,0,0.1
0,6.033978,Month
1,6.944982,DayofMonth
2,4.59022,DayOfWeek
3,10.218987,UniqueCarrier
4,13.87117,Origin
5,12.770519,Dest
6,6.011408,Distance
7,30.847851,DepHour
8,0.197117,morning
9,1.243581,day


### Well, now let's do final prediction and load it to file.csv

In [170]:
model.fit(X_train, y_train, cat_features)
cat_test_pred = model.predict_proba(X_test)[:, 1]
pd.Series(cat_test_pred, name='dep_delayed_15min').to_csv('catboost_1000_with_deptime.csv', index_label='id', header=True)

Learning rate set to 0.061677
0:	learn: 0.6603251	total: 272ms	remaining: 4m 31s
1:	learn: 0.6309649	total: 528ms	remaining: 4m 23s
2:	learn: 0.6062341	total: 724ms	remaining: 4m
3:	learn: 0.5838177	total: 984ms	remaining: 4m 4s
4:	learn: 0.5666185	total: 1.12s	remaining: 3m 42s
5:	learn: 0.5512757	total: 1.31s	remaining: 3m 37s
6:	learn: 0.5365018	total: 1.54s	remaining: 3m 38s
7:	learn: 0.5254396	total: 1.71s	remaining: 3m 32s
8:	learn: 0.5150973	total: 1.92s	remaining: 3m 31s
9:	learn: 0.5064031	total: 2.12s	remaining: 3m 30s
10:	learn: 0.4982096	total: 2.31s	remaining: 3m 28s
11:	learn: 0.4914585	total: 2.5s	remaining: 3m 25s
12:	learn: 0.4861398	total: 2.71s	remaining: 3m 25s
13:	learn: 0.4817516	total: 2.91s	remaining: 3m 24s
14:	learn: 0.4769846	total: 3.1s	remaining: 3m 23s
15:	learn: 0.4727935	total: 3.38s	remaining: 3m 27s
16:	learn: 0.4695170	total: 3.64s	remaining: 3m 30s
17:	learn: 0.4663510	total: 3.89s	remaining: 3m 32s
18:	learn: 0.4636816	total: 4.09s	remaining: 3m 31s

159:	learn: 0.4080105	total: 32.5s	remaining: 2m 50s
160:	learn: 0.4079458	total: 32.7s	remaining: 2m 50s
161:	learn: 0.4078873	total: 33s	remaining: 2m 50s
162:	learn: 0.4077483	total: 33.2s	remaining: 2m 50s
163:	learn: 0.4076877	total: 33.4s	remaining: 2m 50s
164:	learn: 0.4076050	total: 33.6s	remaining: 2m 50s
165:	learn: 0.4075318	total: 33.8s	remaining: 2m 50s
166:	learn: 0.4074498	total: 34.1s	remaining: 2m 49s
167:	learn: 0.4073651	total: 34.3s	remaining: 2m 49s
168:	learn: 0.4073074	total: 34.5s	remaining: 2m 49s
169:	learn: 0.4071977	total: 34.7s	remaining: 2m 49s
170:	learn: 0.4071554	total: 34.9s	remaining: 2m 49s
171:	learn: 0.4070659	total: 35.1s	remaining: 2m 48s
172:	learn: 0.4069518	total: 35.3s	remaining: 2m 48s
173:	learn: 0.4068572	total: 35.5s	remaining: 2m 48s
174:	learn: 0.4067957	total: 35.7s	remaining: 2m 48s
175:	learn: 0.4066974	total: 35.9s	remaining: 2m 48s
176:	learn: 0.4066350	total: 36.2s	remaining: 2m 48s
177:	learn: 0.4065777	total: 36.4s	remaining: 2m

315:	learn: 0.3978016	total: 1m 10s	remaining: 2m 33s
316:	learn: 0.3977647	total: 1m 11s	remaining: 2m 33s
317:	learn: 0.3977300	total: 1m 11s	remaining: 2m 32s
318:	learn: 0.3977026	total: 1m 11s	remaining: 2m 32s
319:	learn: 0.3976528	total: 1m 11s	remaining: 2m 32s
320:	learn: 0.3975938	total: 1m 11s	remaining: 2m 32s
321:	learn: 0.3975467	total: 1m 12s	remaining: 2m 32s
322:	learn: 0.3975080	total: 1m 12s	remaining: 2m 31s
323:	learn: 0.3974207	total: 1m 12s	remaining: 2m 31s
324:	learn: 0.3973555	total: 1m 12s	remaining: 2m 31s
325:	learn: 0.3973066	total: 1m 13s	remaining: 2m 31s
326:	learn: 0.3972760	total: 1m 13s	remaining: 2m 31s
327:	learn: 0.3972279	total: 1m 13s	remaining: 2m 31s
328:	learn: 0.3971776	total: 1m 13s	remaining: 2m 30s
329:	learn: 0.3971334	total: 1m 14s	remaining: 2m 30s
330:	learn: 0.3970878	total: 1m 14s	remaining: 2m 30s
331:	learn: 0.3970552	total: 1m 14s	remaining: 2m 30s
332:	learn: 0.3970246	total: 1m 15s	remaining: 2m 30s
333:	learn: 0.3970021	total:

468:	learn: 0.3913621	total: 1m 48s	remaining: 2m 2s
469:	learn: 0.3913120	total: 1m 48s	remaining: 2m 2s
470:	learn: 0.3912807	total: 1m 48s	remaining: 2m 2s
471:	learn: 0.3912254	total: 1m 49s	remaining: 2m 1s
472:	learn: 0.3911967	total: 1m 49s	remaining: 2m 1s
473:	learn: 0.3911643	total: 1m 49s	remaining: 2m 1s
474:	learn: 0.3911225	total: 1m 49s	remaining: 2m 1s
475:	learn: 0.3910846	total: 1m 50s	remaining: 2m 1s
476:	learn: 0.3910210	total: 1m 50s	remaining: 2m 1s
477:	learn: 0.3909658	total: 1m 50s	remaining: 2m
478:	learn: 0.3909127	total: 1m 50s	remaining: 2m
479:	learn: 0.3908693	total: 1m 51s	remaining: 2m
480:	learn: 0.3908134	total: 1m 51s	remaining: 2m
481:	learn: 0.3907733	total: 1m 51s	remaining: 1m 59s
482:	learn: 0.3907177	total: 1m 51s	remaining: 1m 59s
483:	learn: 0.3906670	total: 1m 52s	remaining: 1m 59s
484:	learn: 0.3906219	total: 1m 52s	remaining: 1m 59s
485:	learn: 0.3905909	total: 1m 52s	remaining: 1m 59s
486:	learn: 0.3905592	total: 1m 52s	remaining: 1m 59s

622:	learn: 0.3862009	total: 2m 17s	remaining: 1m 23s
623:	learn: 0.3861788	total: 2m 18s	remaining: 1m 23s
624:	learn: 0.3861510	total: 2m 18s	remaining: 1m 22s
625:	learn: 0.3861213	total: 2m 18s	remaining: 1m 22s
626:	learn: 0.3860933	total: 2m 18s	remaining: 1m 22s
627:	learn: 0.3860679	total: 2m 19s	remaining: 1m 22s
628:	learn: 0.3860306	total: 2m 19s	remaining: 1m 22s
629:	learn: 0.3860018	total: 2m 19s	remaining: 1m 22s
630:	learn: 0.3859655	total: 2m 19s	remaining: 1m 21s
631:	learn: 0.3859313	total: 2m 20s	remaining: 1m 21s
632:	learn: 0.3859126	total: 2m 20s	remaining: 1m 21s
633:	learn: 0.3858643	total: 2m 20s	remaining: 1m 21s
634:	learn: 0.3858408	total: 2m 21s	remaining: 1m 21s
635:	learn: 0.3858097	total: 2m 21s	remaining: 1m 20s
636:	learn: 0.3857765	total: 2m 21s	remaining: 1m 20s
637:	learn: 0.3857476	total: 2m 22s	remaining: 1m 20s
638:	learn: 0.3857204	total: 2m 22s	remaining: 1m 20s
639:	learn: 0.3856822	total: 2m 22s	remaining: 1m 20s
640:	learn: 0.3856705	total:

776:	learn: 0.3814544	total: 2m 54s	remaining: 50s
777:	learn: 0.3814248	total: 2m 54s	remaining: 49.8s
778:	learn: 0.3814123	total: 2m 54s	remaining: 49.5s
779:	learn: 0.3813866	total: 2m 54s	remaining: 49.3s
780:	learn: 0.3813557	total: 2m 55s	remaining: 49.1s
781:	learn: 0.3813306	total: 2m 55s	remaining: 48.9s
782:	learn: 0.3813083	total: 2m 55s	remaining: 48.6s
783:	learn: 0.3812638	total: 2m 55s	remaining: 48.4s
784:	learn: 0.3812225	total: 2m 56s	remaining: 48.2s
785:	learn: 0.3811778	total: 2m 56s	remaining: 48s
786:	learn: 0.3811515	total: 2m 56s	remaining: 47.8s
787:	learn: 0.3811186	total: 2m 57s	remaining: 47.6s
788:	learn: 0.3810846	total: 2m 57s	remaining: 47.4s
789:	learn: 0.3810435	total: 2m 57s	remaining: 47.2s
790:	learn: 0.3809986	total: 2m 57s	remaining: 47s
791:	learn: 0.3809751	total: 2m 58s	remaining: 46.7s
792:	learn: 0.3809587	total: 2m 58s	remaining: 46.5s
793:	learn: 0.3809339	total: 2m 58s	remaining: 46.3s
794:	learn: 0.3809049	total: 2m 58s	remaining: 46.1s

933:	learn: 0.3769961	total: 3m 24s	remaining: 14.5s
934:	learn: 0.3769589	total: 3m 25s	remaining: 14.3s
935:	learn: 0.3769350	total: 3m 25s	remaining: 14s
936:	learn: 0.3769144	total: 3m 25s	remaining: 13.8s
937:	learn: 0.3768654	total: 3m 25s	remaining: 13.6s
938:	learn: 0.3768442	total: 3m 25s	remaining: 13.4s
939:	learn: 0.3768148	total: 3m 25s	remaining: 13.1s
940:	learn: 0.3767905	total: 3m 26s	remaining: 12.9s
941:	learn: 0.3767632	total: 3m 26s	remaining: 12.7s
942:	learn: 0.3767310	total: 3m 26s	remaining: 12.5s
943:	learn: 0.3767097	total: 3m 26s	remaining: 12.3s
944:	learn: 0.3766857	total: 3m 27s	remaining: 12s
945:	learn: 0.3766596	total: 3m 27s	remaining: 11.8s
946:	learn: 0.3766376	total: 3m 27s	remaining: 11.6s
947:	learn: 0.3766033	total: 3m 27s	remaining: 11.4s
948:	learn: 0.3765807	total: 3m 27s	remaining: 11.2s
949:	learn: 0.3765510	total: 3m 28s	remaining: 11s
950:	learn: 0.3765231	total: 3m 28s	remaining: 10.7s
951:	learn: 0.3764823	total: 3m 28s	remaining: 10.5s