In [92]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

**Read the data**

In [93]:
# Directory (relative to the notebook) that the CSV files are read from.
PATH_TO_DATA = Path('data')

In [94]:
# Training data; the preview below shows it carries the `dep_delayed_15min` target.
train_df = pd.read_csv(PATH_TO_DATA.joinpath('flight_delays_train.csv'))

In [95]:
# Preview the first five raw training rows.
train_df.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [96]:
# Test data; same feature columns as the training file, without the target.
test_df = pd.read_csv(PATH_TO_DATA.joinpath('flight_delays_test.csv'))

In [97]:
# Preview the first five raw test rows.
test_df.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


**Create features: “flight” (the origin–destination route), “season” and “DepTimeH” (departure hour). This is the part you need to improve – add more features.**

In [98]:
def add_features(df):
    """Add engineered features to ``df`` in place.

    New columns:

    * ``flight``   - route label built as ``Origin + '-->' + Dest``.
    * ``season``   - season name looked up from the ``Month`` code
      (e.g. ``'c-12'`` -> ``'winter'``), cast to ``object`` dtype.
    * ``DepTimeH`` - departure hour, ``DepTime // 100`` wrapped onto 0-23,
      cast to ``object`` dtype so that the dtype-based selection of
      categorical columns picks it up.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Month``, ``DepTime``, ``Origin`` and ``Dest`` columns.
        It is modified in place.

    Returns
    -------
    None
        The caller's frame is mutated; nothing is returned.
    """
    season_dict = {'c-12': 'winter', 'c-1': 'winter', 'c-2': 'winter',
                   'c-3': 'spring', 'c-4': 'spring', 'c-5': 'spring',
                   'c-6': 'summer', 'c-7': 'summer', 'c-8': 'summer',
                   'c-9': 'autumn', 'c-10': 'autumn', 'c-11': 'autumn'}

    df['flight'] = df['Origin'] + '-->' + df['Dest']
    df['season'] = df['Month'].map(season_dict).astype('object')

    # Vectorised hour extraction. DepTime is HHMM and can run past midnight
    # (e.g. 2400, 2534), so wrap the hour with a modulo instead of
    # special-casing individual values.
    df['DepTimeH'] = ((df['DepTime'] // 100) % 24).astype('object')

In [99]:
def drop_features(df):
    """Return a copy of ``df`` without the ``Month`` and ``DayofMonth`` columns.

    The input frame is not modified, so the result has to be assigned by the
    caller, e.g. ``train_df = drop_features(train_df)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Month`` and ``DayofMonth`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the two columns removed.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    """
    # The earlier version rebound the local name and returned nothing, so the
    # dropped frame was silently discarded.
    return df.drop(columns=['Month', 'DayofMonth'])

In [100]:
# add_features mutates its argument, so both frames gain the new columns here.
for _df in (train_df, test_df):
    add_features(_df)

In [101]:
# Check that the engineered columns were appended to the training frame.
train_df.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,flight,season,DepTimeH
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,ATL-->DFW,summer,19
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,PIT-->MCO,spring,15
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,RDU-->CLE,autumn,14
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,DEN-->MEM,autumn,10
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,MDW-->OMA,autumn,18


In [102]:
# Feature matrix: every training column except the target.
X_train = train_df.drop(columns='dep_delayed_15min').to_numpy()
# Binary target: 'Y' -> 1, 'N' -> 0.
y_train = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).to_numpy()
# The test frame has no target column, so it is used as is.
X_test = test_df.to_numpy()

**Allocate a hold-out set (a.k.a. a validation set) to validate the model**

In [103]:
# Hold out 30% of the training rows for validation; the seed fixes the split.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=17,
)

**Remember indexes of categorical features (to be passed to CatBoost)**

In [104]:
# Look at the columns again before picking out the categorical ones.
train_df.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,flight,season,DepTimeH
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,ATL-->DFW,summer,19
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,PIT-->MCO,spring,15
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,RDU-->CLE,autumn,14
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,DEN-->MEM,autumn,10
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,MDW-->OMA,autumn,18


In [105]:
# Positions (within the feature columns, target excluded) whose dtype is object.
categ_feat_idx = np.flatnonzero(
    train_df.drop(columns='dep_delayed_15min').dtypes == 'object'
)
print(categ_feat_idx)

[ 0  1  2  4  5  6  8  9 10]


**Train CatBoost with default arguments, passing only the indices of categorical features.**

In [106]:
# Model settings; presumably meant to be unpacked into CatBoostClassifier(**params).
# NOTE(review): the recorded output of the training cell shows a CUDA driver
# error with task_type='GPU' - confirm a compatible GPU is available.
params = dict(
    eval_metric='AUC',
    cat_features=categ_feat_idx,
    verbose=200,
    random_seed=17,
    task_type='GPU',
    border_count=32,
    # early_stopping_rounds=200,
)

In [108]:
#ctb = CatBoostClassifier(eval_metric='AUC', random_seed=17, task_type='GPU', border_count=32)

In [109]:
%%time
#ctb.fit(X_train_part, y_train_part, eval_set=(X_valid, y_valid), use_best_model=True, plot=True,
#       verbose=200, cat_features=categ_feat_idx)
#ctb.fit(X_train_part, y_train_part, cat_features=categ_feat_idx)
#get_feature_importance(prettified=True)

Learning rate set to 0.055755


CatboostError: catboost/cuda/cuda_lib/cuda_base.h:268: CUDA error 35: CUDA driver version is insufficient for CUDA runtime version

In [81]:
#ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

CatboostError: There is no trained model to use predict(). Use fit() to train model. Then use predict().

**We got a ROC AUC of about 0.756 on the hold-out set.**

In [16]:
#roc_auc_score(y_valid, ctb_valid_pred)

**Train on the whole train set, make prediction on the test set.**

In [87]:
%%time
#ctb.fit(X_train, y_train, use_best_model=True, plot=True);

TypeError: must be real number, not pandas._libs.interval.Interval

In [18]:
#ctb_test_pred = ctb.predict_proba(X_test)[:, 1]

In [19]:
# Submission-writing code, currently disabled by wrapping it in a string
# literal. If re-enabled it would read sample_submission.csv (indexed by
# `id`), store `ctb_test_pred` in the `dep_delayed_15min` column and write
# the result to ctb_pred.csv.
'''with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', 
                             index_col='id')
    sample_sub['dep_delayed_15min'] = ctb_test_pred
    sample_sub.to_csv('ctb_pred.csv')'''

In [20]:
#!head ctb_pred.csv

id,dep_delayed_15min
0,0.024191777928279132
1,0.0308209569759969
2,0.03220135003776402
3,0.2827188832526749
4,0.3591878705009816
5,0.07145586111285211
6,0.08524587587349775
7,0.3106740326344117
8,0.21237373000575396


Now it's your turn! Go and improve the model to beat the **"A2 baseline (10 credits)"** – a leaderboard (LB) score of **0.75914**. It's crucial to come up with some good features.

For discussions, stick to the **#a2_kaggle_fall2019** thread in the **mlcourse_ai_news** [ODS Slack](http://opendatascience.slack.com) channel. Serhii Romanenko (@serhii_romanenko) will be there to help. 

Welcome to Kaggle!

<img src='https://habrastorage.org/webt/fs/42/ms/fs42ms0r7qsoj-da4x7yfntwrbq.jpeg' width=50%>
*from the ["Nerd Laughing Loud"](https://www.kaggle.com/general/76963) thread.*