In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [93]:
# Read the labelled training file and the test file.
train_data_initial = pd.read_csv('flight_delays_train.csv')
test_data_initial = pd.read_csv('flight_delays_test.csv')

# Give the test frame a placeholder target column so both frames share one schema.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
test_data_initial['dep_delayed_15min'] = np.nan

# Stack train on top of test so feature engineering is applied to both in one pass.
# ignore_index=True gives every row a unique label; without it the two halves share
# the labels 0..n-1, which makes any later label-based alignment ambiguous.
data_initial = pd.concat([train_data_initial, test_data_initial], ignore_index=True)

# Number of training rows: positions [0, train_indx) are train, the rest are test.
train_indx = train_data_initial.shape[0]


In [94]:
print(data_initial.shape)
data_initial.head()

(200000, 9)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [95]:
def time_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the calendar columns to integers and derive the departure hour.

    'Month', 'DayofMonth' and 'DayOfWeek' arrive as strings with a two-character
    prefix (e.g. 'c-8'); the prefix is stripped and the remainder cast to int.
    'DepTime_h' is added as ``DepTime // 100``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing 'Month', 'DayofMonth', 'DayOfWeek' (string) and
        'DepTime' (numeric) columns.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is left untouched, so the function can be
        called again on the same raw frame and give the same result.
    """
    time_cols = ['Month', 'DayofMonth', 'DayOfWeek']

    # Work on a copy: mutating the caller's frame would leave the raw data with
    # integer columns, and a second call would then fail on the `.str` accessor.
    out = df.copy()

    for col in time_cols:
        out[col] = out[col].str[2:].astype('int')

    out['DepTime_h'] = out['DepTime'] // 100

    return out



In [96]:
data = time_transform(data_initial)

data.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,DepTime_h
0,8,21,7,1934,AA,ATL,DFW,732,N,19
1,4,20,3,1548,US,PIT,MCO,834,N,15
2,9,2,5,1422,XE,RDU,CLE,416,N,14
3,11,25,6,1015,OO,DEN,MEM,872,N,10
4,10,7,6,1828,WN,MDW,OMA,423,Y,18


In [97]:
# Binary-encode the target ('N' -> 0, 'Y' -> 1); anything outside the mapping becomes NaN.
target_codes = {'N': 0, 'Y': 1}
data['dep_delayed_15min'] = data['dep_delayed_15min'].map(target_codes)

# Fuse origin and destination into a single route key, then discard the two source columns.
data['flight'] = data['Origin'] + data['Dest']
data = data.drop(['Origin', 'Dest'], axis=1)

In [98]:
data.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Distance,dep_delayed_15min,DepTime_h,flight
0,8,21,7,1934,AA,732,0.0,19,ATLDFW
1,4,20,3,1548,US,834,0.0,15,PITMCO
2,9,2,5,1422,XE,416,0.0,14,RDUCLE
3,11,25,6,1015,OO,872,0.0,10,DENMEM
4,10,7,6,1828,WN,423,1.0,18,MDWOMA


In [99]:
def transform_categ_mean(
    df: pd.DataFrame,
    cols=('UniqueCarrier', 'flight'),
    target='dep_delayed_15min',
) -> pd.DataFrame:
    """Replace categorical columns with the mean of the target per category.

    For every column in ``cols`` the mean of ``target`` is computed per level
    (rows with a missing target are ignored by the mean) and each value is
    replaced by its level's mean. Levels whose mean is undefined because none of
    their rows has a target get 0.

    NOTE(review): the means are computed from every labelled row passed in. In
    this notebook the function runs before the train/hold-out split, so hold-out
    targets contribute to the encoding and hold-out scores are optimistic.
    Consider fitting the encoding on the training split only.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``cols`` and ``target``.
    cols : sequence of str, optional
        Columns to encode. Defaults to ('UniqueCarrier', 'flight').
    target : str, optional
        Name of the numeric target column. Defaults to 'dep_delayed_15min'.

    Returns
    -------
    pd.DataFrame
        A new frame with ``cols`` replaced by float64 encodings; the input frame
        is not modified.
    """
    encoded = df.copy()

    for col in cols:
        # Always group the untouched input so encoding one column can never
        # influence the statistics of another.
        level_means = df.groupby(col)[target].mean().fillna(0)
        # `.to_dict()` yields exactly {level: mean}; the previous
        # `dict(series, index=...)` also injected a stray 'index' key.
        encoded[col] = df[col].map(level_means.to_dict()).astype('float64')

    return encoded

In [100]:
#df_categ = data[['UniqueCarrier', 'flight','dep_delayed_15min']]


In [101]:
# av_mean=df_categ.groupby(by=['UniqueCarrier']).agg({'dep_delayed_15min':'mean'})
# av_mean=av_mean.fillna(0)
# av_mean.head()

In [102]:
# av_mean_dict = dict(av_mean['dep_delayed_15min'], index=av_mean.index)
# av_mean_dict

In [103]:
# data['New'] = data['UniqueCarrier'].map(av_mean_dict)
# data.head()

In [104]:
    
#     for col in ['UniqueCarrier', 'flight']:
        
#         av_mean = df_categ.groupby(col).agg({'dep_delayed_15min':'mean'})
#         av_mean_dict = dict(av_mean)
#         df[col] = df[col].map(av_mean_dict)

In [105]:
data_processed = transform_categ_mean(data)
print(data_processed.shape)
data_processed.head()

(200000, 9)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Distance,dep_delayed_15min,DepTime_h,flight
0,8,21,7,1934,0.187938,732,0.0,19,0.235294
1,4,20,3,1548,0.167387,834,0.0,15,0.166667
2,9,2,5,1422,0.173869,416,0.0,14,0.090909
3,11,25,6,1015,0.172801,872,0.0,10,0.142857
4,10,7,6,1828,0.213433,423,1.0,18,0.466667


In [106]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder


In [107]:
def transform_categs(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode 'UniqueCarrier' and 'flight'.

    The two columns are removed and replaced by one indicator column per
    category, named by the encoder's ``get_feature_names_out``. The indicator
    columns are appended after the remaining columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing 'UniqueCarrier' and 'flight'.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index. The indicator block is a dense
        float array of shape (n_rows, n_categories), so memory grows with the
        number of distinct carriers and routes.
    """
    categ_cols = ['UniqueCarrier', 'flight']

    encoder = OneHotEncoder()
    # Densify explicitly instead of through the constructor flag: the flag was
    # renamed from `sparse` to `sparse_output` (the old name was later removed),
    # whereas `.toarray()` on the default sparse result works on every version.
    indicator_values = encoder.fit_transform(df[categ_cols].astype('category')).toarray()

    df_transformed = pd.DataFrame(
        data=indicator_values,
        columns=encoder.get_feature_names_out(categ_cols),
    )

    # drop=True discards the old index outright, so this also works when the
    # index is named or when the frame already has a column called 'index'.
    remaining = df.drop(columns=categ_cols).reset_index(drop=True)

    return pd.concat([remaining, df_transformed], axis=1)

In [108]:
# data_processed = transform_categs(data)
# data_processed.shape

In [109]:
from sklearn.preprocessing import StandardScaler


def scale_time(
    df: pd.DataFrame,
    cols=('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'Distance', 'DepTime_h'),
) -> pd.DataFrame:
    """Standardise the numeric columns with ``StandardScaler``.

    The scaler is fitted on the rows of ``df`` itself. The scaled columns keep
    their names and are appended after the columns that were not scaled.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column listed in ``cols``.
    cols : sequence of str, optional
        Columns to scale. Defaults to the calendar, departure-time and distance
        columns used in this notebook.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index; the input frame is not modified.
    """
    cols = list(cols)

    scaler = StandardScaler()
    # StandardScaler maps each input column to exactly one output column, so the
    # original names can be reused directly.
    df_transformed = pd.DataFrame(data=scaler.fit_transform(df[cols]), columns=cols)

    # drop=True discards the old index outright, so this also works when the
    # index is named or when the frame already has a column called 'index'.
    remaining = df.drop(columns=cols).reset_index(drop=True)

    return pd.concat([remaining, df_transformed], axis=1)





In [110]:
#data_processed = scale_time(data_processed_categ)
#data_processed.shape

In [111]:
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

In [112]:
# The combined frame was stacked train-first, so a positional cut at train_indx
# separates the labelled rows from the unlabelled ones.
train_data = data_processed.iloc[:train_indx]
test_data = data_processed.iloc[train_indx:]

# Split the labelled part into target and features; the test part keeps features only.
train_data_y = train_data['dep_delayed_15min']
train_data_x = train_data.drop('dep_delayed_15min', axis=1)
test_data_x = test_data.drop('dep_delayed_15min', axis=1)

train_data.shape, test_data.shape

((100000, 9), (100000, 9))

In [113]:
X_train, X_hold, y_train, y_hold = train_test_split(
    train_data_x,
    train_data_y,
    test_size=0.3,
    stratify=train_data_y,
    random_state=20
)

In [114]:
model = XGBClassifier(n_estimators = 50, random_state = 20)

In [276]:
# Hyper-parameter grid: only max_depth and max_leaves vary (3 x 3 = 9 combinations);
# every other entry is pinned to a single value.
params = dict(
    n_estimators=[1000],
    max_depth=[5, 7, 10],
    learning_rate=[0.1],
    gamma=[3],
    colsample_bytree=[0.3],
    max_leaves=[5, 7, 10],
)

# 3-fold cross-validated search scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=20),
    param_grid=params,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                   

In [277]:
grid_search.best_params_, grid_search.best_score_

({'colsample_bytree': 0.3,
  'gamma': 3,
  'learning_rate': 0.1,
  'max_depth': 5,
  'max_leaves': 5,
  'n_estimators': 1000},
 0.7714351986672572)

In [278]:
best_model_first = grid_search.best_estimator_

In [279]:
y_pred = best_model_first.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, y_pred)

0.8037994280684874

In [280]:
y_pred = best_model_first.predict_proba(X_hold)[:, 1]
roc_auc_score(y_hold, y_pred)

0.7788975395900031

In [269]:
XGBClassifier?

In [281]:
y_pred = best_model_first.predict_proba(test_data_x)[:, 1]
result = pd.DataFrame(data={'dep_delayed_15min': y_pred})
result.to_csv('predict6.csv', header=True, index_label='id')

In [118]:
# Feature importances of the tuned model, largest first, with a running total
# so the cumulative share covered by the top-k features can be read off.
df_feat = (
    pd.DataFrame({
        'feat_name': best_model_first.feature_names_in_,
        'importance': best_model_first.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .assign(cum=lambda ranked: ranked['importance'].cumsum())
)

In [119]:
# Features sitting past the point where cumulative importance exceeds 0.999.
beyond_cutoff = df_feat['cum'] > 0.999
usless_feats = df_feat.loc[beyond_cutoff, 'feat_name']
df_feat[beyond_cutoff].shape, df_feat.shape

((2, 3), (8, 3))

In [None]:
# hyperopt's search-history class is spelled `Trials`; importing `Trails` raises ImportError.
from hyperopt import Trials

In [None]:
# Container that records every evaluated hyper-parameter set and its loss.
# The class is `hyperopt.Trials`; it is imported here so the cell runs on its own.
from hyperopt import Trials

trails = Trials()

In [None]:
def model_func(params):
    """Hyperopt objective: negated mean 3-fold ROC AUC of an XGBClassifier.

    Parameters
    ----------
    params : dict
        One sample from the search space built in ``optimize``. Integer-valued
        settings arrive as floats from ``hp.quniform`` and are cast here.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}``; hyperopt minimises ``loss``,
        so the AUC is negated.
    """
    from hyperopt import STATUS_OK

    candidate = XGBClassifier(
        n_estimators=int(params['num_rounds']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        gamma=params['gamma'],
        colsample_bytree=params['colsample_bytree'],
        eval_metric=params['eval_metric'],
        random_state=20,
    )
    # Same protocol as the grid search in this notebook: 3 folds on the training
    # split, scored by ROC AUC.
    auc = cross_val_score(candidate, X_train, y_train, cv=3, scoring='roc_auc').mean()

    return {'loss': -auc, 'status': STATUS_OK}


def optimize(model_func, max_evals=50, trials=None):
    """Run a TPE search over XGBoost hyper-parameters.

    NOTE(review): the original draft left every range blank; the bounds below
    are starting values chosen during review and should be tuned.

    Parameters
    ----------
    model_func : callable
        Objective taking one sampled parameter dict and returning a hyperopt
        result dict (see ``model_func`` above).
    max_evals : int, optional
        Number of parameter sets to evaluate.
    trials : hyperopt.Trials, optional
        History object to record evaluations into; a fresh one is created when
        omitted.

    Returns
    -------
    dict
        Best value found for each searched parameter, as returned by ``fmin``.
    """
    from hyperopt import Trials, fmin, hp, tpe

    if trials is None:
        trials = Trials()

    space = {
        'num_rounds': 100,
        'max_depth': hp.quniform('max_depth', 3, 10, 1),
        # loguniform takes its bounds in log space.
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
        'gamma': hp.uniform('gamma', 0, 5),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
        # xgboost's name for ROC AUC is 'auc'; 'roc_auc' is the scikit-learn scorer name.
        'eval_metric': 'auc',
    }

    best = fmin(
        fn=model_func,
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
    )

    return best
    

In [120]:
# Strip the low-importance columns from every feature matrix in one pass.
X_train_opt, X_hold_opt, train_data_x_opt, test_data_x_opt = (
    frame.drop(columns=usless_feats)
    for frame in (X_train, X_hold, train_data_x, test_data_x)
)

In [121]:
#params = {'n_estimators': [90], 'max_depth': [7], 'learning_rate': [0.3]}

#grid_search = GridSearchCV(estimator=XGBClassifier(random_state = 20), param_grid=params, cv=3, scoring='roc_auc', n_jobs=-1)

#grid_search.fit(X_train_opt, y_train)

In [122]:
#grid_search.best_params_, grid_search.best_score_

In [123]:
#XGBClassifier?

In [170]:
%time
model = XGBClassifier(n_estimators=1000,max_depth=50,learning_rate=1, random_state=20)
model.fit(X_train_opt, y_train)

Wall time: 0 ns


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=50, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=20, ...)

In [171]:
y_pred = model.predict_proba(X_train_opt)[:, 1]
roc_auc_score(y_train, y_pred)

0.9999999993381468

In [172]:
y_pred = model.predict_proba(X_hold_opt)[:, 1]
roc_auc_score(y_hold, y_pred)

0.7192015025754905

In [169]:
y_pred = model.predict_proba(train_data_x_opt)[:, 1]
roc_auc_score(train_data_y, y_pred)

0.9356748777777684

In [161]:
y_pred = model.predict_proba(test_data_x_opt)[:, 1]

In [148]:
# Save these predictions under their own file name. An earlier cell already writes
# best_model_first's predictions to 'predict6.csv'; reusing that name here would
# silently overwrite them when the notebook is run top to bottom.
result = pd.DataFrame(data={'dep_delayed_15min': y_pred})
result.to_csv('predict6_reduced_features.csv', header=True, index_label='id')