In [285]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [286]:
# Load both CSVs and stack them so the same feature engineering is applied
# to the labelled (train) and unlabelled (test) rows.
train_data_initial = pd.read_csv('flight_delays_train.csv')
test_data_initial = pd.read_csv('flight_delays_test.csv')
# The test rows get a NaN target column so both frames share the same columns.
# Use np.nan: the np.NaN alias was removed in NumPy 2.0.
test_data_initial['dep_delayed_15min'] = np.nan
# ignore_index gives the stacked frame unique 0..n-1 row labels instead of
# repeating each CSV's own labels.
data_initial = pd.concat([train_data_initial, test_data_initial], ignore_index=True)
# Number of training rows == position at which the test rows start.
train_indx = train_data_initial.shape[0]


In [287]:
# Sanity check: print the combined (rows, columns) and preview the first rows.
print(data_initial.shape)
data_initial.head()

(200000, 9)


Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [288]:
def time_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Decode the prefixed calendar columns to integers and add a departure-hour column.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain string columns 'Month', 'DayofMonth' and 'DayOfWeek' whose
        values carry a two-character prefix (e.g. 'c-8'), and a numeric
        'DepTime' column.

    Returns
    -------
    pd.DataFrame
        A new frame with the three calendar columns converted to int and an
        extra 'DepTime_h' column equal to ``DepTime // 100``. The input frame
        is left untouched, so the calling cell can be re-run safely.
    """
    out = df.copy()

    time_cols = ['Month', 'DayofMonth', 'DayOfWeek']
    for col in time_cols:
        # Drop the two-character prefix ('c-') and keep the numeric part.
        out[col] = out[col].str[2:].astype('int')

    out['DepTime_h'] = out['DepTime'] // 100

    return out



In [289]:
# Apply the calendar/hour transform to the stacked train+test frame and preview the result.
data = time_transform(data_initial)

data.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,DepTime_h
0,8,21,7,1934,AA,ATL,DFW,732,N,19
1,4,20,3,1548,US,PIT,MCO,834,N,15
2,9,2,5,1422,XE,RDU,CLE,416,N,14
3,11,25,6,1015,OO,DEN,MEM,872,N,10
4,10,7,6,1828,WN,MDW,OMA,423,Y,18


In [290]:
# Encode the target as 0/1; any value outside {'N', 'Y'} (e.g. a missing label) becomes NaN via map.
data['dep_delayed_15min'] = data['dep_delayed_15min'].map({'N': 0, 'Y': 1})
# Concatenate origin and destination codes into a single route feature, then drop the two source columns.
data['flight'] = data['Origin']+data['Dest']
data = data.drop(columns=['Origin','Dest'])

In [291]:
#data.head()

In [292]:
# no longer used
def transform_categ_mean(df: pd.DataFrame) -> pd.DataFrame:
    """Replace 'UniqueCarrier' and 'flight' with the mean target of each category.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'UniqueCarrier', 'flight' and a numeric
        'dep_delayed_15min' column (NaN targets are ignored by the mean).

    Returns
    -------
    pd.DataFrame
        A new frame in which both categorical columns hold float64 per-category
        means of 'dep_delayed_15min'. Categories whose targets are all NaN
        get 0. The input frame is not modified.

    Notes
    -----
    NOTE(review): the means are computed on the same rows they are written
    back to, so the encoded features contain each row's own label. Confirm
    this is acceptable before using it for model training.
    """
    encoded = df.copy()

    for col in ['UniqueCarrier', 'flight']:
        # Group on the untouched input so that encoding one column cannot
        # affect the statistics of the other.
        category_means = df.groupby(col)['dep_delayed_15min'].mean().fillna(0)
        # Map directly with the Series (category -> mean). The previous
        # dict(series, index=...) call added a spurious 'index' key whose
        # value was the whole Index object.
        encoded[col] = df[col].map(category_means).astype('float64')

    return encoded

In [293]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder


In [294]:
# NOTE: the original comment here said "no longer used", but this helper is
# still what builds `data_processed` later in this notebook.
def transform_categs(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the 'UniqueCarrier' and 'flight' columns.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'UniqueCarrier' and 'flight'.

    Returns
    -------
    pd.DataFrame
        The remaining columns of ``df`` followed by one dense float indicator
        column per category, named by ``OneHotEncoder.get_feature_names_out``.
        The row index is reset to 0..n-1.
    """
    categ_cols = ['UniqueCarrier', 'flight']

    # Keep the encoder's default (sparse) output and densify explicitly.
    # The `sparse=False` keyword was renamed to `sparse_output` in
    # scikit-learn 1.2 and removed in 1.4; `.toarray()` works on every release.
    OHE = OneHotEncoder()

    df_categ = df[categ_cols].astype('category')

    array_transformed = OHE.fit_transform(df_categ).toarray()
    array_columns = OHE.get_feature_names_out(categ_cols)
    df_transformed = pd.DataFrame(data=array_transformed, columns=array_columns)

    # df_transformed already has a fresh 0..n-1 index; give the remaining
    # columns the same one so the column-wise concat lines rows up by position.
    df = df.drop(columns=categ_cols).reset_index(drop=True)

    return pd.concat([df, df_transformed], axis=1)

In [295]:
# One-hot encode the carrier and route columns of the combined frame.
data_processed = transform_categs(data)
# Alternative (disabled): mean-target encoding instead of one-hot.
# data_processed = transform_categ_mean(data)
# print(data_processed.shape)
# data_processed.head()


In [296]:
from sklearn.preprocessing import StandardScaler

# no longer used
def scale_time(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise the calendar, departure-time and distance columns.

    A ``StandardScaler`` is fitted on 'Month', 'DayofMonth', 'DayOfWeek',
    'DepTime', 'Distance' and 'DepTime_h' of ``df``. The scaled versions are
    appended after the remaining columns, and the returned frame carries a
    fresh positional index.
    """
    numeric_cols = ['Month', 'DayofMonth', 'DayOfWeek','DepTime','Distance','DepTime_h']

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[numeric_cols])
    scaled_part = pd.DataFrame(
        data=scaled_values,
        columns=scaler.get_feature_names_out(numeric_cols),
    )

    # Both halves get a positional index so the column-wise concat pairs rows by position.
    untouched_part = df.drop(columns=numeric_cols).reset_index().drop(columns=['index'])
    scaled_part = scaled_part.reset_index().drop(columns=['index'])

    return pd.concat([untouched_part, scaled_part], axis=1)





In [297]:
#data_processed = scale_time(data_processed_categ)
#data_processed.shape

In [298]:
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

In [299]:
# The first `train_indx` rows are the labelled block; everything after is the unlabelled block.
train_data = data_processed.iloc[:train_indx]
test_data = data_processed.iloc[train_indx:]

# Separate the target from the features; the unlabelled block only needs its features.
train_data_y = train_data['dep_delayed_15min']
train_data_x = train_data.drop('dep_delayed_15min', axis=1)
test_data_x = test_data.drop('dep_delayed_15min', axis=1)

train_data.shape, test_data.shape

((100000, 5078), (100000, 5078))

In [300]:
# Hold out 30% of the labelled rows for validation, stratified on the target, with a fixed seed.
X_train, X_hold, y_train, y_hold = train_test_split(
    train_data_x,
    train_data_y,
    test_size=0.3,
    stratify=train_data_y,
    random_state=20
)

In [301]:
# NOTE(review): this instance is never fitted in the visible cells; `model` is reassigned before the final fit.
model = XGBClassifier(n_estimators = 50, random_state = 20)

In [None]:
# Every list holds a single value, so the "grid" is one candidate evaluated
# with 3-fold cross-validated ROC AUC.
params = {
    'n_estimators': [50],
    'max_depth': [7],
    'learning_rate': [0.1],
    'gamma': [3],
    'colsample_bytree': [0.3],
    'max_leaves': [5],
}

# `verbosity` is the XGBClassifier constructor argument that controls log
# output. `verbose` is not a constructor parameter and was only forwarded
# as an unknown keyword.
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=20, verbosity=0),
    param_grid=params,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

In [None]:
#grid_search.best_params_, grid_search.best_score_

In [None]:
# Estimator refitted by GridSearchCV with the best parameter combination.
best_model_first = grid_search.best_estimator_

In [None]:
# ROC AUC of the grid-search model on the rows it was fitted on (column 1 = probability of class 1).
y_pred = best_model_first.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, y_pred)

In [None]:
# ROC AUC of the grid-search model on the held-out split.
y_pred = best_model_first.predict_proba(X_hold)[:, 1]
roc_auc_score(y_hold, y_pred)

In [None]:
# Rank features by importance (largest first) and add the running total of importance.
df_feat = (
    pd.DataFrame(
        data={
            'feat_name': best_model_first.feature_names_in_,
            'importance': best_model_first.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)
df_feat['cum'] = df_feat['importance'].cumsum()

In [None]:
# Features in the tail of the ranking, i.e. rows where the cumulative importance has already exceeded 0.999.
usless_feats = df_feat[df_feat['cum']>0.999]['feat_name']
# Show the size of that tail next to the size of the full table.
df_feat[df_feat['cum']>0.999].shape, df_feat.shape

In [None]:
# Remove the low-importance columns from every feature matrix so they all keep the same column set.
X_train_opt = X_train.drop(columns=usless_feats)
X_hold_opt = X_hold.drop(columns=usless_feats)
train_data_x_opt = train_data_x.drop(columns=usless_feats)
test_data_x_opt = test_data_x.drop(columns=usless_feats)

In [None]:
#params = {'n_estimators': [90], 'max_depth': [7], 'learning_rate': [0.3]}

#grid_search = GridSearchCV(estimator=XGBClassifier(random_state = 20), param_grid=params, cv=3, scoring='roc_auc', n_jobs=-1)

#grid_search.fit(X_train_opt, y_train)

In [None]:
#grid_search.best_params_, grid_search.best_score_

In [None]:
#XGBClassifier?

In [None]:
%time
model = XGBClassifier(n_estimators=1000,max_depth=50,learning_rate=1, random_state=20)
model.fit(X_train_opt, y_train)

In [None]:
# ROC AUC of the final model on the rows it was fitted on.
y_pred = model.predict_proba(X_train_opt)[:, 1]
roc_auc_score(y_train, y_pred)

In [None]:
# ROC AUC of the final model on the held-out split.
y_pred = model.predict_proba(X_hold_opt)[:, 1]
roc_auc_score(y_hold, y_pred)

In [None]:
# ROC AUC over all labelled rows; this includes the rows the model was fitted on, so it is not an out-of-sample score.
y_pred = model.predict_proba(train_data_x_opt)[:, 1]
roc_auc_score(train_data_y, y_pred)

In [None]:
# Probability of class 1 for the unlabelled rows; this is the array written to the output file.
y_pred = model.predict_proba(test_data_x_opt)[:, 1]

In [148]:
# Write the predicted probabilities to CSV; the row index is saved as a column labelled 'id'.
result = pd.DataFrame(data={'dep_delayed_15min': y_pred})
result.to_csv('predict6.csv', header=True, index_label='id')