In [1]:
"""
##### NOTE: #####
Run this on Google Colab to take advantage of GPU hardware accelerator and larger RAM
Use saved files on Google Drive, and save the new files to Google Drive
Be sure to uncomment the Google Drive lines (and comment out the local-path lines) before running on Google Colab.
Also, be sure to enable GPU hardware accelerator (runtime options --> GPU)
"""

import os
from timeit import default_timer as timer
import pandas as pd
from imblearn.over_sampling import SMOTE,RandomOverSampler,BorderlineSMOTE,ADASYN
#from imblearn.over_sampling import SMOTENC, KMeansSMOTE,SVMSMOTE  #not working yet
from imblearn.under_sampling import EditedNearestNeighbours,AllKNN,NearMiss,NeighbourhoodCleaningRule,OneSidedSelection,RandomUnderSampler,TomekLinks
#from imblearn.under_sampling import ClusterCentroids, CondensedNearestNeighbour, RepeatedEditedNearestNeighbours #not working yet
from imblearn.combine import SMOTEENN, SMOTETomek

#from google.colab import drive
#drive.mount('/content/drive')

import torch
import tensorflow as tf
# Ask TensorFlow for a GPU device name. The return value is not stored, and
# because this is not the last expression of the cell it is not displayed.
tf.test.gpu_device_name()
# # Make device agnostic code
# Pick "cuda" when PyTorch reports an available CUDA device, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression so the notebook shows the selected device as the cell output.
device

2023-07-21 22:37:26.536856: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'cpu'

In [None]:
# Directory that holds the input CSV and receives the resampled CSVs.
# Swap the two assignments below to run locally instead of on Google Drive.
#datadir = '/Users/madelinefrank/Documents/KSU/Course_Work/5_Summer2023/CS_7265/Project/data'
datadir='/content/drive/MyDrive/KSU/MSCS'

# Feature columns, the single label column, and every column read from the CSV.
feats=['DAYOFWEEK','DAYOFYEAR','CARRIER','ORIG','ORIG_ST','DEST','DEST_ST','CRS_HR_DEP','CRS_HR_ARR','DIST_GRP']
label=['ARR_DEL15_TF']
cols=feats+label  # derived so the three lists cannot drift apart

# Compact dtypes to keep the large table small in memory: uint8 everywhere
# except DAYOFYEAR, which is read as int16 (presumably because it exceeds 255).
dtypes={c: 'uint8' for c in cols if c!='DAYOFYEAR'}
dtypes['DAYOFYEAR']='int16'
# Same mapping without the label column, for reading feature-only files.
dtypes_feats={k: v for k, v in dtypes.items() if k!=label[0]}

# Every method name that ran successfully, including the 'none_None' baseline.
allResampleMethods = ['none_None','over_SMOTE','over_Random','over_BorderlineSMOTE','over_ADASYN','under_EditedNearestNeighbors','under_AllKNN','under_NearMiss','under_OneSidedSelection','under_Random','under_NeighborhoodCleaningRule','under_TomekLinks','combo_SMOTE_ENN','combo_SMOTE_TomekLinks']
# Methods actually run by the resampling loop: the baseline is excluded because
# it performs no resampling and its files are written separately.
resampleMethods = [m for m in allResampleMethods if m!='none_None']
def selectResampleMethod(str):
    """Return a new resampler object for the given method name.

    Args:
        str: Method name, e.g. 'over_SMOTE' or 'under_TomekLinks'. The
            parameter name shadows the builtin but is kept unchanged for
            existing callers.

    Returns:
        A freshly constructed imbalanced-learn resampler, or None for
        'none_None' and for any name that is not listed (no resampling).
    """
    # (name, factory) pairs. Factories are lambdas so that only the requested
    # resampler is ever constructed. Trailing notes are the observed runtimes.
    factories = (
        ('none_None',                      lambda: None),                                                # no resampling
        ('over_SMOTE',                     lambda: SMOTE(random_state=42)),                              # GOOD
        ('over_Random',                    lambda: RandomOverSampler(random_state=42)),                  # GOOD, fast
        ('over_BorderlineSMOTE',           lambda: BorderlineSMOTE(random_state=42)),                    # GOOD
        ('over_ADASYN',                    lambda: ADASYN(random_state=42)),                             # GOOD, 4-5 min
        ('under_EditedNearestNeighbors',   lambda: EditedNearestNeighbours(n_jobs=-1)),                  # GOOD, 6min
        ('under_AllKNN',                   lambda: AllKNN(n_jobs=-1)),                                   # GOOD, 12-15min
        ('under_NearMiss',                 lambda: NearMiss(n_jobs=-1)),                                 # GOOD, 5min
        ('under_OneSidedSelection',        lambda: OneSidedSelection(random_state=42,n_jobs=-1)),        # GOOD, 15min
        ('under_Random',                   lambda: RandomUnderSampler(random_state=42)),                 # GOOD, a few seconds or less
        ('under_NeighborhoodCleaningRule', lambda: NeighbourhoodCleaningRule(n_jobs=-1)),                # GOOD, 10-15min
        ('under_TomekLinks',               lambda: TomekLinks(n_jobs=-1)),                               # GOOD, 5 min
        ('combo_SMOTE_ENN',                lambda: SMOTEENN(random_state=42)),                           # GOOD, 20min
        ('combo_SMOTE_TomekLinks',         lambda: SMOTETomek(random_state=42)),                         # GOOD, 15min
    )
    # Methods that were tried and left out, with the reason:
    #   SMOTENC                         - TypeError: missing required arg 'categorical_features'
    #   KMeansSMOTE                     - RuntimeError: no clusters found with sufficient samples of class 1
    #   SVMSMOTE                        - never finished
    #   ClusterCentroids                - never finished (also emits an `n_init` FutureWarning)
    #   CondensedNearestNeighbour       - never finished
    #   RepeatedEditedNearestNeighbours - never finished
    for method_name, build in factories:
        if str == method_name:
            return build()
    # Default: unknown name means no resampling.
    return None

    
def printDataStats(X,y,resampleMethod):
    """Print the total sample count and the class balance of a data set.

    Args:
        X: Feature table; only its row count (``X.shape[0]``) is used.
        y: Label table with an 'ARR_DEL15_TF' column whose values are summed
            to count the delayed (class 1) samples.
        resampleMethod: Name of the resampling method, used in the header
            line. Any name containing 'none' is reported as "no resampling".

    Returns:
        None. The statistics are written to stdout.
    """
    #print(X.info())
    nData=int(X.shape[0]) # total
    labels=y['ARR_DEL15_TF']
    # Use the column's own vectorised sum when it has one; builtin sum() walks
    # millions of rows one Python object at a time. Plain sequences still work.
    nDelayed=int(labels.sum()) if hasattr(labels,'sum') else int(sum(labels)) # delayed
    nOnTime=nData-nDelayed # not delayed
    if nData:
        pctDelayed=round((nDelayed/nData)*100,3)
        pctOnTime =round(100-pctDelayed,3)
    else:
        # Empty data set: report 0% for both classes instead of dividing by zero.
        pctDelayed=0.0
        pctOnTime =0.0
    if 'none' in resampleMethod:
        print('\nData stats (no resampling):')
    else:
        print(f'\nData stats ({resampleMethod})')
    print(f'   Total samples:          {nData:12d}')
    print(f'   Not Delayed (class 0):  {nOnTime:12d} ... {pctOnTime:.3f}% of total data')
    print(f'   Delayed (class 1):      {nDelayed:12d} ... {pctDelayed:.3f}% of total data')


In [None]:
# One-time setup: run this cell once to (re)create the baseline files
# X_res_none_None.csv and y_res_none_None.csv, i.e. the original data with
# no resampling applied. The resampling loop reads these files.
# (Moving the frame to `device` was attempted but never worked out, so
# everything here runs on the CPU.)
raw_csv=os.path.join(datadir,'ALL_MODEL_DATA.csv')
# Dropping duplicates removes ~0.3% of the rows from each of the two classes.
all_data=pd.read_csv(raw_csv,index_col=False,dtype=dtypes).drop_duplicates()
X,y=all_data[feats],all_data[label]
del all_data  # the combined frame is no longer needed; free the memory
printDataStats(X,y,'none_none')
X_out=os.path.join(datadir,'X_res_none_None.csv')
y_out=os.path.join(datadir,'y_res_none_None.csv')
X.to_csv(X_out,index=False)
y.to_csv(y_out,index=False)

In [None]:
# Runs every configured resampling method on a fresh copy of the baseline
# (non-resampled) data, prints the resulting class balance and runtime, and
# saves each resampled data set as X_res_<method>.csv / y_res_<method>.csv.
n=len(resampleMethods)
for i,r in enumerate(resampleMethods):
    print(f'\n\n\n--------------------{i+1:2d}/{n}-------------------------------')
    print(r)
    resamp=selectResampleMethod(r)
    if resamp is None:
        # selectResampleMethod returns None for 'none_None' and for names it
        # does not know. There is nothing to fit, so skip before loading data
        # instead of failing on None.fit_resample.
        print(f'{r}: no resampler available, skipping')
        continue
    # opens fresh copy of "original" non-resampled data
    X=pd.read_csv(os.path.join(datadir,'X_res_none_None.csv'),index_col=False,dtype=dtypes_feats)
    y=pd.read_csv(os.path.join(datadir,'y_res_none_None.csv'),index_col=False,dtype='uint8')
    s=timer() #start timer
    # Perform resampling
    X_res,y_res=resamp.fit_resample(X,y)
    e=timer() #end timer
    del X,y # delete original X and y data to save space
    printDataStats(X_res,y_res,r) # print data stats
    print(f'{r} runtime: {(e-s)/60} min') #print runtime
    #save resampled data to new files
    y_res.to_csv(os.path.join(datadir,f'y_res_{r}.csv'),index=False)
    X_res.to_csv(os.path.join(datadir,f'X_res_{r}.csv'),index=False)
    

In [3]:
# Fraction of each class removed by dropping duplicates, as a tuple
# (class 0 / not delayed, class 1 / delayed): 1 - (count with duplicates
# dropped / count without dropping), using the class counts from the two
# "Data stats (no resampling)" reports.
1-(9909212/9944494),1-(2324567/2332041)

(0.003547892934522401, 0.0032049179238272663)

Without dropping duplicates

Data stats (no resampling):

   Total samples:              12276535

   Not Delayed (class 0):       9944494 ... 81.004% of total data

   Delayed (class 1):           2332041 ... 18.996% of total data




-------------------------------------------------------

over_SMOTE

   Total samples:              19888988

   Not Delayed (class 0):       9944494 ... 50.000% of total data

   Delayed (class 1):           9944494 ... 50.000% of total data

runtime: 1.7102470267500014 min




--------------------------------------------------------

over_Random

   Total samples:              19888988

   Not Delayed (class 0):       9944494 ... 50.000% of total data

   Delayed (class 1):           9944494 ... 50.000% of total data

runtime: 0.1265190169333361 min




--------------------------------------------------------

over_BorderlineSMOTE

   Total samples:              19888988

   Not Delayed (class 0):       9944494 ... 50.000% of total data

   Delayed (class 1):           9944494 ... 50.000% of total data

runtime: 4.576688885733335 min




--------------------------------------------------------

over_ADASYN

   Total samples:              20653993

   Not Delayed (class 0):       9944494 ... 48.148% of total data

   Delayed (class 1):          10709499 ... 51.852% of total data

runtime: 4.113929089800001 min




--------------------------------------------------------

under_EditedNearestNeighbors

   Total samples:               8353091

   Not Delayed (class 0):       6021050 ... 72.082% of total data

   Delayed (class 1):           2332041 ... 27.918% of total data

runtime: 5.522951694166666 min




--------------------------------------------------------

under_AllKNN

   Total samples:               7988038

   Not Delayed (class 0):       5655997 ... 70.806% of total data

   Delayed (class 1):           2332041 ... 29.194% of total data

runtime: 12.766721452483337 min




--------------------------------------------------------

under_NearMiss

   Total samples:               4664082

   Not Delayed (class 0):       2332041 ... 50.000% of total data

   Delayed (class 1):           2332041 ... 50.000% of total data

runtime: 4.9159594337166705 min




--------------------------------------------------------

under_OneSidedSelection

   Total samples:              11815301

   Not Delayed (class 0):       9483260 ... 80.263% of total data

   Delayed (class 1):           2332041 ... 19.737% of total data

runtime: 13.474105738283333 min




--------------------------------------------------------

under_Random

   Total samples:               4664082

   Not Delayed (class 0):       2332041 ... 50.000% of total data

   Delayed (class 1):           2332041 ... 50.000% of total data

runtime: 0.0499391086166573 min




--------------------------------------------------------

under_NeighborhoodCleaningRule

   Total samples:               8443302

   Not Delayed (class 0):       6111261 ... 72.380% of total data

   Delayed (class 1):           2332041 ... 27.620% of total data

runtime: 11.363556440133333 min




--------------------------------------------------------

under_TomekLinks

   Total samples:              11577126

   Not Delayed (class 0):       9245085 ... 79.856% of total data

   Delayed (class 1):           2332041 ... 20.144% of total data

under_TomekLinks runtime: 5.559590177799994 min




--------------------------------------------------------

combo_SMOTE_ENN

   Total samples:              11023974

   Not Delayed (class 0):       5343087 ... 48.468% of total data

   Delayed (class 1):           5680887 ... 51.532% of total data

runtime: 20.252451183449995 min




--------------------------------------------------------

combo_SMOTE_TomekLinks

   Total samples:              19068736

   Not Delayed (class 0):       9534368 ... 50.000% of total data

   Delayed (class 1):           9534368 ... 50.000% of total data
   
combo_SMOTE_TomekLinks runtime: 17.217875949233335 min


With dropping duplicates

Data stats (no resampling):

   Total samples:              12233779

   Not Delayed (class 0):       9909212 ... 80.999% of total data

   Delayed (class 1):           2324567 ... 19.001% of total data




-------------------------------------------------------

over_SMOTE

   Total samples:              19818424

   Not Delayed (class 0):       9909212 ... 50.000% of total data

   Delayed (class 1):           9909212 ... 50.000% of total data

runtime: 1.7735213822000029 min




--------------------------------------------------------

over_Random

   Total samples:              19818424

   Not Delayed (class 0):       9909212 ... 50.000% of total data

   Delayed (class 1):           9909212 ... 50.000% of total data

runtime: 0.1045707474166799 min




--------------------------------------------------------

over_BorderlineSMOTE

   Total samples:              19818424

   Not Delayed (class 0):       9909212 ... 50.000% of total data

   Delayed (class 1):           9909212 ... 50.000% of total data

runtime: 4.652512656816665 min




--------------------------------------------------------

over_ADASYN

   Total samples:              20594073

   Not Delayed (class 0):       9909212 ... 48.117% of total data

   Delayed (class 1):          10684861 ... 51.883% of total data

runtime: 4.136470490966652 min




--------------------------------------------------------

under_EditedNearestNeighbors

   Total samples:               8311647

   Not Delayed (class 0):       5987080 ... 72.032% of total data

   Delayed (class 1):           2324567 ... 27.968% of total data

runtime: 5.433976183816685 min




--------------------------------------------------------

under_AllKNN

   Total samples:               7941599

   Not Delayed (class 0):       5617032 ... 70.729% of total data

   Delayed (class 1):           2324567 ... 29.271% of total data

runtime: 12.596392185250018 min




--------------------------------------------------------

under_NearMiss

   Total samples:               4649134

   Not Delayed (class 0):       2324567 ... 50.000% of total data

   Delayed (class 1):           2324567 ... 50.000% of total data

runtime: 5.094309101900005 min




--------------------------------------------------------

under_OneSidedSelection

   Total samples:              11768947

   Not Delayed (class 0):       9444380 ... 80.248% of total data

   Delayed (class 1):           2324567 ... 19.752% of total data

runtime: 13.45052021369999 min




--------------------------------------------------------

under_Random

   Total samples:               4649134

   Not Delayed (class 0):       2324567 ... 50.000% of total data

   Delayed (class 1):           2324567 ... 50.000% of total data

runtime: 0.06166807774998233 min




--------------------------------------------------------

under_NeighborhoodCleaningRule

   Total samples:               8403260

   Not Delayed (class 0):       6078693 ... 72.337% of total data

   Delayed (class 1):           2324567 ... 27.663% of total data

runtime: 11.325413986749997 min




--------------------------------------------------------

under_TomekLinks

   Total samples:              11525387

   Not Delayed (class 0):       9200820 ... 79.831% of total data

   Delayed (class 1):           2324567 ... 20.169% of total data

runtime: 5.550571789733325 min




--------------------------------------------------------

combo_SMOTE_ENN

   Total samples:              10950919

   Not Delayed (class 0):       5307890 ... 48.470% of total data

   Delayed (class 1):           5643029 ... 51.530% of total data

runtime: 20.379325736983333 min




--------------------------------------------------------

combo_SMOTE_TomekLinks

   Total samples:              18995946

   Not Delayed (class 0):       9497973 ... 50.000% of total data

   Delayed (class 1):           9497973 ... 50.000% of total data
   
runtime: 17.309690261616666 min
