In [2]:
import os
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [3]:
# The dataset folder lives next to the current working directory
# (i.e. inside the working directory's parent).
current_working_dir = os.getcwd()
parent_directory = os.path.dirname(current_working_dir)
dataset_folder_path = os.path.join(parent_directory, 'dataset')

In [4]:
# Read the training CSV from the dataset folder into a DataFrame.
train_csv_path = os.path.join(dataset_folder_path, "flight_delays_train.csv")
df = pd.read_csv(train_csv_path)

In [5]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Month              100000 non-null  object
 1   DayofMonth         100000 non-null  object
 2   DayOfWeek          100000 non-null  object
 3   DepTime            100000 non-null  int64 
 4   UniqueCarrier      100000 non-null  object
 5   Origin             100000 non-null  object
 6   Dest               100000 non-null  object
 7   Distance           100000 non-null  int64 
 8   dep_delayed_15min  100000 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB


Cleaning Dataset

In [6]:
# The calendar columns are loaded as strings carrying a "c-" marker
# (presumably values such as "c-8" -- confirm against the raw CSV).
# Strip the marker and cast what remains to int.
calendar_type_cols = ["Month", "DayofMonth", "DayOfWeek"]
for col in calendar_type_cols:
    # Skip columns that are absent or already converted, so re-running this
    # cell is a no-op instead of failing on `.str` for an integer column.
    if col in df.columns and df[col].dtype == "object":
        # regex=False: "c-" is a literal marker, not a pattern; being explicit
        # avoids depending on the pandas-version-specific default.
        df[col] = df[col].str.replace("c-", "", regex=False).astype(int)

In [7]:
# Re-inspect the frame after cleaning to confirm the calendar columns are now integer-typed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Month              100000 non-null  int64 
 1   DayofMonth         100000 non-null  int64 
 2   DayOfWeek          100000 non-null  int64 
 3   DepTime            100000 non-null  int64 
 4   UniqueCarrier      100000 non-null  object
 5   Origin             100000 non-null  object
 6   Dest               100000 non-null  object
 7   Distance           100000 non-null  int64 
 8   dep_delayed_15min  100000 non-null  object
dtypes: int64(5), object(4)
memory usage: 6.9+ MB


Transform Categorical Values

In [8]:
# Names of every column whose dtype is still "object", in frame order.
categorical_columns = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == "object"
]

In [9]:
# Display the column names that were detected as object-typed.
categorical_columns

['UniqueCarrier', 'Origin', 'Dest', 'dep_delayed_15min']

In [10]:
# One record per encoded column, shaped as
# {"encoder_name": <column name>, "encoder": <fitted LabelEncoder>}.
# Annotated as a list of dicts because the encoding cell fills it via `.append(...)`.
encoder_list: List[Dict[str, Any]] = []

In [11]:
# Label-encode each categorical column in place and record the fitted encoder
# together with its column name (e.g. for a later inverse_transform).
for col in categorical_columns:
    # Only encode columns that still hold raw object values: re-running this
    # cell would otherwise refit on the integer codes and append duplicate
    # encoders that no longer know the original labels.
    if col not in df.columns or df[col].dtype != "object":
        continue
    le = LabelEncoder()
    # fit_transform learns the classes and encodes the column in one pass.
    df[col] = le.fit_transform(df[col])
    encoder_list.append({"encoder_name": col, "encoder": le})

In [12]:
# Separate the dep_delayed_15min column (df_y) from all remaining columns (df_X).
TARGET_COLUMN = "dep_delayed_15min"
df_y = df[TARGET_COLUMN]
df_X = df.drop(TARGET_COLUMN, axis="columns")

In [13]:
import wandb
from datetime import datetime

# Today's date as YYYY-MM-DD; it is embedded in the run's group name and notes.
today_date = datetime.today().strftime("%Y-%m-%d")
run = wandb.init(
    project="flight-delay-prediction",
    group=f"Experiment{today_date}",
    notes=f"Experiment created on {today_date}",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msharvin-kogilavanan[0m ([33msharvin-dev[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
# End the W&B run that was started by wandb.init above.
run.finish()

In [None]:
%%wandb