In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [5]:
# Load the dataset
df = pd.read_csv("flights_sample_3m.csv")

# Preview the first few rows
print(df.head())


      FL_DATE                AIRLINE                AIRLINE_DOT AIRLINE_CODE  \
0  2019-01-09  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
1  2022-11-19   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
2  2022-07-22  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
3  2023-03-06   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
4  2020-02-23       Spirit Air Lines       Spirit Air Lines: NK           NK   

   DOT_CODE  FL_NUMBER ORIGIN          ORIGIN_CITY DEST  \
0     19977       1562    FLL  Fort Lauderdale, FL  EWR   
1     19790       1149    MSP      Minneapolis, MN  SEA   
2     19977        459    DEN           Denver, CO  MSP   
3     19790       2295    MSP      Minneapolis, MN  SFO   
4     20416        407    MCO          Orlando, FL  DFW   

               DEST_CITY  ...  DIVERTED  CRS_ELAPSED_TIME  ELAPSED_TIME  \
0             Newark, NJ  ...       0.0             186.0         176.0   
1            S

In [6]:
# Check for missing values
print(df.isnull().sum())

# Fill missing delay values with 0 (assuming no delay)
delay_columns = ['DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 
                 'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT']
df[delay_columns] = df[delay_columns].fillna(0)

# Dropping rows with missing important features like 'ARR_DELAY'
df = df.dropna(subset=['ARR_DELAY'])

# Convert delay into a binary classification: delayed (1) or not delayed (0)
df['DELAYED'] = df['ARR_DELAY'].apply(lambda x: 1 if x > 15 else 0)

# Fill missing 'DEP_DELAY' with 0 for no departure delay info
df['DEP_DELAY'].fillna(0, inplace=True)

# Convert categorical columns to appropriate types
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df['AIRLINE'] = df['AIRLINE'].astype('category')
df['ORIGIN'] = df['ORIGIN'].astype('category')
df['DEST'] = df['DEST'].astype('category')


FL_DATE                          0
AIRLINE                          0
AIRLINE_DOT                      0
AIRLINE_CODE                     0
DOT_CODE                         0
FL_NUMBER                        0
ORIGIN                           0
ORIGIN_CITY                      0
DEST                             0
DEST_CITY                        0
CRS_DEP_TIME                     0
DEP_TIME                     77615
DEP_DELAY                    77644
TAXI_OUT                     78806
WHEELS_OFF                   78806
WHEELS_ON                    79944
TAXI_IN                      79944
CRS_ARR_TIME                     0
ARR_TIME                     79942
ARR_DELAY                    86198
CANCELLED                        0
CANCELLATION_CODE          2920860
DIVERTED                         0
CRS_ELAPSED_TIME                14
ELAPSED_TIME                 86198
AIR_TIME                     86198
DISTANCE                         0
DELAY_DUE_CARRIER          2466137
DELAY_DUE_WEATHER   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DEP_DELAY'].fillna(0, inplace=True)


In [7]:
# Create dummy variables for categorical columns
df = pd.get_dummies(df, columns=['AIRLINE', 'ORIGIN', 'DEST'], drop_first=True)

# Drop unnecessary columns like city names and flight times that aren't usable for modeling
df = df.drop(['FL_DATE', 'ORIGIN_CITY', 'DEST_CITY', 'CANCELLATION_CODE'], axis=1)

# Check the final structure of the dataset
print(df.head())


                 AIRLINE_DOT AIRLINE_CODE  DOT_CODE  FL_NUMBER  CRS_DEP_TIME  \
0  United Air Lines Inc.: UA           UA     19977       1562          1155   
1   Delta Air Lines Inc.: DL           DL     19790       1149          2120   
2  United Air Lines Inc.: UA           UA     19977        459           954   
3   Delta Air Lines Inc.: DL           DL     19790       2295          1609   
4       Spirit Air Lines: NK           NK     20416        407          1840   

   DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  WHEELS_ON  ...  DEST_VEL  \
0    1151.0       -4.0      19.0      1210.0     1443.0  ...     False   
1    2114.0       -6.0       9.0      2123.0     2232.0  ...     False   
2    1000.0        6.0      20.0      1020.0     1247.0  ...     False   
3    1608.0       -1.0      27.0      1635.0     1844.0  ...     False   
4    1838.0       -2.0      15.0      1853.0     2026.0  ...     False   

   DEST_VLD  DEST_VPS  DEST_WRG  DEST_WYS  DEST_XNA  DEST_XWA  DEST_YAK  \

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset from a CSV file
df = pd.read_csv("flights.csv")

# Preview the first few rows of the DataFrame
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Fill missing delay values with 0 (assuming no delay)
delay_columns = ['DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 
                 'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT']
df[delay_columns] = df[delay_columns].fillna(0)

# Dropping rows with missing important features like 'ARR_DELAY'
df = df.dropna(subset=['ARR_DELAY'])

# Convert delay into a binary classification: delayed (1) or not delayed (0)
df['DELAYED'] = df['ARR_DELAY'].apply(lambda x: 1 if x > 15 else 0)

# Fill missing 'DEP_DELAY' with 0 for no departure delay info
df['DEP_DELAY'].fillna(0, inplace=True)

# Convert categorical columns to appropriate types
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df['AIRLINE'] = df['AIRLINE'].astype('category')
df['ORIGIN'] = df['ORIGIN'].astype('category')
df['DEST'] = df['DEST'].astype('category')

# Create dummy variables for categorical columns
df = pd.get_dummies(df, columns=['AIRLINE', 'ORIGIN', 'DEST'], drop_first=True)

# Drop unnecessary columns like city names and flight times that aren't usable for modeling
df = df.drop(['FL_DATE', 'ORIGIN_CITY', 'DEST_CITY', 'CANCELLATION_CODE'], axis=1)

# Check the final structure of the dataset
print(df.head())

# Features and target variable
X = df.drop(['ARR_DELAY', 'DELAYED'], axis=1)  # Features
y = df['DELAYED']  # Target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Check for any non-numeric columns in X_train
print(X_train.dtypes)

# Handle any missing values if present
X_train = X_train.fillna(0)    # Fill missing values (or use an appropriate strategy)

# Standardize the features (make sure they're numeric)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Plotting feature importance
importances = rf.feature_importances_
features = X.columns
indices = np.argsort(importances)[::-1]

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importance")
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), features[indices], rotation=90)
plt.show()


      FL_DATE                AIRLINE                AIRLINE_DOT AIRLINE_CODE  \
0  2019-01-09  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
1  2022-11-19   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
2  2022-07-22  United Air Lines Inc.  United Air Lines Inc.: UA           UA   
3  2023-03-06   Delta Air Lines Inc.   Delta Air Lines Inc.: DL           DL   
4  2020-02-23       Spirit Air Lines       Spirit Air Lines: NK           NK   

   DOT_CODE  FL_NUMBER ORIGIN          ORIGIN_CITY DEST  \
0     19977       1562    FLL  Fort Lauderdale, FL  EWR   
1     19790       1149    MSP      Minneapolis, MN  SEA   
2     19977        459    DEN           Denver, CO  MSP   
3     19790       2295    MSP      Minneapolis, MN  SFO   
4     20416        407    MCO          Orlando, FL  DFW   

               DEST_CITY  ...  DIVERTED  CRS_ELAPSED_TIME  ELAPSED_TIME  \
0             Newark, NJ  ...       0.0             186.0         176.0   
1            S

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DEP_DELAY'].fillna(0, inplace=True)


                 AIRLINE_DOT AIRLINE_CODE  DOT_CODE  FL_NUMBER  CRS_DEP_TIME  \
0  United Air Lines Inc.: UA           UA     19977       1562          1155   
1   Delta Air Lines Inc.: DL           DL     19790       1149          2120   
2  United Air Lines Inc.: UA           UA     19977        459           954   
3   Delta Air Lines Inc.: DL           DL     19790       2295          1609   
4       Spirit Air Lines: NK           NK     20416        407          1840   

   DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  WHEELS_ON  ...  DEST_VEL  \
0    1151.0       -4.0      19.0      1210.0     1443.0  ...     False   
1    2114.0       -6.0       9.0      2123.0     2232.0  ...     False   
2    1000.0        6.0      20.0      1020.0     1247.0  ...     False   
3    1608.0       -1.0      27.0      1635.0     1844.0  ...     False   
4    1838.0       -2.0      15.0      1853.0     2026.0  ...     False   

   DEST_VLD  DEST_VPS  DEST_WRG  DEST_WYS  DEST_XNA  DEST_XWA  DEST_YAK  \

ValueError: could not convert string to float: 'SkyWest Airlines Inc.: OO'

In [4]:
from sklearn.impute import SimpleImputer

# Convert y_train (a pandas Series) to a NumPy array and reshape it
y_train_array = y_train.values.reshape(-1, 1)  # or y_train.to_numpy().reshape(-1, 1)

# Create an imputer to fill missing values in y_train
y_imputer = SimpleImputer(strategy='most_frequent')  # Or 'mean', 'median', etc.
y_train_imputed = y_imputer.fit_transform(y_train_array).ravel()


In [6]:
# Remove rows where y_train is NaN
import numpy as np
mask = ~np.isnan(y_train)
X_train = X_train[mask]
y_train = y_train[mask]


In [13]:
# Sample 10% of the data
df_sampled = df.sample(frac=0.1, random_state=42)  # Adjust the fraction as needed


In [18]:
import pandas as pd

# Load your dataset into a DataFrame (replace with your file path)
# df = pd.read_csv('your_file.csv')  # Uncomment and set your CSV file path

# Example DataFrame creation for demonstration
data = {
    'FL_DATE': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'AIRLINE': ['Delta', 'United', 'Southwest'],
    'DEP_DELAY': [10, None, 5],
    'ARR_DELAY': [None, 15, 5],
    'DISTANCE': [500, 600, 700],
    'CANCELLED': [0, 1, 0]
}
df = pd.DataFrame(data)

# Check available columns
print("Columns in DataFrame:")
print(df.columns.tolist())

# Strip spaces from column names (if any)
df.columns = df.columns.str.strip()

# Verify if the target column exists in the DataFrame
target_column = 'ARR_DELAY'  # Change as necessary
if target_column not in df.columns:
    raise KeyError(f"Target column '{target_column}' not found in the DataFrame.")

# Separate features and target variable
X = df.drop(columns=[target_column])  # Features
y = df[target_column]                   # Target variable

# Impute missing values in y (if necessary)
from sklearn.impute import SimpleImputer

y_imputer = SimpleImputer(strategy='most_frequent')
y = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()

# Now you can proceed with your pipeline


Columns in DataFrame:
['FL_DATE', 'AIRLINE', 'DEP_DELAY', 'ARR_DELAY', 'DISTANCE', 'CANCELLED']


In [26]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Sample DataFrame creation for demonstration
data = {
    'FL_DATE': ['2024-01-01', '2024-01-02', '2024-01-03'],
    'AIRLINE': ['Delta', 'United', 'Southwest'],
    'DEP_DELAY': [10, None, 5],  # Example target variable
    'ARR_DELAY': [None, 15, 5],
    'DISTANCE': [500, 600, 700],
    'CANCELLED': [0, 1, 0]
}
df = pd.DataFrame(data)

# Check available columns
print("Columns in DataFrame:")
print(df.columns.tolist())

# Choose the target column (make sure it exists)
target_column = 'DEP_DELAY'  # Change this to 'ARR_DELAY' or any other column if needed

# Ensure the target column exists
if target_column not in df.columns:
    raise KeyError(f"Target column '{target_column}' not found in the DataFrame.")

# Separate features and target variable
X = df.drop(columns=[target_column])  # Features
y = df[target_column]                   # Target variable

# Impute missing values in y (if necessary)
y_imputer = SimpleImputer(strategy='mean')  # Change as needed
y = y_imputer.fit_transform(y.values.reshape(-1, 1)).ravel()

# Identify numeric and categorical columns
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# Print identified columns for verification
print("Numeric columns:", numeric_columns)
print("Categorical columns:", categorical_columns)

# Numeric features imputation and scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute numeric values
    ('scaler', StandardScaler())])  # Scale numeric values

# Categorical features (using sparse matrix)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=True))])  # One-hot encoding

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Define pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Train the model
pipeline.fit(X, y)

# To make predictions, you would typically use:
# y_pred = pipeline.predict(X_test)  # Assuming X_test is defined


Columns in DataFrame:
['FL_DATE', 'AIRLINE', 'DEP_DELAY', 'ARR_DELAY', 'DISTANCE', 'CANCELLED']
Numeric columns: ['ARR_DELAY', 'DISTANCE', 'CANCELLED']
Categorical columns: ['FL_DATE', 'AIRLINE']




ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.