

**Step 1: Load the dataset and drop TailNum**



In [None]:
import pandas as pd
import numpy as np

# Mount Google Drive at /content/drive so later cells can read files from it.
# NOTE: `google.colab` is supplied by the Colab runtime; this cell is expected
# to fail with ImportError when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Read the flight records from the mounted Drive folder into `k_df`, then
# preview the first rows as the cell output.
flight_csv_path = '/content/drive/MyDrive/Colab Notebooks/FlightCSV/Copy of FlightData.csv'
k_df = pd.read_csv(flight_csv_path)
k_df.head()

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts).
k_df.info()


In [None]:
# Remove the 'TailNum' column. errors='ignore' turns this into a no-op when the
# column is already gone, so the cell can be re-run safely.
columns_to_remove = ['TailNum']
k_df = k_df.drop(labels=columns_to_remove, axis='columns', errors='ignore')
k_df.head()


**Step 2: Define the target — flight delay**

In [None]:
# Build the binary 'Delayed' target from the arrival delay.

# Convert 'ArrDelay' to numeric; values that cannot be parsed become NaN.
k_df['ArrDelay'] = pd.to_numeric(k_df['ArrDelay'], errors='coerce')

# A row with no arrival delay has no known outcome. Comparing NaN with
# `> 15` evaluates to False, so such rows used to be labelled on-time (0)
# silently. Drop them instead of training on wrong labels, and rebuild a
# contiguous index so later positional / label-based alignment stays valid.
missing_arr_delay = k_df['ArrDelay'].isna()
print(f"Dropping {int(missing_arr_delay.sum())} rows with missing ArrDelay")
k_df = k_df.loc[~missing_arr_delay].reset_index(drop=True)

# Delayed = 1 when ArrDelay exceeds 15, else 0
# (presumably minutes — confirm the unit against the data dictionary).
# Vectorized comparison replaces the row-by-row lambda.
k_df['Delayed'] = (k_df['ArrDelay'] > 15).astype(int)

k_df.head()

**Step 3: Preprocess the Data**

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate the target from the features. 'ArrDelay' is removed together with
# 'Delayed' because the label is derived from it; keeping it would leak the
# answer into the features.
target_related_columns = ['ArrDelay', 'Delayed']
y = k_df['Delayed']
X = k_df.drop(columns=target_related_columns)

# Print the source frame's columns to confirm 'ArrDelay' and 'Delayed' exist.
print(k_df.columns)



In [None]:
# Rebuild the target and the feature matrix from k_df, leaving out the
# target-related columns to avoid leakage.
y = k_df['Delayed']
X = k_df.drop(columns=['ArrDelay', 'Delayed'])

# Names of the object-dtype (categorical) feature columns.
object_features = X.select_dtypes(include=['object'])
cat_cols = object_features.columns
print(cat_cols)


In [None]:
# Names of the int64 / float64 feature columns.
numeric_features = X.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_features.columns
print(num_cols)

In [None]:
# Drop high-cardinality categorical columns: those with more distinct values
# than MAX_CATEGORY_LEVELS (tune as needed).
MAX_CATEGORY_LEVELS = 100

# `cat_cols` was captured before any column was dropped, so on a re-run it
# still names columns that are no longer in X. The `col in X.columns` guard
# keeps this cell idempotent instead of raising KeyError.
high_card_cols = [
    col for col in cat_cols
    if col in X.columns and X[col].nunique() > MAX_CATEGORY_LEVELS
]

# Rebind instead of mutating in place so the cell has no hidden side effects.
X = X.drop(columns=high_card_cols)

In [None]:
# Integer-encode every remaining object column, keeping each fitted encoder
# (keyed by column name) in `label_encoders`.
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
object_column_names = list(X.select_dtypes(include='object').columns)
for column_name in object_column_names:
    encoder = LabelEncoder()
    # Values are cast to str before fitting, so every entry in the column is
    # encoded as a string category.
    column_as_text = X[column_name].astype(str)
    X[column_name] = encoder.fit_transform(column_as_text)
    label_encoders[column_name] = encoder

In [None]:
# Fill remaining missing values with the per-column mean (object columns were
# label-encoded in the previous step).
from sklearn.impute import SimpleImputer

# SimpleImputer discards columns that contain only missing values, which
# would make its output narrower than X.columns and break the DataFrame
# rebuild below. Drop such columns explicitly first so names and values
# always line up.
X = X.dropna(axis=1, how='all')

imputer = SimpleImputer(strategy='mean')

# Carry X's index over to the rebuilt frame so its rows stay label-aligned
# with `y` (the bare ndarray would otherwise get a fresh 0..n-1 index).
# NOTE(review): the imputer is fit on all rows before the train/test split,
# so test rows contribute to the means — consider fitting on X_train only.
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Train/test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each feature partition.
for feature_part in (X_train, X_test):
    print(feature_part.shape)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a gradient-boosting classifier (default hyperparameters, fixed seed) on
# the training split and predict the held-out split.
gbm = GradientBoostingClassifier(random_state=42)
gbm = gbm.fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = gbm.predict(X_test)

# Overall accuracy followed by the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
#Let's plot the distribution of the Delayed column to understand how many flights were delayed vs. on time.
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows of k_df fall into each 'Delayed' class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=k_df, x='Delayed', palette='Set2', ax=ax)
ax.set_title('Flight Delay Distribution')
ax.set_xlabel('Delayed (0 = No, 1 = Yes)')
ax.set_ylabel('Number of Flights')
# Replace the raw 0/1 tick labels with readable class names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['On Time', 'Delayed'])
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report

# Score the fitted model on the held-out split.
y_pred = gbm.predict(X_test)

# Headline binary-classification metrics.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the three headline numbers, one per line.
headline_lines = (
    f"Accuracy Score:  {accuracy:.4f}",
    f"Precision Score: {precision:.4f}",
    f"F1 Score:        {f1:.4f}",
)
for headline in headline_lines:
    print(headline)

# Full per-class breakdown.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair every feature name with the importance reported by the fitted model.
feature_names = X.columns
importances = gbm.feature_importances_

# Keep only the 15 most important features, largest first.
feature_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(15)
)

# Horizontal bar chart: one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_df, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Top 15 Important Features in Predicting Flight Delays')
fig.tight_layout()
plt.show()


In [None]:
# Preview the first rows of the arrival and departure delay columns.
k_df.loc[:, ['ArrDelay', 'DepDelay']].head()



In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the arrival
# and departure delay columns.
k_df.loc[:, ['ArrDelay', 'DepDelay']].describe()

In [None]:
# Print the two delay columns next to the derived 'Delayed' label.
# NOTE: print() renders the frame with pandas' display options, so a long
# frame is shown abbreviated rather than row by row.
delay_and_label = k_df.loc[:, ['ArrDelay', 'DepDelay', 'Delayed']]
print(delay_and_label)


In [None]:
#delays across specific years (1997,2002, 2005, 2006, 2007)
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the rows whose 'Year' is one of the selected years.
years_of_interest = [1997, 2002, 2005, 2006, 2007]
in_selected_years = k_df['Year'].isin(years_of_interest)
df_years = k_df[in_selected_years]

# Count flights for every (Year, Delayed) combination.
counts_per_group = df_years.groupby(['Year', 'Delayed']).size()
delay_by_year = counts_per_group.reset_index(name='Count')

# Grouped bar chart: one pair of bars (on-time / delayed) per year.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=delay_by_year, x='Year', y='Count', hue='Delayed', palette='Set1', ax=ax)
ax.set_title('Flight Delays (Delayed vs On-Time) for Selected Years')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Flights')
ax.legend(title='Delayed (0 = On-Time, 1 = Delayed)')
fig.tight_layout()
plt.show()


In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
missing_values = k_df.isna().sum()
print("Missing values per column:\n", missing_values)


In [None]:
# Show the dtype pandas assigned to each column.
column_dtypes = k_df.dtypes
print("Data types of each column:\n", column_dtypes)


In [None]:
# Flag rows that are exact duplicates of an earlier row and report how many.
duplicates = k_df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Remove the duplicates and rebuild a contiguous 0..n-1 index. Without the
# reset the index keeps gaps, and any later step that pairs k_df with a
# freshly indexed frame by label gets misaligned or NaN values.
# NOTE(review): X, y and the fitted model were built from k_df before this
# cell, so they still include the duplicate rows.
k_df = k_df.drop_duplicates().reset_index(drop=True)


In [None]:
# Print summary statistics for the arrival and departure delay columns.
delay_headings = (
    ("Arrival Delay statistics:\n", 'ArrDelay'),
    ("Departure Delay statistics:\n", 'DepDelay'),
)
for heading, delay_column in delay_headings:
    print(heading, k_df[delay_column].describe())


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Predict the held-out split and tabulate true vs. predicted classes.
y_pred = gbm.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Render the matrix as a heat map with readable class labels.
class_labels = ['On Time (0)', 'Delayed (1)']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix - GBM Model")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Predicted probability of the positive class (Delayed = 1): column 1 of
# predict_proba's output.
y_probs = gbm.predict_proba(X_test)[:, 1]

# ROC points (false/true positive rate per threshold) and the area under them.
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Model curve plus the diagonal that a random classifier would trace.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'GBM ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--', label='Random Guess')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('ROC Curve - GBM Model')
ax.legend(loc="lower right")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Predicted probability of class 1 (Delayed): column 1 of predict_proba.
y_probs = gbm.predict_proba(X_test)[:, 1]

# Sweep 100 evenly spaced decision thresholds over [0, 1].
thresholds = np.linspace(0, 1, 100)
false_positives = []
false_negatives = []

for thresh in thresholds:
    # A sample is predicted "Delayed" when its probability reaches the threshold.
    y_pred_thresh = (y_probs >= thresh).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 even when only one class appears
    # among the true and predicted values at this threshold; without it the
    # matrix can collapse to 1x1 and the four-way unpacking below would raise.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1]).ravel()
    false_positives.append(fp)
    false_negatives.append(fn)

# Plot both error counts against the threshold to show the trade-off.
plt.figure(figsize=(10, 6))
plt.plot(thresholds, false_positives, label='False Positives', color='red')
plt.plot(thresholds, false_negatives, label='False Negatives', color='blue')
plt.title('False Positives and False Negatives vs Classification Threshold')
plt.xlabel('Threshold')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import precision_score, recall_score

# Sweep 100 evenly spaced decision thresholds over [0, 1] and record the
# precision and recall of the thresholded predictions at each one.
# Uses `y_probs` (positive-class probabilities) from the previous cell.
thresholds = np.linspace(0, 1, 100)

scores_per_threshold = []
for cutoff in thresholds:
    preds_at_cutoff = (y_probs >= cutoff).astype(int)
    scores_per_threshold.append((
        # zero_division=0: report 0 when nothing is predicted positive.
        precision_score(y_test, preds_at_cutoff, zero_division=0),
        recall_score(y_test, preds_at_cutoff),
    ))

precision_list = [pair[0] for pair in scores_per_threshold]
recall_list = [pair[1] for pair in scores_per_threshold]

# Both curves on one axis to show the precision/recall trade-off.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresholds, precision_list, label='Precision', color='green')
ax.plot(thresholds, recall_list, label='Recall', color='purple')
ax.set_title('Precision and Recall vs Classification Threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart comparing classification metrics across models.

# Model names, in the order the scores below are listed.
models = ["Random Forest", "SGD Classifier", "Logistic Regression", "XGBoost", "Gradient Boosting"]

# Scores per metric, one value per model. They are kept in a dict rather than
# in module-level lists named `accuracy`, `precision`, `recall` and
# `f1_score`: those names would overwrite the scalar metrics computed earlier
# and shadow sklearn's `f1_score` function for every later cell.
# NOTE(review): these numbers are hard-coded, not computed in this notebook —
# record where they came from. Also verify the Gradient Boosting row: if the
# values are binary-class metrics, precision 0.82 with recall 0.50 gives an
# F1 of about 0.62, not 0.72.
model_scores = {
    'Accuracy':  [0.70, 0.71, 0.72, 0.91, 0.81],
    'Precision': [0.70, 0.72, 0.72, 0.91, 0.82],
    'Recall':    [0.70, 0.71, 0.72, 0.91, 0.50],
    'F1 Score':  [0.70, 0.71, 0.72, 0.91, 0.72],
}

# One group of bars per model, centred on its x position.
x = np.arange(len(models))
width = 0.2  # Width of each bar

plt.figure(figsize=(12, 6))
for position, (metric_name, scores) in enumerate(model_scores.items()):
    # Offsets of -1.5, -0.5, +0.5 and +1.5 bar widths centre the four bars.
    offset = (position - 1.5) * width
    plt.bar(x + offset, scores, width, label=metric_name)

# Labels and title
plt.xlabel('Models')
plt.ylabel('Scores')
plt.title('Model Comparison Based on Classification Metrics')
plt.xticks(x, models, rotation=15)
plt.ylim(0, 1.05)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()


In [None]:
import pandas as pd
import numpy as np


# Step 4: Extract time-based features if 'CRSDepTime' exists
if 'CRSDepTime' in k_df.columns:
    k_df['CRSDepTime'] = pd.to_numeric(k_df['CRSDepTime'], errors='coerce')
    # Integer-divide / take the remainder by 100 to split the value in two
    # (presumably an HHMM clock time — confirm against the data dictionary).
    k_df['DepHour'] = k_df['CRSDepTime'] // 100
    k_df['DepMinute'] = k_df['CRSDepTime'] % 100

# Step 5: Integer-encode every object (categorical) column of k_df
from sklearn.preprocessing import LabelEncoder
for col in k_df.select_dtypes(include='object').columns:
    # Values are cast to str first so each column is encoded as strings.
    k_df[col] = LabelEncoder().fit_transform(k_df[col].astype(str))

# Step 6: Fill missing numeric values with the column mean
k_df = k_df.fillna(k_df.mean(numeric_only=True))

# Step 7: Standardize the features (optional, for model input)
from sklearn.preprocessing import StandardScaler
# NOTE(review): 'ArrDelay' is still among these features although the label
# is derived from it — drop it as well if df_scaled is used to train a model.
features = k_df.drop(columns=['Delayed'])  # exclude label
scaler = StandardScaler()
# Reuse the features' index. The scaler returns a bare ndarray, so without
# `index=` the new frame gets a fresh 0..n-1 index while k_df may have gaps
# (e.g. after drop_duplicates); the label assignment below aligns by index
# and would then attach labels to the wrong rows or fill them with NaN.
df_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
df_scaled['Delayed'] = k_df['Delayed']  # re-add the label (index-aligned)

# Step 8: Show 10 reproducibly sampled rows
sample_10 = df_scaled.sample(10, random_state=42)
print(sample_10)
