In [None]:
'''
INFO_511_ Application Exercise 04: NYC Flights
Author: Todd Adams
Date: 04/06/2024
Description: We are answering questions related to the NYC Flights dataset.
Note: I used VS Code and ChatGPT to help me write this code.
'''


**Load Libraries**

In [None]:
# Load libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
import numpy as np
from nycflights13 import flights


**Exercise 1 - Load Data**

In [None]:
# Load the flights dataset from the local CSV export.
# NOTE(review): this rebinds `flights`, shadowing the object of the same name
# imported from `nycflights13` in the imports cell.
flights = pd.read_csv("data/flights.csv")

# Show the first rows, the column/dtype summary, and the numeric summary.
# A notebook only renders the LAST bare expression of a cell, so the frames
# are passed to display() (provided by the IPython kernel) to make sure they
# actually appear; info() prints on its own.
display(flights.head())
flights.info()
display(flights.describe())

# Working copy of the raw frame.
# NOTE(review): `flights_df` is not used by the cells visible here; it is kept
# in case a later cell relies on it — confirm and remove if unused.
flights_df = flights.copy()

# Number of rows
num_rows = len(flights)
print(f"The flights dataset has {num_rows} rows.")


The `flights` data frame has `336776` rows. Each row represents a `single flight departing NYC in 2013`.


**Exercise 2 - Data Cleaning**

In [None]:
# Keep only the flights that have both an arrival delay and a distance
# recorded; `flights` itself is left untouched so this cell can be re-run.
flights_clean = flights.dropna(subset=['arr_delay', 'distance'])

# Sanity check: the two analysis columns must be complete after cleaning.
assert flights_clean[['arr_delay', 'distance']].notna().all().all(), \
    "arr_delay/distance still contain missing values after dropna"

# Display how many rows were originally in the dataset and how many are left after cleaning
print(f"Original rows: {len(flights)}")
print(f"Cleaned rows: {len(flights_clean)}")

# View column names (last expression of the cell, so it is rendered as output)
flights_clean.columns.tolist()


**Exercise 3 - Original Data Distribution**

In [None]:
# Use seaborn's whitegrid look for the figures drawn below
sns.set(style="whitegrid")

# One row, two panels: arrival delay on the left, distance on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Per-panel settings: (column, number of bins, bar colour, title, x-axis label)
panel_specs = [
    ("arr_delay", 100, 'skyblue', "Arrival Delay Distribution", "Arrival Delay (minutes)"),
    ("distance", 50, 'salmon', "Flight Distance Distribution", "Distance (miles)"),
]

# Draw each histogram on its own axis and label it so the figure stands alone
for ax, (column, n_bins, colour, title, x_label) in zip(axes, panel_specs):
    sns.histplot(data=flights, x=column, bins=n_bins, kde=False, ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Count")

plt.tight_layout()
plt.show()


**Exercise 4 - Check for Skewness**

In [None]:
# Skewness of each variable, computed on the loaded `flights` frame
arr_delay_skew, distance_skew = (
    flights[column].skew() for column in ("arr_delay", "distance")
)

# Report both values rounded to two decimals
for label, skew_value in (("arr_delay", arr_delay_skew), ("distance", distance_skew)):
    print(f"Skewness of {label}: {skew_value:.2f}")


**Exercise 5 - Scaling**

In [None]:
# Summary statistics to check for scale differences.
# display() (provided by the IPython kernel) is required here: a notebook only
# renders a cell's last bare expression, and this one sits mid-cell.
display(flights_clean[['arr_delay', 'distance']].describe())

'''
Yes, both `arr_delay` and `distance` need to be scaled. While they are both numeric, they are measured on **very different scales**:  
`arr_delay` ranges in minutes and includes negative/positive values, while `distance` can span from short to long-haul flights (e.g., 100–3000+ miles).  
Scaling ensures that models or plots using both features give them **equal weight** and aren’t biased by magnitude.  
'''

# Work on a copy so `flights_clean` is not modified and the cell stays re-runnable
df_clean = flights_clean.copy()

# Initialize scalers
scaler_standard = StandardScaler()
scaler_maxabs = MaxAbsScaler()
scaler_minmax = MinMaxScaler()

# Column-name suffix -> scaler that produces it
scalers_by_suffix = {
    'standard': scaler_standard,
    'maxabs': scaler_maxabs,
    'minmax': scaler_minmax,
}

# Fit and transform each variable with each scaler, creating
# `<column>_<suffix>` columns. fit_transform() refits the scaler on every call,
# so reusing one scaler object for both columns is safe. It returns an
# (n_rows, 1) array, which ravel() flattens before the column assignment.
for suffix, scaler in scalers_by_suffix.items():
    for column in ('arr_delay', 'distance'):
        df_clean[f'{column}_{suffix}'] = scaler.fit_transform(df_clean[[column]]).ravel()

# View the results (display() because the answer string below is the cell's last expression)
display(df_clean[['arr_delay_standard', 'distance_standard', 'arr_delay_maxabs', 'distance_maxabs', 'arr_delay_minmax', 'distance_minmax']].describe())

'''
What are two pros and two cons of standardizing data?
**Pros:**
1. Standardization ensures variables contribute equally to distance-based algorithms (like KNN, clustering).
2. It helps improve the performance and convergence of gradient-based models (like linear regression, logistic regression, etc.).

**Cons:**
1. It may obscure the original scale, making interpretation of coefficients and outputs less intuitive.
2. If applied improperly (e.g., on categorical or already-normalized data), it can distort relationships and reduce model performance.
'''

**Exercise 6 - Transformation**

In [None]:
# Check summary statistics again for scaled variables.
# display() (provided by the IPython kernel) is required here: a notebook only
# renders a cell's last bare expression, and this one sits mid-cell.
display(df_clean[['arr_delay_minmax', 'distance_minmax']].describe())

'''
 Why should you use the min-max scaled data instead of a different scaling for the transformations?  
 
 Min-Max scaling ensures all values fall within a **[0, 1]** range. This is important because:
- Log and square root transformations are **sensitive to negative values**, which could cause math errors.
- Min-Max scaling guarantees non-negative values, which is **required** for log and sqrt to work correctly.
- It also preserves the relationships between values, making it suitable for algorithms sensitive to the scale of data.
- This is especially important for algorithms like KNN, SVM, and neural networks that rely on distance metrics.
'''

# Check skewness first (computed on the unscaled columns)
arr_skew = df_clean['arr_delay'].skew()
dist_skew = df_clean['distance'].skew()

# Apply transformations based on skew direction.
# Both log and square root compress the right tail, so they are only applied
# when the variable is positively (right) skewed.
if arr_skew > 0:
    # Positive skew: log transformation; log1p computes log(1 + x), so x = 0 is safe
    df_clean['arr_delay_log'] = np.log1p(df_clean['arr_delay_minmax'])  # log(1 + x)
else:
    df_clean['arr_delay_log'] = df_clean['arr_delay_minmax']  # no transformation needed

if dist_skew > 0:
    # Positive skew: sqrt transformation (the original `< 0` test skipped the
    # transform for right-skewed data, the case sqrt is meant to correct)
    df_clean['distance_sqrt'] = np.sqrt(df_clean['distance_minmax'])
else:
    df_clean['distance_sqrt'] = df_clean['distance_minmax']  # no transformation needed

'''
 Why do we have to add a constant when we perform a log or square-root transformation (i.e., np.log1p(df['column' + 1]))?

We add a constant (usually 1) because:
- You **can’t take the log or square root of 0 or negative numbers** — it’s mathematically undefined.
- Adding 1 (as in `np.log1p(x)` or `np.sqrt(x + 1)`) shifts all values just enough to avoid errors while keeping the relative scale of the data.
- It also ensures that the transformation is **smooth** and **continuous**,  
which is important for many statistical methods and machine learning algorithms.
'''

