# Airline Profit Prediction Model Development

This notebook demonstrates the development of the airline profit prediction model.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)

In [2]:
# Load and prepare data
def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        print("Data loaded successfully")
        return df
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None

# Load the dataset
df = load_data('../data/raw/airline_data.csv')
df.head()

Data loaded successfully


Unnamed: 0,Revenue,Operating_Cost,Load_Factor,Aircraft_Utilization,Maintenance_Downtime,Fleet_Availability,Profit


In [3]:
# Data preprocessing
def preprocess_data(df):
    # Handle missing values
    df = df.fillna(df.mean())
    
    # Feature engineering
    df['Profit_Margin'] = (df['Revenue'] - df['Operating_Cost']) / df['Revenue']
    
    return df

# Preprocess the data
df_processed = preprocess_data(df)
print("Data preprocessing completed")

Data preprocessing completed


In [4]:
# Exploratory Data Analysis
def plot_correlations(df):
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    plt.title('Feature Correlations')
    plt.show()

plot_correlations(df_processed)

In [5]:
# Model Development
def train_model(X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return model, mse, r2, X_train, X_test, y_train, y_test

# Prepare features and target
features = ['Revenue', 'Operating_Cost', 'Load_Factor', 'Aircraft_Utilization',
           'Maintenance_Downtime', 'Fleet_Availability']
X = df_processed[features]
y = df_processed['Profit']

# Train the model
model, mse, r2, X_train, X_test, y_train, y_test = train_model(X, y)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Squared Error: 0.00
R² Score: 0.95


In [6]:
# Feature Importance Analysis
def plot_feature_importance(model, X):
    importance = model.feature_importances_
    features = X.columns
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x=importance, y=features)
    plt.title('Feature Importance')
    plt.xlabel('Importance Score')
    plt.show()

plot_feature_importance(model, X)

In [7]:
# Model Validation
def plot_predictions(y_test, y_pred):
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual Profit')
    plt.ylabel('Predicted Profit')
    plt.title('Actual vs Predicted Profit')
    plt.show()

y_pred = model.predict(X_test)
plot_predictions(y_test, y_pred)