In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Data loading: read the training and test CSV files from the working directory
data_train = pd.read_csv("Data_Train.csv")
test_set = pd.read_csv("Test_set.csv")

# All preprocessing happens on a copy so the raw training frame stays untouched
df = data_train.copy()

# Data exploration: descriptive statistics for the numerical columns
summary_stats = df.describe()
print(summary_stats)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so wrapping it in print() would only add a
# stray "None" line after the report.
df.info()

In [None]:
# Number of missing (null) entries in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Categorical columns: how often each distinct value of 'Airline' occurs
airline_counts = df['Airline'].value_counts()
print(airline_counts)

In [None]:
# How often each distinct value of 'Source' occurs
source_counts = df['Source'].value_counts()
print(source_counts)

In [None]:
# How often each distinct value of 'Destination' occurs
destination_counts = df['Destination'].value_counts()
print(destination_counts)

In [None]:
# Data cleaning: parse 'Date_of_Journey' (day/month/year text) into datetimes
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')

# Build the full departure timestamp by prefixing the 'HH:MM' departure time
# with the journey date (the flight is assumed to leave on Date_of_Journey)
journey_date_text = df['Date_of_Journey'].astype(str)
departure_text = journey_date_text + " " + df['Dep_Time']
df['Dep_Time'] = pd.to_datetime(departure_text, format='%Y-%m-%d %H:%M')

# dateutil's parser is the fallback for the inconsistent 'Arrival_Time' formats
from dateutil import parser

def process_arrival_time(row):
    """Parse one row's raw 'Arrival_Time' text into a pandas Timestamp.

    Three layouts are tried in order:

    1. a full ``dd/mm/YYYY HH:MM`` datetime;
    2. a bare ``HH:MM`` time, assumed to fall on the row's 'Date_of_Journey';
    3. anything dateutil can read (e.g. ``"HH:MM dd Mon"``): month, day, hour
       and minute come from the parse, the year from 'Date_of_Journey'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Arrival_Time' (text) and 'Date_of_Journey' (already
        converted to a datetime).

    Returns
    -------
    pandas.Timestamp
        The parsed arrival time, or ``pandas.NaT`` when 'Arrival_Time' is
        missing (None/NaN).

    Raises
    ------
    ValueError
        If the text matches none of the supported layouts.
    """
    # Imported locally so the helper is usable on its own, independent of
    # where in the notebook the module-level import happens to live.
    from dateutil import parser as dateutil_parser

    raw_arrival = row['Arrival_Time']
    if pd.isna(raw_arrival):
        # A missing value has no .strip(); propagate it as a missing timestamp.
        return pd.NaT

    arrival = raw_arrival.strip()
    journey_date = row['Date_of_Journey']

    try:
        # Layout 1: full datetime
        return pd.to_datetime(arrival, format='%d/%m/%Y %H:%M')
    except ValueError:
        pass  # not this layout, try the next one

    try:
        # Layout 2: time only, assume same day as departure
        return pd.to_datetime(f"{journey_date.strftime('%Y-%m-%d')} {arrival}", format='%Y-%m-%d %H:%M')
    except ValueError:
        pass  # not this layout, try the next one

    try:
        # Layout 3: free-form text such as "HH:MM dd Mon". Fields absent from
        # the text are filled from `default`; using the journey date (instead
        # of dateutil's implicit "today") keeps e.g. "29 Feb" parseable when
        # the journey year is a leap year but the current year is not.
        journey_midnight = pd.Timestamp(
            year=journey_date.year, month=journey_date.month, day=journey_date.day
        ).to_pydatetime()
        parsed_dt = dateutil_parser.parse(arrival, dayfirst=True, default=journey_midnight)
        return pd.Timestamp(
            year=journey_date.year,
            month=parsed_dt.month,
            day=parsed_dt.day,
            hour=parsed_dt.hour,
            minute=parsed_dt.minute,
        )
    except (ValueError, OverflowError) as exc:
        raise ValueError(f"Unrecognized format in Arrival_Time: {arrival}") from exc

# Parse every raw arrival value into a timestamp (row-wise, since the helper
# also needs the row's Date_of_Journey)
df['Arrival_Time'] = df.apply(process_arrival_time, axis=1)


# Feature engineering: split the journey date and both clock times into
# plain numeric parts
journey_date = df['Date_of_Journey']
departure = df['Dep_Time']
arrival = df['Arrival_Time']

df['Journey_Day'] = journey_date.dt.day
df['Journey_Month'] = journey_date.dt.month
df['Dep_Hour'] = departure.dt.hour
df['Dep_Minute'] = departure.dt.minute
df['Arrival_Hour'] = arrival.dt.hour
df['Arrival_Minute'] = arrival.dt.minute

# The datetime columns are no longer needed once their parts are extracted
df = df.drop(columns=['Date_of_Journey', 'Dep_Time', 'Arrival_Time'])

# Turn the 'Duration' text into total minutes: pull the number before 'h'
# and the number before 'm', treating a missing part as 0
duration_text = df['Duration']
hours_part = duration_text.str.extract(r'(\d+)h', expand=False).astype(float).fillna(0)
minutes_part = duration_text.str.extract(r'(\d+)m', expand=False).astype(float).fillna(0)
df['Total_Duration_Minutes'] = hours_part * 60 + minutes_part
df = df.drop(columns=['Duration'])
df

In [None]:
# Columns that get one-hot encoded
categorical_features = [
    'Airline',
    'Source',
    'Destination',
    'Total_Stops',
]

# Columns that get standardised (all produced by the feature-engineering step)
numerical_features = [
    'Journey_Day',
    'Journey_Month',
    'Dep_Hour',
    'Dep_Minute',
    'Arrival_Hour',
    'Arrival_Minute',
    'Total_Duration_Minutes',
]

In [None]:
# Data visualization: histogram of 'Total_Duration_Minutes' with a KDE overlay
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['Total_Duration_Minutes'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Total Duration in Minutes')
plt.show()

In [None]:
# Pairwise correlations between the numerical features, drawn as an
# annotated heatmap to spot related columns
correlation_matrix = df[numerical_features].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Scatter-matrix of a few key features together with the target 'Price'
pairplot_columns = ['Journey_Day', 'Journey_Month', 'Total_Duration_Minutes', 'Price']
sns.pairplot(df, vars=pairplot_columns, kind='scatter')
plt.show()

In [None]:
# Price distribution per airline, to see whether the carrier affects the fare
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Airline', y='Price', ax=ax)
plt.xticks(rotation=45)  # tilt the airline names so they do not overlap
ax.set_title('Price Distribution Across Airlines')
plt.show()

In [None]:
# Relationship between a numerical feature and the target: duration vs price
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='Total_Duration_Minutes', y='Price', ax=ax)
ax.set_title('Price vs Total Duration')
plt.show()

In [None]:
# Data preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones (handle_unknown='ignore' avoids errors on unseen categories)
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Separate the target (y) from the input columns (X)
y = df['Price']
X = df.drop(columns=['Price'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Model training: linear regression on top of the preprocessing step
linear_regressor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])
linear_regressor.fit(X_train, y_train)

# Prediction and evaluation on the held-out split
linear_predictions = linear_regressor.predict(X_test)
linear_rmse = np.sqrt(mean_squared_error(y_test, linear_predictions))
linear_r2 = r2_score(y_test, linear_predictions)
linear_mape = mean_absolute_percentage_error(y_test, linear_predictions)

print("Linear Regression Performance:")
print("RMSE:", linear_rmse)
print("R² Score:", linear_r2)
print("MAPE:", linear_mape)

In [None]:
# Random forest (100 trees, fixed seed) on top of the preprocessing step
rf_regressor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])
rf_regressor.fit(X_train, y_train)

# Prediction and evaluation on the held-out split
rf_predictions = rf_regressor.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))
rf_r2 = r2_score(y_test, rf_predictions)
rf_mape = mean_absolute_percentage_error(y_test, rf_predictions)

print("Random Forest Performance:")
print("RMSE:", rf_rmse)
print("R² Score:", rf_r2)
print("MAPE:", rf_mape)

In [None]:
# XGBoost (squared-error objective, 100 rounds, fixed seed) on top of the
# preprocessing step
xgb_regressor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)),
])
xgb_regressor.fit(X_train, y_train)

# Prediction and evaluation on the held-out split
xgb_predictions = xgb_regressor.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_predictions))
xgb_r2 = r2_score(y_test, xgb_predictions)
xgb_mape = mean_absolute_percentage_error(y_test, xgb_predictions)

print("XGBoost Performance:")
print("RMSE:", xgb_rmse)
print("R² Score:", xgb_r2)
print("MAPE:", xgb_mape)

In [None]:
# Neural Network Model
def build_nn(input_shape):
    """Build and compile a small fully connected regression network.

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.

    Returns
    -------
    A compiled Keras ``Sequential`` model with two ReLU hidden layers
    (64 and 32 units) and one linear output unit, using the Adam optimizer,
    MSE loss and MAE as an extra metric.
    """
    layers = [
        Dense(64, activation='relu', input_shape=(input_shape,)),
        Dense(32, activation='relu'),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return network

# Fit the preprocessing on the training split only, then apply it to the test split
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# ColumnTransformer returns a SciPy sparse matrix when its stacked output is
# sparse enough (the one-hot block makes that possible here). Keras'
# validation_split expects array input, so densify whenever that happens.
if hasattr(X_train_transformed, 'toarray'):
    X_train_transformed = X_train_transformed.toarray()
if hasattr(X_test_transformed, 'toarray'):
    X_test_transformed = X_test_transformed.toarray()

# Train for 50 epochs, holding back 20% of the training rows for validation
nn_model = build_nn(X_train_transformed.shape[1])
nn_model.fit(X_train_transformed, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# predict() yields one column per output unit, i.e. shape (n_samples, 1);
# flatten it so it is 1-D like y_test and the other models' predictions.
nn_predictions = nn_model.predict(X_test_transformed).ravel()
print("Neural Network Performance:")
print("RMSE:", np.sqrt(mean_squared_error(y_test, nn_predictions)))
print("R² Score:", r2_score(y_test, nn_predictions))
print("MAPE:", mean_absolute_percentage_error(y_test, nn_predictions))

In [None]:
from sklearn.metrics import mean_absolute_error

# Test-split predictions of every fitted model, keyed by the label shown in
# the results table (insertion order = row order)
predictions_by_model = {
    'Linear Regression': linear_predictions,
    'Random Forest': rf_predictions,
    'XGBoost': xgb_predictions,
    'Neural Network': nn_predictions,
}

# Creating a dictionary to store the results, one list entry per model.
# 'Accuracy' is not a classification accuracy; it is a relative-error score:
#     Accuracy = 1 - (Mean Absolute Error / Mean of Actual Values)
mean_actual_price = y_test.mean()
results = {'Model': [], 'RMSE': [], 'R² Score': [], 'MAPE': [], 'Accuracy': []}
for model_name, model_predictions in predictions_by_model.items():
    # Flatten so column-shaped (n, 1) predictions are scored like 1-D ones
    model_predictions = np.ravel(model_predictions)
    results['Model'].append(model_name)
    results['RMSE'].append(np.sqrt(mean_squared_error(y_test, model_predictions)))
    results['R² Score'].append(r2_score(y_test, model_predictions))
    results['MAPE'].append(mean_absolute_percentage_error(y_test, model_predictions))
    results['Accuracy'].append(
        1 - (mean_absolute_error(y_test, model_predictions) / mean_actual_price)
    )

# Creating a DataFrame from the dictionary
metrics_df = pd.DataFrame(results)

# Displaying the results table
print(metrics_df)