#Importing Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#Load dataset

In [2]:
data = pd.read_csv('synthetic_flight_data.csv')

In [3]:
data.head()

Unnamed: 0,Flight_ID,Origin_Airport,Destination_Airport,Flight_Duration,Flight_Date,Flight_Time,Passengers_Booked,No_show,Weather_Conditions,Special_Event,NonVeg_Meal,Veg_Meal,Jain_Meal,Meals_Loaded,Meals_Wasted
0,FLIGHT_077,LAX,FRA,11,2023-10-02,Night,260,7,Good,No,130,78,52,260,7
1,FLIGHT_019,SYD,JFK,5,2022-02-02,Afternoon,201,9,Good,No,100,60,40,200,9
2,FLIGHT_022,SIN,SYD,2,2023-11-19,Morning,197,9,Good,Yes,98,59,39,196,9
3,FLIGHT_033,DXB,JFK,12,2023-04-05,Morning,263,7,Good,No,131,78,52,261,7
4,FLIGHT_095,SIN,FRA,14,2023-04-15,Evening,60,13,Bad,No,30,18,12,60,19


#Feature Engineering

In [4]:
# Converting 'Weather_Conditions' to a binary variable: 1 for bad, 0 for good
data['Weather_Conditions'] = data['Weather_Conditions'].apply(lambda x: 1 if x == 'Bad' else 0)

# Encoding 'Flight_Time' as binary: 1 for odd hours (Early Morning, Night), 0 for other times
data['Odd_Hours'] = data['Flight_Time'].apply(lambda x: 1 if x in ['Early Hours', 'Night'] else 0)

In [5]:
data

Unnamed: 0,Flight_ID,Origin_Airport,Destination_Airport,Flight_Duration,Flight_Date,Flight_Time,Passengers_Booked,No_show,Weather_Conditions,Special_Event,NonVeg_Meal,Veg_Meal,Jain_Meal,Meals_Loaded,Meals_Wasted,Odd_Hours
0,FLIGHT_077,LAX,FRA,11,2023-10-02,Night,260,7,0,No,130,78,52,260,7,1
1,FLIGHT_019,SYD,JFK,5,2022-02-02,Afternoon,201,9,0,No,100,60,40,200,9,0
2,FLIGHT_022,SIN,SYD,2,2023-11-19,Morning,197,9,0,Yes,98,59,39,196,9,0
3,FLIGHT_033,DXB,JFK,12,2023-04-05,Morning,263,7,0,No,131,78,52,261,7,0
4,FLIGHT_095,SIN,FRA,14,2023-04-15,Evening,60,13,1,No,30,18,12,60,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,FLIGHT_034,JFK,LHR,12,2022-04-27,Night,172,20,1,Yes,86,51,34,171,30,1
99996,FLIGHT_002,FRA,SYD,4,2022-03-14,Evening,127,12,1,Yes,63,38,25,126,18,0
99997,FLIGHT_002,LHR,SYD,7,2022-05-27,Afternoon,194,19,1,Yes,97,58,38,193,28,0
99998,FLIGHT_036,YYZ,LAX,2,2022-05-04,Morning,58,10,0,No,29,17,11,57,10,0


In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [7]:
data['Special_Event'] = le.fit_transform(data['Special_Event'])

In [8]:
# Features and target variable
X = data[['Flight_Duration', 'Odd_Hours', 'Passengers_Booked', 'Meals_Loaded','Weather_Conditions','Special_Event']].values
y = data['No_show'].values

In [9]:
X

array([[ 11,   1, 260, 260,   0,   0],
       [  5,   0, 201, 200,   0,   0],
       [  2,   0, 197, 196,   0,   1],
       ...,
       [  7,   0, 194, 193,   1,   1],
       [  2,   0,  58,  57,   0,   0],
       [  3,   0,  92,  91,   0,   0]], dtype=int64)

In [10]:
y

array([ 7,  9,  9, ..., 19, 10,  5], dtype=int64)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Building the Linear Regression Model

In [12]:
model = LinearRegression()
model.fit(X_train, y_train)

In [13]:
# Predict on testing set
y_pred = model.predict(X_test)

In [14]:
y_pred

array([14.98970569, 14.96109835, 14.99287083, ..., 14.9831139 ,
        4.98547616, 15.02974211])

# Calculate and print the mean squared error

In [15]:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 10.017910517269897


In [16]:
from sklearn.metrics import r2_score

# Calculate R² score for training set
train_r2 = r2_score(y_train, model.predict(X_train))
print(f"R² score for the training set: {train_r2:.2f}")

# Calculate R² score for testing set
test_r2 = r2_score(y_test, y_pred)
print(f"R² score for the testing set: {test_r2:.2f}")

R² score for the training set: 0.71
R² score for the testing set: 0.71


### Using Random Forest Regressor

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:

random_forest_model = RandomForestRegressor(n_estimators=25, random_state=42)

In [19]:
# Train the model
random_forest_model.fit(X_train, y_train)

In [20]:
y_pred_rf = random_forest_model.predict(X_test)

In [21]:
y_pred_rf

array([16.93333333, 13.74666667, 15.8512381 , ..., 13.09298413,
        5.09661905, 15.70866667])

In [22]:
# Calculating mean squared error
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Mean Squared Error for Random Forest: {mse_rf}")

Mean Squared Error for Random Forest: 12.983733393826519


In [23]:
# Calculating R² score for the testing set
test_r2_rf = r2_score(y_test, y_pred_rf)
print(f"R² score for the testing set with Random Forest: {test_r2_rf}")

R² score for the testing set with Random Forest: 0.6285224712232169


# Using Standard scalar

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_scaled = sc.fit_transform(X)

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.25, random_state = 7000)

In [26]:
model = LinearRegression()
model.fit(X_train, y_train)

In [27]:
y_pred = model.predict(X_test)

In [28]:
y_pred

array([14.97577512, 14.94717977, 14.95655052, ...,  5.00400709,
        4.97549197,  5.00232191])

In [29]:
# Calculating and printing the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 9.982122587654688


In [30]:
from sklearn.metrics import r2_score

# Calculate R² score for training set
train_r2 = r2_score(y_train, model.predict(X_train))
print(f"R² score for the training set: {train_r2:.2f}")

# Calculate R² score for testing set
test_r2 = r2_score(y_test, y_pred)
print(f"R² score for the testing set: {test_r2:.2f}")

R² score for the training set: 0.71
R² score for the testing set: 0.72


In [31]:
#'Flight_Duration', 'Odd_Hours', 'Passengers_Booked', 'Meals_Loaded','Weather_Conditions','Special_Event'
new_flight_example = [[4, 0, 110, 160, 0, 0]]  # An example input array #weather conditions 0 good 1 bad
                                               # 0 not an odd hour,1 is an odd hour
 
# Use the model to make a prediction
predicted_no_show = model.predict(new_flight_example)

print(f"Predicted No-shows: {predicted_no_show[0]:.0f}")

Predicted No-shows: 1
