In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read 'airlines.xlsx' into the working DataFrame used by the cells below.
df_data = pd.read_excel(io='airlines.xlsx')

In [None]:
# Discard every row that has at least one missing value (dropna defaults).
df_data = df_data.dropna()

In [None]:
# Print the number of missing values remaining in each column.
print(df_data.isna().sum(axis=0))

In [None]:
# Convert the three date/time columns to pandas datetimes.
# Only Date_of_Journey is parsed with an explicit day/month/year format;
# Dep_Time and Arrival_Time are given no format, so pandas infers the parsing.
df_data = df_data.assign(
    Date_of_Journey=lambda frame: pd.to_datetime(frame['Date_of_Journey'], format='%d/%m/%Y'),
    Dep_Time=lambda frame: pd.to_datetime(frame['Dep_Time']),
    Arrival_Time=lambda frame: pd.to_datetime(frame['Arrival_Time']),
)

In [None]:
# Split the parsed timestamps into separate numeric columns:
# month/day/year of the journey date, and hour/minute of departure and arrival.
df_data = df_data.assign(
    Journey_month=df_data['Date_of_Journey'].dt.month,
    Journey_day=df_data['Date_of_Journey'].dt.day,
    Journey_year=df_data['Date_of_Journey'].dt.year,
    Dep_Time_hour=df_data['Dep_Time'].dt.hour,
    Dep_Time_minute=df_data['Dep_Time'].dt.minute,
    Arrival_Time_hour=df_data['Arrival_Time'].dt.hour,
    Arrival_Time_minute=df_data['Arrival_Time'].dt.minute,
)

In [None]:
def categorize_dep_time(hour):
    """Map a departure hour to a named part of the day.

    Parameters
    ----------
    hour : number
        Hour of departure; compared numerically against the band edges.

    Returns
    -------
    str
        "Early Morning" for [4, 8), "Morning" for [8, 12), "Noon" for [12, 16),
        "Evening" for [16, 20), "Night" for [20, 24). Any value that falls in
        none of these half-open bands (e.g. 0-3) is labelled "Late Night".
    """
    # (inclusive start, exclusive end, label), checked in order.
    bands = (
        (4, 8, "Early Morning"),
        (8, 12, "Morning"),
        (12, 16, "Noon"),
        (16, 20, "Evening"),
        (20, 24, "Night"),
    )
    for start, end, label in bands:
        if start <= hour < end:
            return label
    return "Late Night"

# Label each flight with its part-of-day bucket based on the departure hour.
df_data['dep_description'] = df_data['Dep_Time_hour'].apply(categorize_dep_time)

# Count flights per bucket and draw the counts as a bar chart.
dep_counts = df_data['dep_description'].value_counts()
ax = dep_counts.plot.bar()
ax.set_xlabel('Dep_Description')
ax.set_ylabel('Number of Flights')
ax.set_title('Number of Flights by Departure Description')
plt.show()


In [None]:
# Duration is the parsed arrival timestamp minus the parsed departure timestamp.
# NOTE(review): if Dep_Time / Arrival_Time hold only a time of day (TODO confirm
# against the workbook), a flight landing after midnight yields a negative value.
df_data['Duration'] = df_data['Arrival_Time'].sub(df_data['Dep_Time'])

# `components` exposes the individual fields of each timedelta; the hours and
# minutes fields exclude whole days, while Duration_total_mins below is the full
# signed length expressed in minutes.
duration_parts = df_data['Duration'].dt.components
df_data['Duration_hours'] = duration_parts['hours']
df_data['Duration_mins'] = duration_parts['minutes']
df_data['Duration_total_mins'] = df_data['Duration'].dt.total_seconds().div(60)

# Preview the three derived columns.
df_data.loc[:, ['Duration_hours', 'Duration_mins', 'Duration_total_mins']].head()

In [None]:
# Scatter of price against total duration, with a least-squares straight line on top.
duration_mins = df_data['Duration_total_mins']
price = df_data['Price']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(duration_mins, price, alpha=0.5)

# A degree-1 polyfit returns the coefficients highest power first: (slope, intercept).
slope, intercept = np.polyfit(duration_mins, price, deg=1)
ax.plot(duration_mins, slope * duration_mins + intercept, color='red')

ax.set_xlabel('Duration_total_mins')
ax.set_ylabel('Price')
ax.set_title('Price vs. Duration with Regression Line')
plt.show()


# Price vs. duration again, drawn one Total_Stops value at a time so that each
# value gets its own scatter series and legend entry.
fig, ax = plt.subplots(figsize=(10, 6))
stops = df_data['Total_Stops'].unique()

for stop in stops:
    subset = df_data.loc[df_data['Total_Stops'].eq(stop)]
    ax.scatter(
        subset['Duration_total_mins'],
        subset['Price'],
        label=f'{stop} stop(s)',
        alpha=0.5,
    )

ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Price')
ax.set_title('Price vs. Duration by Number of Stops')
ax.legend()
plt.show()

# Regression line (values recorded from an earlier run; not printed by this cell):
# Price = -0.03 * Duration_total_mins + 8861.35

In [None]:
# Keep only the Jet Airways flights. The explicit .copy() makes this an
# independent frame, so adding the 'Route' column below does not write into a
# slice of df_data (which would raise pandas' SettingWithCopyWarning and leave
# it ambiguous whether the assignment took effect).
jet_airways_data = df_data[df_data['Airline'] == 'Jet Airways'].copy()

# One text label per flight of the form "<Source> to <Destination>".
jet_airways_data['Route'] = jet_airways_data['Source'] + ' to ' + jet_airways_data['Destination']

# value_counts sorts by descending frequency, so the first bar is the busiest route.
most_used_routes = jet_airways_data['Route'].value_counts()

plt.figure(figsize=(12, 8))
most_used_routes.plot(kind='bar', color='skyblue')
plt.xlabel('Route')
plt.ylabel('Number of Flights')
plt.title('Most Used Routes for Jet Airways')
# Tilt the route labels so they stay readable.
plt.xticks(rotation=45)
plt.show()
# The most used route for Jet Airways is Delhi to Cochin.

In [None]:
import patsy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Build the response (Price) and the design matrix for a one-predictor model
# from the formula; both come back as DataFrames.
y, X = patsy.dmatrices('Price ~ Duration_total_mins', data=df_data, return_type='dataframe')

# Fit ordinary least squares and predict on the same rows used for fitting,
# so the R² printed below is an in-sample score.
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
print(f'R² score: {r2:.4f}')

# Observed points with the fitted line overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_data['Duration_total_mins'], df_data['Price'], alpha=0.5, label='Data points')
ax.plot(df_data['Duration_total_mins'], y_pred, color='red', label='Regression line')
ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Price')
ax.set_title('Price vs. Duration with Regression Line (Patsy and Sklearn)')
ax.legend()
plt.show()
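# The low R² value, 0.25711595260146614 (from an earlier run), indicates that duration alone
# explains only about a quarter of the variance in price, i.e. a weak linear relationship.
# NOTE(review): this note originally read "the positive slope of the line shows a positive
# correlation between duration and price", but the polyfit note earlier in this notebook
# records a slope of -0.03 for the same Price ~ Duration_total_mins fit. Check the sign of
# the fitted coefficient (model.coef_) before relying on either statement.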



In [None]:
from sklearn.model_selection import train_test_split
# Add the weekday name of the journey date as another predictor.
df_data["Journey_weekday"] = df_data["Date_of_Journey"].dt.day_name()

# In a patsy formula '*' expands to the main effects plus every interaction
# among the listed terms, so this design matrix can be very wide.
complex_formula = 'Price ~ Airline * Source * Destination * Total_Stops * dep_description * Journey_month * Journey_weekday'
y, X = patsy.dmatrices(complex_formula, data=df_data, return_type='dataframe')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_complex = LinearRegression().fit(X_train, y_train)
y_pred_complex = model_complex.predict(X_test)

# R² on the held-out rows only.
r2_complex = r2_score(y_test, y_pred_complex)
print(f'Complex model R² score: {r2_complex:.4f}')

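# R² = 0.7423319623114062 (held-out test set, from an earlier run).
# This R² is much higher than the value found for the duration-only model, so the combined
# predictors explain far more of the variance in price. Note that the earlier score was
# computed on the same rows used for fitting, whereas this one is computed on held-out rows.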


In [None]:
# Additive formula: '+' contributes each predictor's main effect only,
# with no interaction terms between predictors.
additive_formula = 'Price ~ Airline + Source + Destination + Total_Stops + dep_description + Journey_month + Journey_weekday'
y, X = patsy.dmatrices(additive_formula, data=df_data, return_type='dataframe')

# Same 80/20 split parameters and seed as used elsewhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_different = LinearRegression().fit(X_train, y_train)
y_pred_different = model_different.predict(X_test)

# R² on the held-out rows only.
r2_different = r2_score(y_test, y_pred_different)
print(f'Different model R² score: {r2_different:.4f}')
