In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# Silence every warning for the rest of the session to keep the output tidy.
warnings.filterwarnings('ignore')

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Load the raw flight-fare records and take a first look at them.
data = pd.read_csv("Flight_Fare.csv")
data.info()      # column names, non-null counts and dtypes
data.describe()  # summary statistics

# Automated univariate analysis of the untouched DataFrame.
import sweetviz as sv

my_report = sv.analyze(data)
my_report.show_html()  # write the analysis out as an HTML report

# Bar charts of ticket price against each categorical attribute.
# Each entry is (column plotted on the x-axis, figure title, tick rotation).
# NOTE(review): plt.bar receives one bar per row, so bars that share a
# category are drawn on top of each other -- presumably only the tallest one
# per category ends up visible. Confirm this is the intended chart.
for column, title, rotation in (
    ('Airline', 'Price by Airline', 60),
    ('Source', 'Price-Source', 45),
    ('Destination', 'Price-Destination', 45),
):
    plt.figure(figsize=(12, 6))
    plt.bar(data[column], data['Price'])
    plt.title(title)
    plt.xlabel(column)
    plt.ylabel('Price')
    plt.xticks(rotation=rotation)  # rotate x-axis labels for readability
    # show() must actually be called; a bare `plt.show` only names the
    # function and renders nothing.
    plt.show()

# Price against the number of stops.
sns.relplot(x='Total_Stops', y='Price', data=data)
plt.show()

# Pairwise relationships between the columns seaborn can plot.
sns.pairplot(data)
plt.show()


# Reload the raw file so preprocessing starts from the untouched records.
# pd.read_csv, called like this, already returns a DataFrame, so no
# list-to-DataFrame conversion is required afterwards.
data = pd.read_csv("Flight_Fare.csv")

# Function to convert duration to minutes
def convert_to_minutes(duration):
    """Return the total number of minutes described by *duration*.

    *duration* is a whitespace-separated string of tokens such as
    ``"2h 50m"``. A token containing ``h`` sets the hour count, otherwise a
    token containing ``m`` sets the minute count; tokens with neither letter
    are ignored. If a unit occurs more than once, the last occurrence wins.

    Raises ValueError when a token does not reduce to an integer once the
    unit letter has been stripped from its ends.
    """
    amounts = {'h': 0, 'm': 0}
    for token in duration.split():
        # 'h' is checked first, so it takes precedence when a token
        # contains both letters.
        if 'h' in token:
            unit = 'h'
        elif 'm' in token:
            unit = 'm'
        else:
            continue
        amounts[unit] = int(token.strip(unit))
    return 60 * amounts['h'] + amounts['m']

# Turn every textual duration into a number of minutes ...
minutes_per_flight = data['Duration'].apply(convert_to_minutes)
data['Duration'] = minutes_per_flight

# ... and give the column a name that states its new unit.
data.rename(columns={'Duration': 'duration_minutes'}, inplace=True)

# Show the DataFrame after the conversion.
print(data)

# Optionally, persist the converted dataset under a file name of your choice:
# data.to_csv('updated_dataset.csv', index=False)

# Integer code assigned to every known value of each categorical column.
# The codes are real integers (not digit strings) so the encoded columns get
# a numeric dtype and can be used directly by numeric tooling.
# Values missing from a table are turned into NaN by Series.map.
CATEGORY_CODES = {
    'Destination': {
        'Cochin': 0,
        'Banglore': 1,
        'Delhi': 2,
        'New Delhi': 3,
        'Hyderabad': 4,
        'Kolkata': 5,
    },
    'Source': {
        'Delhi': 0,
        'Kolkata': 1,
        'Banglore': 2,
        'Mumbai': 3,
        'Chennai': 4,
    },
    'Airline': {
        'Jet Airways': 0,
        'IndiGo': 1,
        'Air India': 2,
        'Multiple carriers': 3,
        'SpiceJet': 4,
        'Vistara': 5,
        'Air Asia': 6,
        'GoAir': 7,
        'Multiple carriers Premium economy': 8,
        'Jet Airways Business': 9,
        'Vistara Premium economy': 10,
        'Trujet': 11,
    },
    'Additional_Info': {
        # Both spellings of "no info" share one code.
        'No info': 0,
        'No Info': 0,
        'In-flight meal not included': 1,
        'No check-in baggage included': 2,
        '1 Long layover': 3,
        'Change airports': 4,
        'Business class': 5,
        '1 Short layover': 6,
        'Red-eye flight': 7,
        '2 Long layover': 8,
    },
}

for column_name, codes in CATEGORY_CODES.items():
    data[column_name] = data[column_name].map(codes)

data.head()

# `data_encoder` is a second name for the same DataFrame (no copy is made),
# so the encodings below are written straight into `data`.
data_encoder = data
from sklearn.preprocessing import LabelEncoder

lc = LabelEncoder()
# Replace stops, departure time and arrival time by integer labels. The same
# encoder object is re-fitted for every column, one after the other.
for label_column in ('Total_Stops', 'Dep_Time', 'Arrival_Time'):
    data_encoder[label_column] = lc.fit_transform(data_encoder[label_column])

# Break the journey date on '/' into three separate columns ...
date_columns = ['Day', 'Month', 'Year']
data[date_columns] = data['Date_of_Journey'].str.split('/', expand=True)

# ... and make each of them numeric.
for date_column in date_columns:
    data[date_column] = pd.to_numeric(data[date_column])

# The original date text and the route column are no longer kept.
data.drop(columns=['Date_of_Journey'], inplace=True)
data.drop(columns=['Route'], inplace=True)

# Subset of columns kept together with the target.
corr_data = data[['Airline', 'Source', 'Destination', 'Dep_Time', 'Arrival_Time',
                  'duration_minutes', 'Total_Stops', 'Day', 'Month', 'Price']]

# Features are every column except the target; the target is the fare.
X = data.drop('Price', axis=1)
y = data.Price

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import MinMaxScaler
scale = MinMaxScaler()

# Learn the per-feature min/max from the training split only, then apply that
# same transformation to the test split. Re-fitting the scaler on the test
# split would scale the two splits differently and leak test statistics into
# the evaluation.
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression model, created and trained on the training split in one
# step (fit() hands back the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out split and score the predictions.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r_squared = r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r_squared)

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, r2_score

# Gradient boosting regressor with library-default hyperparameters.
# The seed makes repeated runs produce the same model, in line with the other
# seeded steps of this script (random_state=42).
gbm = GradientBoostingRegressor(random_state=42)

# Train on the training split.
gbm.fit(X_train, y_train)

# Predict the held-out split.
y_pred = gbm.predict(X_test)

# Report mean squared error and R-squared on the held-out split.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Base estimator for the search. Seeding it means differences between grid
# candidates come from the hyperparameters rather than from random variation
# between fits, and the search is repeatable.
gbm = GradientBoostingRegressor(random_state=42)

# Hyperparameter values to try (3 x 3 x 3 = 27 combinations).
param_grid = {
    'n_estimators': [50, 100, 150],     # number of boosting stages
    'learning_rate': [0.05, 0.1, 0.2],  # shrinks the contribution of each tree
    'max_depth': [3, 4, 5],             # maximum depth of the individual trees
}

# 5-fold cross-validated search; candidates are ranked by negated MSE
# (higher is better).
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict the held-out split with the best estimator.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report mean squared error and R-squared on the held-out split.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Support vector regression with a linear kernel
# (other kernel names such as 'poly' or 'rbf' can be passed instead).
svm = SVR(kernel='linear')

# Train on the training split, then predict the held-out split.
svm.fit(X_train, y_train)
predictions = svm.predict(X_test)

# Mean squared error on the held-out split.
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# R-squared on the held-out split.
r2 = r2_score(y_test, predictions)
print("R-squared:", r2)

from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Hyperparameter values to try, one sub-grid per kernel.
# `gamma` is only searched together with the 'rbf' kernel: the linear kernel
# does not use it, so crossing it with 'linear' would only repeat identical
# fits. When the linear kernel wins, best_params_ therefore has no 'gamma'.
regularization_values = [0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': regularization_values},
    {'kernel': ['rbf'], 'C': regularization_values, 'gamma': ['scale', 'auto']},
]

# Base support vector regression estimator for the search.
svm = SVR()

# 5-fold cross-validated search; candidates are ranked by negated MSE
# (higher is better).
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict the held-out split with the best estimator.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# Report mean squared error and R-squared on the held-out split.
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

r2 = r2_score(y_test, predictions)
print("R-squared:", r2)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Random forest of 100 trees with a fixed seed; the tree count can be tuned.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train on the training split, then predict the held-out split.
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

# Mean squared error on the held-out split.
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# R-squared on the held-out split.
r2 = r2_score(y_test, predictions)
print("R-squared:", r2)

import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# One-hot encode the four categorical columns.
categorical_columns = ['Airline', 'Source', 'Destination', 'Additional_Info']
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# Separate features from the target and redo the 80/20 split.
# This rebinds X, y and the train/test names used by the steps before it.
X = data_encoded.drop(columns=['Price'])
y = data_encoded['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Squared-error regression objective, evaluated with RMSE.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
}

# Train for a fixed number of boosting rounds.
num_rounds = 100
xg_reg = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Predict the held-out split.
predictions = xg_reg.predict(dtest)

# Mean squared error on the held-out split.
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# R-squared on the held-out split.
r2 = r2_score(y_test, predictions)
print("R-squared:", r2)

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Decision tree with a fixed seed; max_depth, min_samples_split, etc. are
# left at their defaults and can be tuned.
# NOTE(review): X_train / X_test are whatever split is currently bound. When
# this runs right after the XGBoost step, that is the one-hot encoded,
# unscaled split -- confirm this is intended.
dt = DecisionTreeRegressor(random_state=42)

# Train on the training split, then predict the held-out split.
dt.fit(X_train, y_train)
predictions = dt.predict(X_test)

# Mean squared error on the held-out split.
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# R-squared on the held-out split.
r2 = r2_score(y_test, predictions)
print("R-squared:", r2)

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Seeded decision tree used as the base estimator of the search.
dt = DecisionTreeRegressor(random_state=42)

# Hyperparameter values to try (4 x 3 x 3 = 36 combinations).
param_grid = {
    'max_depth': [3, 4, 5, None],     # maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4],    # samples needed at a leaf node
}

# 5-fold cross-validated search; candidates are ranked by negated MSE.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Predict the held-out split with the best estimator.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# Mean squared error on the held-out split.
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# R-squared on the held-out split.
r2 = r2_score(y_test, predictions)
print("R-squared:", r2)