<a href="https://colab.research.google.com/github/Varunaqua2004/1st-Repository/blob/main/ML_Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Mount Google Drive so the spreadsheets stored there become readable.
from google.colab import drive
drive.mount('/content/drive')

# Locations of the training and test workbooks on the mounted drive.
file_train = '/content/drive/MyDrive/EDA Project 1/Data_Train.xlsx'
file_test = '/content/drive/MyDrive/EDA Project 1/Test_set.xlsx'

# Read each workbook into its own DataFrame.
train_df = pd.read_excel(file_train)
test_df = pd.read_excel(file_test)


Mounted at /content/drive


In [3]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
# Preview the first five rows of the test data.
test_df.head(n=5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [5]:
# Data preprocessing: handle missing values.
# Drop every training row that contains at least one missing value. The result
# is rebound to train_df instead of using inplace=True, so the cell produces
# its output from the returned frame rather than mutating the object in place.
train_df = train_df.dropna()

In [6]:
# Target (y) is the Price column; features (X) are all remaining columns.
y = train_df['Price']
X = train_df.drop(columns=['Price'])

In [7]:
# Partition the feature columns by dtype:
#   - int64 / float64 columns are treated as numerical
#   - object columns are treated as categorical
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include='object').columns

In [8]:
# Preprocessing: standardize the numerical columns and one-hot encode the
# categorical ones.
numerical_transformer = StandardScaler()
# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Columns not listed in either transformer are dropped (the default behaviour,
# spelled out here).
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)


In [9]:
# Regressor: a random forest of 100 trees with a fixed seed for repeatable fits.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [10]:
# Chain preprocessing and the regressor so both are fitted and applied together.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

In [11]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Fit the preprocessing steps and the regressor on the training split.
pipeline.fit(X=X_train, y=y_train)

In [14]:
# Score the fitted pipeline on the held-out validation split.
y_pred = pipeline.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred)
val_r2 = r2_score(y_val, y_pred)
print(f"Mean Squared Error: {val_mse}")
print(f"R-Squared Value: {val_r2}")

Mean Squared Error: 1733929.251182349
R-Squared Value: 0.919584235143688


In [15]:
# Predict flight prices for the test dataset.
# Only the columns the pipeline was trained on (X.columns) are passed in, so
# re-running this cell after 'Predicted-Price' has been added to test_df does
# not hand the earlier predictions back to the pipeline, and a test file that
# lacks one of the training columns fails right here with a KeyError.
test_predictions = pipeline.predict(test_df[X.columns])

In [17]:
# Attach the predictions to the test rows and export everything to Excel.
test_df['Predicted-Price'] = test_predictions
# index=False keeps the DataFrame index out of the spreadsheet.
test_df.to_excel(excel_writer='predicted_flight_prices.xlsx', index=False)
print("Predictions saved to a new file 'predicted_flight_prices.xlsx'")

Predictions saved to a new file 'predicted_flight_prices.xlsx'


In [18]:
# Read the exported workbook back in and preview its first five rows.
final_df = pd.read_excel('predicted_flight_prices.xlsx')
final_df.head(n=5)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Predicted-Price
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info,14714.0
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info,4336.5
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included,12898.0
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info,13179.518333
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info,3984.38
