In [14]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load and Explore Data
df = pd.read_csv("economy.csv") # Replace with your file name
print(df.head())
print(df.info()) 
print(df.describe()) 

# 2. Data Preprocessing and Feature Engineering

# a. Convert Time Features to Numerical
def convert_time(time_str):
    try:
        hours, minutes = map(int, time_str.split(':'))
        return hours * 60 + minutes  # Time in minutes
    except: # Handle cases where time format is not proper
        return np.nan

df['dep_time_minutes'] = df['dep_time'].apply(convert_time)
df['arr_time_minutes'] = df['arr_time'].apply(convert_time)

# Extract total time taken in minutes
def extract_time_taken(time_str):
    try:
        parts = time_str.split(' ')
        total_minutes = 0
        for i in range(len(parts) - 1):  # Exclude the 'h' or 'm' unit
            if parts[i].isdigit():
                if parts[i+1] == 'h':
                  total_minutes += int(parts[i]) * 60
                elif parts[i+1] == 'm':
                  total_minutes += int(parts[i])
        return total_minutes
    except: # Handle cases where time format is not proper
        return np.nan

df['time_taken_minutes'] = df['time_taken'].apply(extract_time_taken)

df.drop(['dep_time', 'arr_time', 'time_taken'], axis=1, inplace=True)  # Drop original time columns

# b. Convert Stop to Numerical (Ordinal Encoding)
stop_mapping = {'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops':3, '4 stops':4} # Add more as per your data
df['stop'] = df['stop'].map(stop_mapping)

# c. One-Hot Encode Categorical Features (airline, from, to)
categorical_features = ['airline', 'from', 'to']
df.drop(['ch_code', 'num_code', 'date'], axis=1, inplace=True)

# 3. Define Features (X) and Target (y)
X = df.drop('price', axis=1)
y = df['price']

# 4. Create Preprocessor and Pipeline
numerical_features = ['dep_time_minutes', 'arr_time_minutes', 'time_taken_minutes', 'stop'] # Add any other numerical features you create
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # handles cases where test set has categories not in training set
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])

# 5. Train/Test Split and Model Training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

# 6. Make Predictions and Evaluate
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# 7. Example Prediction (Important!)
new_data = pd.DataFrame({
    'airline': ['SpiceJet'],
    'stop': ['non-stop'],
    'from': ['Delhi'],
    'to': ['Mumbai'],
    'dep_time': ['18:55'],
    'arr_time': ['21:05'],
    'time_taken': ['02h 10m']
})

# Preprocess the new data in the same way as the training data
new_data['dep_time_minutes'] = new_data['dep_time'].apply(convert_time)
new_data['arr_time_minutes'] = new_data['arr_time'].apply(convert_time)
new_data['time_taken_minutes'] = new_data['time_taken'].apply(extract_time_taken)
new_data.drop(['dep_time', 'arr_time', 'time_taken'], axis=1, inplace=True)

new_data['stop'] = new_data['stop'].map(stop_mapping) # Mapping stops

predicted_price = pipeline.predict(new_data)
print(f"Predicted Price: {predicted_price}")

  date   airline ch_code  num_code dep_time   from time_taken       stop  \
0  Fri  SpiceJet      SG      8709    18:55  Delhi    02h 10m  non-stop    
1  Fri  SpiceJet      SG      8157    06:20  Delhi    02h 20m  non-stop    
2  Fri   AirAsia      I5       764    04:25  Delhi    02h 10m  non-stop    
3  Fri   Vistara      UK       995    10:20  Delhi    02h 15m  non-stop    
4  Fri   Vistara      UK       963    08:50  Delhi    02h 20m  non-stop    

  arr_time      to  price  
0    21:05  Mumbai  5,953  
1    08:40  Mumbai  5,953  
2    06:35  Mumbai  5,956  
3    12:35  Mumbai  5,955  
4    11:10  Mumbai  5,955  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206774 entries, 0 to 206773
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        206774 non-null  object
 1   airline     206774 non-null  object
 2   ch_code     206774 non-null  object
 3   num_code    206774 non-null  int64 
 4   dep_time    2

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values