In [36]:
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import dice_ml
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training split (`df`) and the held-out split (`dft`).
# Both files are expected to carry a `price` column: later cells drop it from
# the feature matrices and use it as the training / scoring target.
df, dft = (
    pd.read_csv(csv_path)
    for csv_path in ("DataSets/X_train.csv", "DataSets/X_test.csv")
)

In [3]:
def _add_flight_parts(frame):
    """Split the `flight` column of `frame` into two new columns, in place.

    `flight_code` receives the text before the first "-" and `flight_no` the
    text between the first and second "-", cast to int.

    Assumes every `flight` value is a string shaped like "<code>-<number>";
    a value without a "-" or with a non-numeric second part makes the int
    cast raise.
    """
    # Split once and reuse the pieces instead of splitting per output column.
    parts = frame['flight'].str.split("-")
    frame['flight_code'] = parts.str[0]
    frame['flight_no'] = parts.str[1].astype(int)


# Apply the identical transformation to the train and test splits so the two
# feature sets cannot drift apart.
_add_flight_parts(df)
_add_flight_parts(dft)

In [4]:
# Training feature matrix: drop the identifier columns and the target, then
# replace every categorical column by a fixed integer code.
# Categories missing from a mapping become NaN (pandas `Series.map` semantics).
_city_codes = {"Mumbai": 0, "Delhi": 1, "Bangalore": 2, "Kolkata": 3, "Hyderabad": 4, "Chennai": 5}
_day_part_codes = {"Morning": 0, "Early_Morning": 1, "Evening": 2, "Night": 3, "Afternoon": 4, "Late_Night": 5}
_train_codes = {
    "airline": {"Vistara": 0, "Air_India": 1, "GO_FIRST": 2, "Indigo": 3, "AirAsia": 4, "SpiceJet": 5},
    "source_city": _city_codes,
    "destination_city": _city_codes,
    "departure_time": _day_part_codes,
    "arrival_time": _day_part_codes,
    "stops": {"zero": 0, "one": 1, "two_or_more": 2},
    "flight_code": {"UK": 0, "AI": 1, "G8": 2, "E6": 3, "I5": 4, "SG": 5},
}
_train_features = df.drop(columns=['flightId', 'flight', 'price'])
# `assign` overwrites the existing columns in place, so column order is kept.
X = _train_features.assign(
    **{column: _train_features[column].map(codes) for column, codes in _train_codes.items()}
)

In [5]:
# Test feature matrix, encoded with the same integer codes as the training
# features so both matrices share one numeric vocabulary.
# Categories missing from a mapping become NaN (pandas `Series.map` semantics).
Xt = dft.drop(columns=['flightId', 'flight', 'price'])
for _column, _codes in (
    ('airline', dict(Vistara=0, Air_India=1, GO_FIRST=2, Indigo=3, AirAsia=4, SpiceJet=5)),
    ('source_city', dict(Mumbai=0, Delhi=1, Bangalore=2, Kolkata=3, Hyderabad=4, Chennai=5)),
    ('destination_city', dict(Mumbai=0, Delhi=1, Bangalore=2, Kolkata=3, Hyderabad=4, Chennai=5)),
    ('departure_time', dict(Morning=0, Early_Morning=1, Evening=2, Night=3, Afternoon=4, Late_Night=5)),
    ('arrival_time', dict(Morning=0, Early_Morning=1, Evening=2, Night=3, Afternoon=4, Late_Night=5)),
    ('stops', dict(zero=0, one=1, two_or_more=2)),
    ('flight_code', dict(UK=0, AI=1, G8=2, E6=3, I5=4, SG=5)),
):
    Xt[_column] = Xt[_column].map(_codes)

In [6]:
# Training target as a 1-D Series. Selecting with df[['price']] produced an
# (n, 1) DataFrame, which made scikit-learn emit a DataConversionWarning
# ("y = column_or_1d(y, warn=True)") on every fit.
y = df['price']
# Ground-truth prices of the held-out split; used only for scoring below.
y_test = dft['price']

# Three gradient-boosted tree regressors plus an ensemble that combines them.
gb=GradientBoostingRegressor(max_depth=5, max_features=4, n_estimators=2300, learning_rate=0.056, random_state=0)
xgb=XGBRegressor(n_estimators=1300, learning_rate=0.04, n_jobs=-1)
lgbm=lgb.LGBMRegressor(learning_rate=0.09, n_estimators=5800, max_depth=4)
vot=VotingRegressor([('gb', gb), ('xgb', xgb), ('lgb', lgbm)])

rgs = [
    ('Gradient Boosting', gb),
    ('XGBoost', xgb),
    ('LightGBM', lgbm),
    ('Voting Regressor', vot)
]

# Fit each model on the training split and report R2 / MSE on the test split.
for name,rg in rgs:
    rg.fit(X,y)
    yt_pred = rg.predict(Xt)
    print("%s || R2 = %.4f and MSE = %.2f" % (name, r2_score(y_test,yt_pred), mean_squared_error(y_test, yt_pred)))

# Scores recorded from a previous run, kept for reference:
#Gradient Boosting || R2 = 0.8194 and MSE = 31978343.37
#XGBoost || R2 = 0.8140 and MSE = 32941617.74
#LightGBM || R2 = 0.7928 and MSE = 36696315.08
#Voting Regressor || R2 = 0.8193 and MSE = 31995161.06

  y = column_or_1d(y, warn=True)


Gradient Boosting || R2 = 0.8194 and MSE = 31978343.37
XGBoost || R2 = 0.8140 and MSE = 32941617.74
LightGBM || R2 = 0.7928 and MSE = 36696315.08


  y = column_or_1d(y, warn=True)


Voting Regressor || R2 = 0.8193 and MSE = 31995161.06


In [None]:
#streamlit model saving
# Refit the voting ensemble on the full training data and bundle it with the
# feature frame it was trained on.
# np.ravel hands scikit-learn a 1-D target whether `y` is a Series or a
# single-column DataFrame, avoiding the DataConversionWarning.
reg=vot.fit(X,np.ravel(y))
model = {
    'model': reg,
    'X':X
}
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open() previously leaked the handle.
with open('flights_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

In [92]:
# Build the frame DiCE needs (features + outcome) on a COPY of X.
# `dfc = X` only aliased the feature matrix, so adding 'price' to it silently
# added the target column to X itself, leaking the label into any later refit
# or pickle that used X.
dfc = X.copy()
dfc['price']=df['price']
# NOTE(review): every feature is declared continuous, including the
# integer-coded categorical columns (airline, cities, times, flight_code), so
# DiCE may propose in-between or unseen codes - confirm this is intended.
data = dice_ml.Data(dataframe=dfc, continuous_features=['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'duration', 'days_left', 'flight_code', 'flight_no'], outcome_name='price')
dfc = dfc.drop(columns=['price'])
# Refit the voting ensemble on the features-only frame. The result is bound to
# its own name: it was previously assigned to `lgbm`, which overwrote the
# LightGBM estimator variable with a VotingRegressor.
# np.ravel passes a 1-D target whether `y` is a Series or a one-column frame.
cf_regressor=vot.fit(dfc,np.ravel(y))
model = dice_ml.Model(model=cf_regressor, backend='sklearn', model_type='regressor')
expander = dice_ml.Dice(data, model)

  y = column_or_1d(y, warn=True)


In [100]:
# Single query flight, expressed with the same integer codes used to encode
# the training features (e.g. airline 1 = Air_India, stops 2 = two_or_more).
features = dict(
    airline=1,
    source_city=3,
    departure_time=3,
    stops=2,
    arrival_time=5,
    destination_city=4,
    duration=1.95,
    days_left=19,
    flight_code=1,
    flight_no=669,
)
# One-row frame: the counterfactual generator takes query instances as a DataFrame.
features_df = pd.DataFrame([features])

In [101]:

# Ask DiCE for three alternative versions of the query flight whose predicted
# price falls inside the requested range, then render them next to the
# original instance.
counterfactuals = expander.generate_counterfactuals(
    features_df,
    total_CFs=3,
    desired_range=(0, 10000),
)
counterfactuals.visualize_as_dataframe()

100%|██████████| 1/1 [00:02<00:00,  2.14s/it]

Query instance (original outcome : 28766)





Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,duration,days_left,flight_code,flight_no,price
0,1,3,3,2,5,4,1.95,19,1,669,28766.0



Diverse Counterfactual set (new outcome: (0, 10000))


Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,duration,days_left,flight_code,flight_no,price
0,1.0,3.0,3.0,0.0,5.0,4.0,1.95,19.0,1.0,1784.0,4180.167969
1,1.0,3.0,3.0,1.0,5.0,4.0,1.95,19.0,1.0,3416.0,5896.491699
2,1.0,3.0,3.0,1.0,5.0,4.0,1.95,19.0,1.0,2173.0,6285.244629
