In [None]:
import importlib
import matplotlib.pyplot as mp
import numpy as np

%matplotlib inline
import datetime

from sklearn.decomposition import PCA

from sklearn import tree
from sklearn.tree import export_graphviz
import graphviz



In [None]:
import pandas as pd

In [None]:
def get_data(filename, comma):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    filename : path or file-like object, handed straight to ``pd.read_csv``.
    comma : truthy -> read with pandas' default separator (comma);
        falsy -> read with ``';'`` as the separator.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``filename``.
    """
    if not comma:
        return pd.read_csv(filename, sep=';')
    return pd.read_csv(filename)

In [None]:
# Load the cleaned data set (comma separated). Later cells use df.columns to
# translate between feature names and column positions.
df = get_data('cleanedData.csv', True)


In [None]:
# Second load of the same file into a separate frame.
# NOTE(review): df2 is not referenced anywhere else in the visible notebook -- confirm it is still needed.
df2 = get_data('cleanedData.csv', True)

# Prediction

In [None]:
# Training split (comma separated); the 'Booked' column is separated out as the target in a later cell.
train=get_data('trainSet.csv', True)

In [None]:
# Held-out test split (comma separated); the 'Booked' column is separated out as the target in a later cell.
test=get_data('testSet.csv', True)


In [None]:
# Separate the 'Booked' target from the feature columns of each split.
# The frames are read, not mutated (no DataFrame.pop), so this cell can be
# re-run without raising KeyError: 'Booked'.
y_train = train['Booked'].values
X_train = train.drop(columns='Booked').values

y_test = test['Booked'].values
X_test = test.drop(columns='Booked').values



In [None]:
# Min-max scale the features: the scaler is fitted on the training matrix only
# and then applied, unchanged, to both the training and the test matrix.
# NOTE: X_train / X_test are overwritten with their scaled versions, so re-running
# this cell would re-fit the scaler on already-scaled data.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from collections import Counter

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Number of instances labelled 1 in each split.
train_minority = Counter(y_train)[1]
test_minority = Counter(y_test)[1]
print("Minority in training set : %d" % train_minority)
print("Minority in test set : %d" % test_minority)


In [None]:
# Candidate classifiers, keyed by a short name that later cells reuse.
models = {
    'dtree': DecisionTreeClassifier(criterion='entropy'),
    'logistic': LogisticRegression(max_iter=10000, random_state=0),
}


In [None]:
from functools import reduce

In [None]:
from sklearn.model_selection import cross_validate

# Number of times the 10-fold cross-validation is repeated
# (100 was used for the full experiment, 2 keeps the notebook quick).
reps=2
for model in models:
    xv=[]
    xv_auc=[]
    for i in range(reps):
        kf = KFold(n_splits=10, shuffle = True)
        # One cross_validate call scores accuracy and AUC on the SAME folds.
        # Passing an unseeded shuffled KFold to two separate cross_val_score
        # calls would re-shuffle for each call (different partitions per metric)
        # and fit every model twice.
        cv_scores = cross_validate(models[model], X_train, y_train,
                                   scoring=('accuracy', 'roc_auc'), cv=kf)
        xv.append(cv_scores['test_accuracy'].mean())
        xv_auc.append(cv_scores['test_roc_auc'].mean())
    # average of the per-repetition means
    avg_acc = sum(xv) / len(xv)
    avg_auc = sum(xv_auc) / len(xv_auc)
    print("{:22} Avg. Accuracy: {:.2f} Avg. AUC {:.2f}".format(type(models[model]).__name__,avg_acc, avg_auc))
            

# Testing for Bias

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# Scorers that each return one cell of the 2x2 confusion matrix.
#
# scikit-learn's confusion_matrix has true labels on the rows and predicted
# labels on the columns. labels=[0, 1] pins the matrix to 2x2 with row/column 0
# for label 0 and row/column 1 for label 1, so the indexing below cannot fail
# when a fold happens to contain (and predict) a single class.
#
# Naming convention used in this notebook: label 0 is treated as the "positive"
# class. Hence 'tn' + 'fn' is the number of instances PREDICTED as label 1.
def tp(y_true, y_pred):
    """Count of instances with true label 0 predicted as 0 (cell [0, 0])."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 0]


def tn(y_true, y_pred):
    """Count of instances with true label 1 predicted as 1 (cell [1, 1])."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 1]


def fp(y_true, y_pred):
    """Count of instances with true label 1 predicted as 0 (cell [1, 0])."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 0]


def fn(y_true, y_pred):
    """Count of instances with true label 0 predicted as 1 (cell [0, 1])."""
    return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 1]


# Multi-metric scoring dict for cross_validate; results appear as 'test_<key>'.
scoring = {'tp' : make_scorer(tp), 'tn' : make_scorer(tn),
           'fp' : make_scorer(fp), 'fn' : make_scorer(fn)}



In [None]:
# For every model: cross-validate once, then compare the number of instances
# predicted as class 1 with the number actually labelled 1.
bias_cv = {}
folds = 10
minority_count = Counter(y_train)[1]
print("Minority in data set : %d" % minority_count)
for m, estimator in models.items():
    cv_results = cross_validate(estimator, X_train, y_train, cv=folds, scoring=scoring,
                                return_train_score=False, verbose=0, n_jobs=-1)
    # 'test_tn' + 'test_fn' are confusion-matrix cells [1, 1] + [0, 1] as defined by
    # this notebook's scorers, i.e. everything predicted as class 1
    n_tot = cv_results['test_tn'].sum() + cv_results['test_fn'].sum()
    # the two diagonal cells over the number of training instances
    acc = (cv_results['test_tp'].sum() + cv_results['test_tn'].sum()) / len(y_train)
    bias_cv[m] = n_tot

    summary = "{} x CV {:22} No. of bags in dataset: {:d} Pred no. bags: {:d} Acc: {:.2f}"
    print(summary.format(folds, type(estimator).__name__, minority_count, n_tot, acc))


- Logistic regression is accurate but very biased, predicting only 27 bags being bought

# Rectifying Bias

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Pipeline step lists: SMOTE oversampling followed by a classifier.
# Both lists share the same SMOTE instance and reuse the estimators stored in `models`.
smt = SMOTE(random_state=0)
steps_all = {
    'dtree': [('smt', smt), ('dtree', models['dtree'])],
    'logistic': [('smt', smt), ('LR', models['logistic'])],
}

In [None]:
#to check the effect of smote

# Confusion-matrix counts, accuracy and AUC are collected from ONE
# cross_validate call per repetition. The shuffled KFold is unseeded, so
# separate cross-validation calls would each see a different partition (and
# refit the pipeline every time); a single call keeps all metrics on the same folds.
smote_scoring = {**scoring, 'accuracy': 'accuracy', 'roc_auc': 'roc_auc'}

for m in steps_all:
    pipeline=Pipeline(steps_all[m])
    xv_acc = []
    xv_auc=[]
    bias_l=[]

    #running the cross validation multiple times
    for i in range(reps):
        kf = KFold(n_splits=10,shuffle = True)

        cv_results = cross_validate(pipeline, X_train, y_train, cv= kf,scoring=smote_scoring,
                                    return_train_score=False, verbose = 0, n_jobs = -1)

        #testing the bias: 'test_tn' + 'test_fn' are this notebook's names for
        #confusion-matrix cells [1, 1] + [0, 1], i.e. everything predicted as class 1
        n_tot = cv_results['test_tn'].sum() + cv_results['test_fn'].sum()
        bias_l.append(n_tot)

        xv_acc.append(cv_results['test_accuracy'].mean())
        xv_auc.append(cv_results['test_roc_auc'].mean())

    # averages over the repetitions
    avg_bags = sum(bias_l) / len(bias_l)
    avg_acc = sum(xv_acc) / len(xv_acc)
    avg_auc = sum(xv_auc) / len(xv_auc)

    print("{:22} Avg. Pred bag is booked: {:.2f} Avg. Accuracy: {:.2f} Avg. AUC: {:.2f}".format(type(models[m]).__name__,
                                                              avg_bags,avg_acc, avg_auc))
   

# Decision Tree Feature Selection

In [None]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.feature_selection import RFE

In [None]:
# Repeated 10-fold CV of the SMOTE + decision-tree pipeline on all features.
xv=[]
xv_auc=[]
for i in range(reps):
    kf = KFold(n_splits=10, shuffle = True)
    # A single cross_validate call scores accuracy and AUC on the same folds;
    # the unseeded shuffled KFold would otherwise produce a different partition
    # for every separate cross-validation call.
    cv_scores = cross_validate(Pipeline(steps_all['dtree']), X_train, y_train,
                               scoring=('accuracy', 'roc_auc'), cv=kf)
    xv.append(cv_scores['test_accuracy'].mean())
    xv_auc.append(cv_scores['test_roc_auc'].mean())
# average of the per-repetition means
avg_acc = sum(xv) / len(xv)
avg_auc = sum(xv_auc) / len(xv_auc)
print("{:22} Avg. Accuracy: {:.2f} Avg AUC: {:.2f}".format(type(models['dtree']).__name__,avg_acc, avg_auc))
              

In [None]:
from sklearn import metrics

In [None]:
# Estimate the mutual information between each training feature and the target.
# random_state is fixed so the estimate is repeatable between runs.
i_scores = mutual_info_classif(X_train,y_train,random_state=42)
i_scores

In [None]:
# Label each mutual-information score with a column name from df (paired by position).
# NOTE(review): i_scores was computed on X_train, so this pairing assumes df's leading
# columns are the same features, in the same order, as the training matrix -- confirm.
mi = dict(zip(df.columns, i_scores))
df_feat = pd.DataFrame.from_dict(mi, orient='index', columns=['I-Gain'])

# keep only the 50 highest-scoring features, best first
df_feat = df_feat.sort_values(by=['I-Gain'], ascending=False).head(50)
df_feat.head(10)

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
#evaluates each feature subset
def evaluate_model(model, X, y, scoring):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold scores.

    Uses repeated stratified 10-fold cross-validation with 3 repeats and a
    fixed ``random_state``.

    Parameters
    ----------
    model : estimator (or pipeline) passed to ``cross_val_score``.
    X, y : feature matrix and target vector.
    scoring : scoring specification forwarded to ``cross_val_score``.

    Returns
    -------
    The array of scores returned by ``cross_val_score``.
    """
    return cross_val_score(
        model, X, y,
        scoring=scoring,
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
    )

In [None]:
#calculates the mean of the scores
def getMean(scores):
    """Return the arithmetic mean of ``scores``.

    ``scores`` must be a sized iterable of numbers; an empty one raises
    ZeroDivisionError because the total is divided by ``len(scores)``.
    """
    return sum(scores) / len(scores)

In [None]:
# For k = 1 .. number of candidate features: select the k best features by
# mutual information, then evaluate the SMOTE + decision-tree pipeline on them.
auc_scores = []
acc_scores = []
n_candidates = df_feat.shape[0]
for kk in range(1, n_candidates + 1):
    pipeline_feat = Pipeline(steps=[
        # the step is named 'anova' but the score function is mutual_info_classif
        ('anova', SelectKBest(mutual_info_classif, k=kk)),
        ('dtree', Pipeline(steps_all['dtree'])),
    ])
    # per-fold AUC scores for this subset size
    auc_scores.append(evaluate_model(pipeline_feat, X_train, y_train, 'roc_auc'))

# row i of df_feat receives the mean AUC obtained with i + 1 selected features
df_feat['AUC'] = [getMean(fold_scores) for fold_scores in auc_scores]

In [None]:
# Index label (a feature name) of the df_feat row with the highest mean AUC;
# the first such row is taken when several rows tie.
finalFeat = df_feat['AUC'].idxmax()
finalFeat

In [None]:
# Names of every feature from the top of df_feat down to, and including, finalFeat:
# the subset that produced the highest AUC.
select_feat = df_feat.loc[:finalFeat].index.tolist()
select_feat

In [None]:
# Translate the selected feature names into integer column positions, using df's column order.
auc_feat=[df.columns.get_loc(c) for c in select_feat]

In [None]:
# Training matrix restricted to the selected feature columns.
X_aft=X_train[:, auc_feat]

In [None]:
# NOTE: the triple-quoted string below is the last expression of this cell, so it
# (not select_feat) is what the cell displays. It appears to record the
# selection obtained in an earlier run -- confirm before relying on it.
select_feat
'''
result:
select_feat=['MaxExternalBookingID','TreatmentProductSequence','ProductID',
        'TotalFare','OriginalPrice','SegmentDestinationLocationCode_BIS','FareClass_M']
'''

In [None]:
# Repeated (10 times) 10-fold CV of the SMOTE + decision-tree pipeline on the
# selected feature subset only.
xv=[]
xv_auc=[]
for i in range(10):
    kf = KFold(n_splits=10, shuffle = True)
    # A single cross_validate call scores accuracy and AUC on the same folds;
    # the unseeded shuffled KFold would otherwise produce a different partition
    # for every separate cross-validation call.
    cv_scores = cross_validate(Pipeline(steps_all['dtree']), X_aft, y_train,
                               scoring=('accuracy', 'roc_auc'), cv=kf)
    xv.append(cv_scores['test_accuracy'].mean())
    xv_auc.append(cv_scores['test_roc_auc'].mean())
# average of the per-repetition means
avg_acc = sum(xv) / len(xv)
avg_auc = sum(xv_auc) / len(xv_auc)
print("{:22} Avg. Accuracy: {:.2f} Avg AUC: {:.2f}".format(type(models['dtree']).__name__,avg_acc, avg_auc))
              

# Classifier Evaluation

## AUC


In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

In [None]:
# Fit SMOTE + logistic regression on the selected features and score the test set.
clf=Pipeline(steps_all['logistic'])
clf.fit(X_aft, y_train)
X_test_aft=X_test[:,auc_feat]
# ROC analysis needs a continuous score: use the predicted probability of
# class 1. Hard 0/1 labels from predict() would collapse the curve to a single
# operating point and misstate the AUC.
probs_lr=clf.predict_proba(X_test_aft)[:, 1]
auc_lr = roc_auc_score(y_test, probs_lr)
print(round(auc_lr,2)) # the .64 noted in the original cell was obtained from hard predict() labels

fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, probs_lr)

In [None]:
# Fit SMOTE + decision tree on the selected features and score the test set.
clf=Pipeline(steps_all['dtree'])
clf.fit(X_aft, y_train)
X_test_aft=X_test[:,auc_feat]
# ROC analysis needs a continuous score: use the predicted probability of
# class 1. Hard 0/1 labels from predict() would collapse the curve to a single
# operating point and misstate the AUC.
probs_dt=clf.predict_proba(X_test_aft)[:, 1]
auc_dt = roc_auc_score(y_test, probs_dt)
print(round(auc_dt,2)) # the .71 noted in the original cell was obtained from hard predict() labels

fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, probs_dt)



In [None]:
# ROC curves of the logistic-regression and decision-tree pipelines, plus the
# diagonal chance baseline, drawn on one explicit Axes.
fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(fpr_lr, tpr_lr, label=f'AUC (Logistic Regression) = {auc_lr:.2f}')
ax.plot(fpr_dt, tpr_dt, label=f'AUC (Decision Tree) = {auc_dt:.2f}')
ax.plot([0, 1], [0, 1], color='blue', linestyle='--', label='Baseline')
ax.set_title('ROC Curve', size=20)
ax.set_xlabel('False Positive Rate', size=14)
ax.set_ylabel('True Positive Rate', size=14)
ax.legend();

# Price Mapping

In [None]:
clf=Pipeline(steps_all['dtree'])

#only using one price feature: drop 'TotalFare' and keep 'OriginalPrice'.
#select_feat is rebound to a filtered copy (instead of list.remove), so the
#cell can be re-run and does not fail when 'TotalFare' was not selected.
select_feat = [feat for feat in select_feat if feat != 'TotalFare']
#position of the price inside the reduced feature list
#(raises ValueError if 'OriginalPrice' was not selected)
originalPriceIndex=select_feat.index('OriginalPrice')

#using the features chosen in feature selection
auc_feat=[df.columns.get_loc(c) for c in select_feat]
clf.fit(X_train[:,auc_feat], y_train)
X_test_aft=X_test[:,auc_feat]

In [None]:
# Column position of "OriginalPrice" in df; used below to pick the price column
# out of the inverse-transformed feature matrices.
dfOriginalPriceIndex=df.columns.get_loc("OriginalPrice")

In [None]:
#prices offered in the test set, on the original (unscaled) scale
all_prices = pd.DataFrame(data=scaler.inverse_transform(X_test))[dfOriginalPriceIndex]
#min and max price observed in the TRAINING set, on the original scale
train_prices = pd.DataFrame(data=scaler.inverse_transform(X_train))[dfOriginalPriceIndex]
maxPrice = train_prices.max()
minPrice = train_prices.min()
 

In [None]:
#scale each price point inputted using min max scaler
def scalePrice(X):
    """Min-max scale price(s) ``X`` with the module-level ``minPrice`` / ``maxPrice``.

    Returns ``(X - minPrice) / (maxPrice - minPrice)``.
    """
    price_span = maxPrice - minPrice
    return (X - minPrice) / price_span

In [None]:
#inverse transform of scalePrice
def deScale(y):
    """Map scaled price(s) ``y`` back to the original scale (inverse of ``scalePrice``).

    Returns ``y * (maxPrice - minPrice) + minPrice`` using the module-level bounds.
    """
    price_span = maxPrice - minPrice
    return minPrice + (y * price_span)

In [None]:
# Candidate price points 25 .. 50 (inclusive) and their min-max scaled values.
prices = list(range(25, 51))
scaleprices = scalePrice(np.asarray(prices))

In [None]:
#outputs the recommended price for each test instance
def choosePrice(current_instance):
    """Pick the candidate price with the highest predicted class-1 probability.

    ``current_instance`` is a 2-D array (callers pass a single row of shape
    ``(1, n_features)``). One copy of it is made per entry of the module-level
    ``prices`` list, the column ``originalPriceIndex`` of each copy is replaced
    by the matching value from ``scaleprices``, and every copy is scored with
    ``clf.predict_proba``.

    Returns the (still scaled) feature row with the highest probability in
    column 1 of ``predict_proba``; ``np.argmax`` keeps the first maximum, so
    ties go to the earliest entry of ``prices``.
    """
    n_candidates = len(prices)
    # float copy of the instance, one row per candidate price
    candidates = np.array(
        np.broadcast_to(current_instance, (n_candidates, current_instance.shape[1])),
        dtype=float,
    )
    candidates[:, originalPriceIndex] = scaleprices
    best_row = np.argmax(clf.predict_proba(candidates)[:, 1])
    return candidates[best_row, :]

In [None]:
# Pre-allocated output: one row per test instance, one column per selected feature.
newArray=np.zeros((X_test.shape[0],len(select_feat)))

In [None]:
# Store the best-priced version of every test instance.
for i, test_row in enumerate(X_test_aft):
    newArray[i] = choosePrice(test_row.reshape(1, -1))

In [None]:
#recommended price of every test instance, converted back to the original price scale
predictedPrice = [
    deScale(newArray[row][originalPriceIndex])
    for row in range(X_test.shape[0])
]

In [None]:
# Recommended prices as a named pandas Series.
predPrice=pd.Series(predictedPrice,name='predPrice')

# Evaluation - Revenue Generation

## Test Set Revenue
Get the actual revenue generated by the test set

In [None]:
# Offered price and booking outcome of every test instance, side by side.
testResults = pd.DataFrame({'TestPrices': all_prices, 'y_test': y_test})

In [None]:
# Actual revenue: sum of the offered prices over the test instances with y_test == 1.
booked_mask = testResults['y_test'] == 1
y_booked = testResults[booked_mask]
testRevenue = y_booked['TestPrices'].sum()
testRevenue

## Predicted Revenue from Pricing Model

In [None]:
# For every candidate price: how often it was offered in the test set (counts)
# and the sum of y_test over those offers (accepts).
n_prices = len(prices)
accepts = np.zeros(n_prices)
counts = np.zeros(n_prices)
# offered prices are rounded before being matched against the candidate prices
all_prices = round(all_prices)
for i, p in enumerate(prices):
    offered_at_p = all_prices == p
    counts[i] = np.sum(offered_at_p)
    accepts[i] = np.sum(y_test[offered_at_p])

In [None]:
def smoothcounts(counts,accepts):
    """Estimate a purchase probability for every price point.

    Parameters
    ----------
    counts : array with the number of times each price was offered.
    accepts : array with the number of acceptances at each price
        (same order and length as ``counts``).

    Returns
    -------
    (rawprob, prob2)
        ``rawprob`` is accepts / counts, with the denominator set to 1 where a
        price was never offered. ``prob2`` also uses outcomes at other prices:
        ``(cum_accepts + (N - cum_accepts - cum_rejects) * rawprob) / N`` where
        ``N`` is the total number of offers.

    Both arrays are also printed.
    """
    total_offers = np.sum(counts, 0)
    rejected = counts - accepts

    # acceptances at this price or at any later (higher-index) price: reverse running total
    cum_accepts = np.flip(np.cumsum(np.flip(accepts)))
    # rejections at this price or at any earlier (lower-index) price
    cum_rejects = np.cumsum(rejected)
    decided = cum_accepts + cum_rejects

    # empirical acceptance rate; a zero count is replaced by 1 to avoid 0/0
    rawprob = accepts / np.where(counts == 0, 1, counts)
    print("rawprob is",rawprob)

    # offers not covered by the two running totals are weighted by the raw rate
    prob2 = (cum_accepts + (total_offers - decided) * rawprob) / total_offers
    print("prob2 is",prob2)
    return rawprob,prob2

In [None]:
# Raw and smoothed acceptance probabilities for the candidate prices (also printed by smoothcounts).
rawprob,prob2=smoothcounts(counts,accepts)

In [None]:
#purchasing probability for each integer price point 25 .. 50, taken from prob2 by position
priceProbs = {p: prob2[k] for k, p in enumerate(range(25, 51))}
priceProbs

In [None]:
#expected revenue of the pricing model: recommended price times the purchase
#probability estimated for that price, summed over all test instances.
#predPrice holds floats that went through a scale / de-scale round trip, so a
#value may differ from its integer price by floating-point error (e.g.
#30.000000000000004); the lookup key is therefore the rounded integer price.
revenue = sum(p * priceProbs[int(round(p))] for p in predPrice)
revenue