In [1]:
import importlib
import matplotlib.pyplot as mp
import numpy as np

import torch
import torch.nn as nn
SEED=42
torch.manual_seed(SEED)


from sklearn.tree import export_graphviz
import graphviz
%matplotlib inline
import torch.optim as optim
import torch.nn.functional as F

In [2]:
import pandas as pd

In [3]:
def get_data(filename, comma):
    
    if comma:
        df = pd.read_csv(filename)
    else:
        df = pd.read_csv(filename, sep=';')
    return df

In [4]:
df = get_data('cleanedData.csv', True)


In [5]:
#df.drop("Unnamed: 0", axis=1, inplace=True)

In [6]:
df.columns

Index(['MaxExternalBookingID', 'ChannelID', 'NumberOfPassengers', 'TripType',
       'TreatmentRank', 'TotalFare', 'TreatmentProductSequence',
       'TreatmentRanked', 'ProductID', 'Amount',
       ...
       'SegmentDestinationLocationCode_XNA',
       'SegmentDestinationLocationCode_YYC', 'Role_Type_CHECKIN',
       'Role_Type_INITIAL', 'Role_Type_MANAGE', 'Role_Type_RESERVATION',
       'Price_Type_CONTROL', 'Price_Type_DISCOUNT', 'Price_Type_PREMIUM',
       'Booked'],
      dtype='object', length=494)

# Prediction

In [7]:
train=get_data('trainSet.csv', True)
test=get_data('testSet.csv', True)


In [8]:
y_train=train.pop('Booked').values
X_train=train.values

y_test=test.pop('Booked').values
X_test=test.values



In [9]:
from collections import Counter

In [10]:
X_train.shape

(7845, 493)

In [11]:
X_test.shape

(1962, 493)

In [12]:
#take out all prices but OriginalPrice index from xtest
priceFeat=['TotalFare','Amount']
removeInd=[df.columns.get_loc(c) for c in priceFeat]
X_train=np.delete(X_train, removeInd, 1)
X_test=np.delete(X_test, removeInd, 1)
df.drop(priceFeat, inplace=True, axis=1)

In [13]:
dfOriginalPriceIndex=df.columns.get_loc("OriginalPrice")


In [14]:
#scale the values
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [15]:
#get the pricing range
maxPrice=pd.DataFrame(data=scaler.inverse_transform(X_train))[dfOriginalPriceIndex].max()
minPrice=pd.DataFrame(data=scaler.inverse_transform(X_train))[dfOriginalPriceIndex].min()
    

In [16]:
#get all the prices customers were offered in the test set
all_prices=pd.DataFrame(data=scaler.inverse_transform(X_test))[dfOriginalPriceIndex]

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier


In [18]:
#pip install pygame

In [19]:
from torch.utils.data import TensorDataset, DataLoader

## SMOTE => Bias

In [20]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [21]:
sm = SMOTE(random_state=42, sampling_strategy=1)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [22]:
y_train[y_train==1].shape[0]
#y_train[y_train==0].shape[0]

1005

In [23]:
y_train_res[y_train_res==1].shape[0]
y_train_res[y_train_res==0].shape[0]

6840

In [24]:
#split train set into training and validation set
X_train_res, X_valid, y_train_res, y_valid = train_test_split(X_train_res, y_train_res, test_size=0.25, random_state=1, stratify=y_train_res) 

In [25]:
X_train_new = torch.from_numpy(X_train_res).float()
y_train_new = torch.squeeze(torch.from_numpy(y_train_res).float())
X_valid_new = torch.from_numpy(X_valid).float()
y_valid_new = torch.squeeze(torch.from_numpy(y_valid).float())
X_test_new = torch.from_numpy(X_test).float()
y_test_new = torch.squeeze(torch.from_numpy(y_test).float())

In [26]:
print(X_train_new.shape, y_train_new.shape)
print(X_test_new.shape, y_test_new.shape)

torch.Size([10260, 491]) torch.Size([10260])
torch.Size([1962, 491]) torch.Size([1962])


In [27]:
n_input_dim=X_train_new.shape[1] #= the number of features
n_hidden =8
n_output = 1

In [28]:
#network architecture
class Net(nn.Module):
    def __init__(self, n_input_dim):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_input_dim, 64)
        self.fc3 = nn.Linear(64, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        return torch.sigmoid(self.fc3(x))

In [29]:
net=Net(n_input_dim)
#print(net)

In [30]:
#define the hyperparameters
optimizer = optim.Adam(net.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [31]:
from sklearn import metrics

In [32]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_train_new = X_train_new.to(device)
y_train_new = y_train_new.to(device)
X_valid_new = X_valid_new.to(device)
y_valid_new = y_valid_new.to(device)
X_test_new = X_test_new.to(device)
y_test_new = y_test_new.to(device)
net = net.to(device)
criterion = criterion.to(device)

In [33]:
y_train_new[y_train_new==0].shape

torch.Size([5130])

In [34]:
def calculate_accuracy(y_true, y_pred):
    predicted = y_pred.ge(.5).view(-1)
    return (y_true == predicted).sum().float() / len(y_true)

In [35]:
def round_tensor(t, decimal_places=3):
    return round(t.item(), decimal_places)

In [36]:
EPOCHS = 900
total_train_acc=[]
total_test_acc=[]
for epoch in range(EPOCHS):
    y_pred =torch.squeeze(net(X_train_new))
   
    #y_pred=net(X_train_new)
    #print(y_pred.shape)
    train_loss = criterion(y_pred, y_train_new)
    
    if epoch % 100 == 0:
        train_acc = calculate_accuracy(y_train_new, y_pred)
        #train_acc = metrics.roc_auc_score(y_train_new.detach().numpy(), y_pred.detach().numpy())
        total_train_acc.append(train_acc)
        y_valid_pred = net(X_valid_new)
        
        y_valid_pred = torch.squeeze(y_valid_pred)
        #print(y_test_pred)
        test_loss = criterion(y_valid_pred, y_valid_new)
        
        
        print(type(train_loss))
        test_acc = calculate_accuracy(y_valid_new, y_valid_pred)
        #test_acc = metrics.roc_auc_score(y_valid_new.detach().numpy(), y_valid_pred.detach().numpy())
        total_test_acc.append(test_acc)
        print(type(test_acc))
        print(f'''epoch {epoch}
Train set - loss: {round_tensor(train_loss)}, accuracy: {round_tensor(train_acc)}
Test  set - loss: {round_tensor(test_loss)}, accuracy: {round_tensor(test_acc)}
''')
        
    optimizer.zero_grad()
    
    train_loss.backward()
    
    optimizer.step()

<class 'torch.Tensor'>
<class 'torch.Tensor'>
epoch 0
Train set - loss: 0.694, accuracy: 0.5
Test  set - loss: 0.694, accuracy: 0.5

<class 'torch.Tensor'>
<class 'torch.Tensor'>
epoch 100
Train set - loss: 0.541, accuracy: 0.727
Test  set - loss: 0.564, accuracy: 0.705

<class 'torch.Tensor'>
<class 'torch.Tensor'>
epoch 200
Train set - loss: 0.429, accuracy: 0.815
Test  set - loss: 0.479, accuracy: 0.769

<class 'torch.Tensor'>
<class 'torch.Tensor'>
epoch 300
Train set - loss: 0.31, accuracy: 0.897
Test  set - loss: 0.39, accuracy: 0.844

<class 'torch.Tensor'>
<class 'torch.Tensor'>
epoch 400
Train set - loss: 0.22, accuracy: 0.94
Test  set - loss: 0.326, accuracy: 0.881

<class 'torch.Tensor'>
<class 'torch.Tensor'>
epoch 500
Train set - loss: 0.16, accuracy: 0.96
Test  set - loss: 0.285, accuracy: 0.898

<class 'torch.Tensor'>
<class 'torch.Tensor'>
epoch 600
Train set - loss: 0.119, accuracy: 0.974
Test  set - loss: 0.259, accuracy: 0.909

<class 'torch.Tensor'>
<class 'torch.Te

In [37]:
import matplotlib.pyplot as plt

In [38]:
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score


In [39]:
y_pred =torch.squeeze(net(X_test_new))

In [40]:
auc_lr = roc_auc_score(y_test_new.detach().numpy(),y_pred.detach().numpy())
print(round(auc_lr,2)) #.64

#fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test_new, probs_lr)

0.64


# Evaluation - Revenue Generation

In [41]:
all_prices.value_counts()

30.0    934
25.0    777
35.0     95
28.0     57
38.0     47
40.0     37
50.0     12
45.0      3
Name: 8, dtype: int64

In [42]:
#scale each price point inputted using min max scaler
def scalePrice(X):
    y = (X - minPrice) / (maxPrice - minPrice)
    return y

In [43]:
#inverse transform of scalePrice
def deScale(y):
    x=(y*(maxPrice-minPrice)) + minPrice  
    return x

In [44]:
prices = [*range(25,51)]
scaleprices = scalePrice(np.array(prices))

In [45]:
#outputs the recommended price for each test instance
def choosePrice(current_instance):
    chosenPrice=25
    highestPred =0
    
    
    XX = np.zeros((len(prices),current_instance.shape[1]));
    XX[0:len(prices),:] = current_instance
    XX[0:len(prices),dfOriginalPriceIndex] = scaleprices
    prediction=torch.squeeze(torch.from_numpy(XX).float())
    indx = np.argmax(prediction)
    
    
    return XX[indx,:]

In [46]:
newArray=np.zeros((X_test.shape[0],X_test.shape[1]))

In [47]:
for i in range(X_test.shape[0]):
    newArray[i]=choosePrice(X_test[i].reshape(1,-1))

1962


In [48]:
#store all recommended prices outputted by the pricing model
predictedPrice=[] 
for i in range(X_test.shape[0]):
    price= deScale(newArray[i][dfOriginalPriceIndex])
   # print(price)
    predictedPrice.append(price)

In [49]:
predPrice=pd.Series(predictedPrice,name='predPrice')
predPrice.value_counts()

32.0    836
30.0    784
26.0    293
36.0     23
35.0     21
25.0      3
27.0      2
Name: predPrice, dtype: int64

In [50]:
prices = [*range(25,51)]
#get the number of times a customer purchased a bag at each price
accepts = np.zeros(np.size(prices,0))
counts = np.zeros(np.size(prices,0))
all_prices = round(all_prices)
i=0;
for p in prices:
    counts[i] = np.sum(all_prices==p)
    accepts[i]=np.sum(y_test[all_prices==p])
    i = i+1

In [51]:
def smoothcounts(counts,accepts):
    #the number of test instances
    N = np.sum(counts,0)
    rejects = counts - accepts
    
    #for each price count the number of acceptances for that price & all prices lower
    cuma = np.flip(np.cumsum(np.flip(accepts)))
    cumr = np.cumsum(rejects)
    
    cumm = cuma + cumr
    
    #the number of accepts asociated with a price / number of times it was offered
    rawprob = accepts/(counts+(counts==0))
    print("rawprob is",rawprob)  
    
    alpha = rawprob
   
   
    #taking into account the monotonicity in a customer's willingness to pay
    prob2 = (cuma + (N-cumm)*alpha)/N   
    print("prob2 is",prob2)
    return rawprob,prob2
    

In [52]:
rawprob,prob2=smoothcounts(counts,accepts)

rawprob is [0.08751609 0.         0.         0.05263158 0.         0.21413276
 0.         0.         0.         0.         0.05263158 0.
 0.         0.04255319 0.         0.05405405 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]
prob2 is [0.18611272 0.10805301 0.10805301 0.13452975 0.10652396 0.13446381
 0.00458716 0.00458716 0.00458716 0.00458716 0.01440528 0.00203874
 0.00203874 0.00910925 0.00101937 0.00909166 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


In [53]:
#get the purchasing probability for each price point
priceProbs = dict()
i=0
for p in range(25,51):
    priceProbs[p] = prob2[i]
    i=i+1
priceProbs

{25: 0.18611271822280995,
 26: 0.10805300713557595,
 27: 0.10805300713557595,
 28: 0.13452974945007778,
 29: 0.10652395514780835,
 30: 0.13446380588788698,
 31: 0.0045871559633027525,
 32: 0.0045871559633027525,
 33: 0.0045871559633027525,
 34: 0.0045871559633027525,
 35: 0.014405279253178818,
 36: 0.0020387359836901123,
 37: 0.0020387359836901123,
 38: 0.009109245884572842,
 39: 0.0010193679918450561,
 40: 0.009091660467807258,
 41: 0.0,
 42: 0.0,
 43: 0.0,
 44: 0.0,
 45: 0.0,
 46: 0.0,
 47: 0.0,
 48: 0.0,
 49: 0.0,
 50: 0.0}

In [54]:
#get the revenue generated by the pricing model
revenue = 0
for p in predPrice:
    revenue = revenue + p*priceProbs[p]

In [55]:
revenue

4140.521389069904