In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Importing Important Libraries
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.neighbors import NearestNeighbors

In [3]:
# Loading DataSet
df_train=pd.read_csv("/content/drive/MyDrive/Minor Project 2/Instacart_history (1).csv")

In [6]:
# Mapping values
def encode(df,i):
  item_unique = df[i].unique().tolist()
  item_map = dict(zip(item_unique, range(1,len(item_unique) + 1)))
  item_map[-1] = 0
  df[i] = df[i].apply(lambda x: item_map[x])

In [7]:
# Data Preprocessing

# Renaming Columns
df_train.rename(columns = {'ORDER_NUMBER':'TRANSACTION_DT','MATERIAL_NUMBER':'PRODUCT_ID'}, inplace = True)
# Add "AMOUNT" Column
df_train['AMOUNT']=1
# Sorting Dataset with 'CUSTOMER_ID','TRANSACTION_DT'
df_train=df_train.sort_values(by=['CUSTOMER_ID','TRANSACTION_DT'])
# Storing customer and product count
customer=df_train['CUSTOMER_ID'].unique()
product=df_train['PRODUCT_ID'].unique()
# Selecting Users with 40 Baskets
for i in customer:
  if df_train[df_train['CUSTOMER_ID']==i]['TRANSACTION_DT'].unique().size<40:
    df_train=df_train.drop(df_train[df_train['CUSTOMER_ID'] == i].index)
# Grouring dataset in order 'CUSTOMER_ID', 'TRANSACTION_DT', 'PRODUCT_ID'
grouped_df=df_train.groupby(['CUSTOMER_ID', 'TRANSACTION_DT', 'PRODUCT_ID'])['AMOUNT'].sum().reset_index()
# Pivoting Dataset
pivot_df = grouped_df.pivot(index=['CUSTOMER_ID', 'TRANSACTION_DT'], columns='PRODUCT_ID', values='AMOUNT').fillna(0)
# Resting Index
pivot_df.reset_index(inplace=True)
pivot_df.columns.name = None
pivot_df

Unnamed: 0,CUSTOMER_ID,TRANSACTION_DT,1,2,3,4,5,6,7,8,...,7984,7986,7989,7991,7992,7993,7994,7995,7998,7999
0,27,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,27,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,27,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13922,19935,46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13923,19935,47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13924,19935,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13925,19935,49,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Again Storing Customer and Product count
customer=df_train['CUSTOMER_ID'].unique()
product=df_train['PRODUCT_ID'].unique()

In [9]:
# Dividing Dataset for Training Testing and Validation
train_basket=[]     # Training basket
validate_basket=[]  # Validation basket
test_basket=[]      # Testing basket
actual_basket=[]    # Actual basket
train_lstm=[]       # Training Basket for LSTM
customer=pivot_df['CUSTOMER_ID'].unique()
for i in pivot_df['CUSTOMER_ID'].unique():
  train=[]
  validate=[]
  test=[]
  actual=[]
  lstm=[]
  df=pivot_df[pivot_df['CUSTOMER_ID']==i]
  count=0
  size=df['TRANSACTION_DT'].unique().size
  date=df['TRANSACTION_DT'].unique()
  for j in df['TRANSACTION_DT'].unique():
    if count<size-2:
      df3=df[df['TRANSACTION_DT']==j]
      df3.drop(['CUSTOMER_ID','TRANSACTION_DT'], inplace=True, axis=1)
      arr=df3.to_numpy()
      arr=arr.tolist()
      train.append(arr[0])
      lstm.append(arr[0])

    if count>=1 and count<size-1:
      df3=df[df['TRANSACTION_DT']==j]
      df3.drop(['CUSTOMER_ID','TRANSACTION_DT'], inplace=True, axis=1)
      arr=df3.to_numpy()
      arr=arr.tolist()
      validate.append(arr[0])
      lstm.append(arr[0])

    if count==size-2:
      df3=df[df['TRANSACTION_DT']==j]
      df3.drop(['CUSTOMER_ID','TRANSACTION_DT'], inplace=True, axis=1)
      arr=df3.to_numpy()
      arr=arr.tolist()
      test.append(arr[0])
      lstm.append(arr[0])

    if count==size-1:
      df3=df[df['TRANSACTION_DT']==j]
      df3.drop(['CUSTOMER_ID','TRANSACTION_DT'], inplace=True, axis=1)
      arr=df3.to_numpy()
      arr=arr.tolist()
      actual.append(arr[0])

    count+=1
  validate_basket.append(validate)
  train_basket.append(train)
  test_basket.append(test)
  actual_basket.append(actual)
  train_lstm.append(lstm)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(['CUSTOMER_ID','TRANSACTION_DT'], inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(['CUSTOMER_ID','TRANSACTION_DT'], inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(['CUSTOMER_ID','TRANSACTION_DT'], inplace=True, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pan

In [12]:
# Padding Dataset with zeros in front
max_len = max(len(sublist) for sublist in train_basket)
print(max_len)
padded_data = [sublist + [[0.0] * len(sublist[0])] * (max_len - len(sublist)) for sublist in train_basket]
train_tensor = torch.tensor(padded_data, dtype=torch.float32)
max_len = max(len(sublist) for sublist in validate_basket)
print(max_len)
padded_data = [sublist + [[0.0] * len(sublist[0])] * (max_len - len(sublist)) for sublist in validate_basket]
validate_tensor = torch.tensor(padded_data, dtype=torch.float32)
test_tensor = torch.tensor(test_basket, dtype=torch.float32)
actual_tensor = torch.tensor(actual_basket, dtype=torch.float32)
max_len_lstm = max(len(sublist) for sublist in train_lstm)
print(max_len_lstm)
padded_data_lstm = [sublist + [[0.0] * len(sublist[0])] * (max_len_lstm - len(sublist)) for sublist in train_lstm]
train_tensor_lstm = torch.tensor(padded_data_lstm, dtype=torch.float32)

48
48
97


# **1st Model UGP using FFNN**

In [None]:
########################### 1st Model UGP using FFNN #####################################################

In [13]:
# Model Initialization
model = nn.Sequential(
    nn.Linear(6204, 128),
    nn.ReLU(),
    nn.Linear(128, 256),
    nn.ReLU(),
    nn.Linear(256, 6204),
    nn.Softmax())
# Loss Function and Optimizer Selection
loss_fn =  nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Model Training
n_epochs = 25
batch_size = 10
X=train_tensor
y=validate_tensor
for epoch in range(n_epochs):
    for i in range(0, len(X), batch_size):
        Xbatch = X[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = y[i:i+batch_size]
        loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}')
# Model Prediction
y_pred = model(test_tensor)
y_pred

  return self._call_impl(*args, **kwargs)


Finished epoch 0, latest loss 0.13615328073501587
Finished epoch 1, latest loss 0.13609157502651215
Finished epoch 2, latest loss 0.13600972294807434
Finished epoch 3, latest loss 0.1359211653470993
Finished epoch 4, latest loss 0.1358112394809723
Finished epoch 5, latest loss 0.13567321002483368
Finished epoch 6, latest loss 0.13554027676582336
Finished epoch 7, latest loss 0.13543175160884857
Finished epoch 8, latest loss 0.13536755740642548
Finished epoch 9, latest loss 0.13525913655757904
Finished epoch 10, latest loss 0.1351812332868576
Finished epoch 11, latest loss 0.13509437441825867
Finished epoch 12, latest loss 0.1350305825471878
Finished epoch 13, latest loss 0.13496622443199158
Finished epoch 14, latest loss 0.1349150389432907
Finished epoch 15, latest loss 0.1348605453968048
Finished epoch 16, latest loss 0.1348172128200531
Finished epoch 17, latest loss 0.1347731202840805
Finished epoch 18, latest loss 0.1347363293170929
Finished epoch 19, latest loss 0.1346970498561859


tensor([[[0.0021, 0.0053, 0.0029,  ..., 0.0048, 0.0026, 0.0021]],

        [[0.0023, 0.0015, 0.0022,  ..., 0.0030, 0.0033, 0.0024]],

        [[0.0036, 0.0016, 0.0042,  ..., 0.0032, 0.0031, 0.0039]],

        ...,

        [[0.0019, 0.0030, 0.0028,  ..., 0.0038, 0.0030, 0.0043]],

        [[0.0024, 0.0034, 0.0029,  ..., 0.0035, 0.0031, 0.0037]],

        [[0.0007, 0.0014, 0.0039,  ..., 0.0029, 0.0043, 0.0028]]],
       grad_fn=<SoftmaxBackward0>)

In [14]:
# Converion from 3d to 2d
y_pred_2d = y_pred.view(298, 6204)
y_pred_2d

tensor([[0.0021, 0.0053, 0.0029,  ..., 0.0048, 0.0026, 0.0021],
        [0.0023, 0.0015, 0.0022,  ..., 0.0030, 0.0033, 0.0024],
        [0.0036, 0.0016, 0.0042,  ..., 0.0032, 0.0031, 0.0039],
        ...,
        [0.0019, 0.0030, 0.0028,  ..., 0.0038, 0.0030, 0.0043],
        [0.0024, 0.0034, 0.0029,  ..., 0.0035, 0.0031, 0.0037],
        [0.0007, 0.0014, 0.0039,  ..., 0.0029, 0.0043, 0.0028]],
       grad_fn=<ViewBackward0>)

In [15]:
def find_recall(k):
  top_recommendation=[]
  for i in range(len(actual_basket)):
    top_recommendation.append(y_pred[i][0].topk(k,dim=0).indices)

  rec=0
  for i in range(len(actual_basket)):
    r=0
    x=0
    for j in range(k):
      if(actual_basket[i][0][top_recommendation[i][j]]==1):
        r+=1
    for j in range(len(actual_basket[i][0])):
      if(actual_basket[i][0][j]==1):
        x+=1
    rec=rec+r/x

  recall=rec/len(actual_basket)
  return recall

In [16]:
def find_precision(k):
  top_recommendation=[]
  for i in range(len(actual_basket)):
    top_recommendation.append(y_pred[i][0].topk(k,dim=0).indices)

  prec=0
  for i in range(len(actual_basket)):
    r=0
    for j in range(k):
      if(actual_basket[i][0][top_recommendation[i][j]]==1):
        r+=1
    prec=prec+r/k

  precision=prec/len(actual_basket)
  return precision

In [17]:
print(find_recall(5))
print(find_recall(10))
print(find_recall(15))

0.1738423122575919
0.24121326743309185
0.29826821727862973


In [18]:
print(find_precision(5))
print(find_precision(10))
print(find_precision(15))

0.25436241610738275
0.1872483221476512
0.15861297539149907


In [19]:
#F1 Score
print((2*find_precision(5)*find_recall(5))/(find_recall(5)+find_precision(5)))
print((2*find_precision(10)*find_recall(10))/(find_recall(10)+find_precision(10)))
print((2*find_precision(15)*find_recall(15))/(find_recall(15)+find_precision(15)))

0.20653181825608294
0.21083233925727432
0.2070963312400524


# **2nd Model using LSTM**

In [None]:
 ########################################################### 2nd Model LSTM     ####################################################################

In [20]:
#normal lstm

# Check if GPU is available, and set the device accordingly
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming you have your train data loaded as a tensor
# Ensure that train_data has the shape (num_users, sequence_length, input_features)
train_data = train_tensor_lstm

# Define the batch size and other constants
batch_size = 8  # Adjust as needed
sequence_length = train_tensor.shape[1]
input_features = train_tensor.shape[2]
output_features = train_tensor.shape[2]  # Number of features in the predicted sequence (next sequence)

# Create DataLoader for training data
train_dataset = torch.utils.data.TensorDataset(train_data)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define the LSTM model with the correct input and output size
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        output = self.linear(lstm_out)
        output = torch.softmax(output,dim=1)
        #print(output)
        return output

# Initialize the model and move it to the GPU
model_LSTM = LSTMModel(input_features, 64, 2, output_features)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20  # Adjust as needed
for epoch in range(num_epochs):
    for batch in train_loader:
        # Move the batch to the GPU
        batch = batch[0]  # Unwrap the batch from the DataLoader

        # Split the input and target sequences
        input_seq = batch[:, :-1, :]  # Input sequence (exclude the last time step)
        target_seq = batch[:, 1:, :]   # Target sequence (exclude the first time step)

        # Forward pass
        train_predictions = model_LSTM(input_seq)

        # Compute loss for this batch
        loss = criterion(train_predictions, target_seq)  # Predicting the next sequence

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss for monitoring at the end of each epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')
# Model Prediction
lstm_res=model_LSTM(test_tensor)

Epoch [1/20], Loss: 0.0012823597062379122
Epoch [2/20], Loss: 0.0019334123935550451
Epoch [3/20], Loss: 0.0014484233688563108
Epoch [4/20], Loss: 0.0010341046145185828
Epoch [5/20], Loss: 0.001958054257556796
Epoch [6/20], Loss: 0.001550351153127849
Epoch [7/20], Loss: 0.0008697062148712575
Epoch [8/20], Loss: 0.0011130458442494273
Epoch [9/20], Loss: 0.0017739285249263048
Epoch [10/20], Loss: 0.0015963895712047815
Epoch [11/20], Loss: 0.0009847863111644983
Epoch [12/20], Loss: 0.0015651537105441093
Epoch [13/20], Loss: 0.0021635624580085278
Epoch [14/20], Loss: 0.0010522030061110854
Epoch [15/20], Loss: 0.0010176688665524125
Epoch [16/20], Loss: 0.0016802283935248852
Epoch [17/20], Loss: 0.0015930919907987118
Epoch [18/20], Loss: 0.0013152408646419644
Epoch [19/20], Loss: 0.0016227171290665865
Epoch [20/20], Loss: 0.0013859329046681523


In [21]:
# Conversion from 3d to 2d
lstm_res_2d = lstm_res.view(298, 6204)

# **3rd Model using KNN**

In [22]:
############################################################# 3rd Model KNN ########################################################################

In [23]:
# Finding K Nearest Neighbours

num_nearest_neighbors = 30

nn_model = NearestNeighbors(n_neighbors = num_nearest_neighbors, metric = 'cosine')

u=y_pred_2d

u = u.detach().numpy()
nn_model.fit(u)

nearest_neighbors_indices = nn_model.kneighbors(u, return_distance=False)

In [24]:
# Mean Polling
knn=[]

for i in range(0,len(customer)):
  temp = [0]*len(product)
  for j in range (0, len(product)):
    for k in range(0,30):
      # print(nearest_neighbors_indices[i][k])
      x=float(y_pred_2d[nearest_neighbors_indices[i][k]][j])
      temp[j]=temp[j]+x
    temp[j]=temp[j]/20
  knn.append(temp)



In [25]:
# Converting to Numpy
y_pred_2d_numpy=y_pred_2d.detach().cpu().numpy()
lstm_res_2d_numpy=lstm_res_2d.detach().cpu().numpy()
knn_numpy=np.array(knn)
actual_basket_numpy=np.array(actual_basket)

# Conversion from 3d to 2d
actual_basket_numpy_2d = actual_basket_numpy.reshape(298,6204)

In [26]:
def find_recall_final(k):
  top_recommendation=[]
  final_prediction_tensor = torch.from_numpy(final_prediction)
  for i in range(len(actual_basket_numpy_2d)):
    top_recommendation.append(final_prediction_tensor[i].topk(k,dim=0).indices)

  rec=0
  for i in range(len(actual_basket_numpy_2d)):
    r=0
    x=0
    for j in range(k):
      if(actual_basket_numpy_2d[i][top_recommendation[i][j]]==1):
        r+=1
    for j in range(len(actual_basket_numpy_2d[i])):
      if(actual_basket_numpy_2d[i][j]==1):
        x+=1
    rec=rec+r/x

  recall=rec/len(actual_basket_numpy_2d)
  return recall

# **Final Evaluation**

In [27]:
# HyperParameter Tuning for alpha
for i in range(1,10):
    alpha = i/10
    final_prediction = alpha*(y_pred_2d_numpy + lstm_res_2d_numpy) + (1-alpha) * (knn_numpy)
    print(alpha)
    print(find_recall_final(5))
    print(find_recall_final(10))
    print(find_recall_final(15))
    print(find_recall_final(20))

0.1
0.15334399925768108
0.21306751924666661
0.25502451513915403
0.2889102368790457
0.2
0.16869532357512626
0.22349728048810516
0.273219656044478
0.30317812948223505
0.3
0.16935878806606808
0.22918506353408266
0.2798036360735517
0.3136488917942196
0.4
0.17143867127011236
0.23469948293273024
0.2830774454471663
0.3191262005881796
0.5
0.17104975183756882
0.23722503810090284
0.2861007038814986
0.3250705452070208
0.6
0.17527644616204144
0.23820251420421784
0.2894016069307237
0.3285241246253653
0.7
0.1752760595601177
0.23794405715045874
0.2954398237866728
0.3302358795640932
0.8
0.17441050080865286
0.2398216538267132
0.29672231724279685
0.3310732077839717
0.9
0.17451731032902595
0.2384197149751099
0.29692593539943507
0.33235157147844285
