#Bundling of Product on the basis of Feature Extracted from the product details and the co-purchases made

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re
from pprint import pprint
import networkx as nx
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
!pip install gdown



In [4]:
!gdown --id 1-MyYRp_5NnmU1JWvl1jV6Dhe3o2HAjdc

Downloading...
From: https://drive.google.com/uc?id=1-MyYRp_5NnmU1JWvl1jV6Dhe3o2HAjdc
To: /content/amazon-meta.txt
100% 978M/978M [00:13<00:00, 71.7MB/s]


In [5]:
!gdown --id 1XwBFgm_vdUTCe6LUkqSSNJanVOpx7OGP

Downloading...
From: https://drive.google.com/uc?id=1XwBFgm_vdUTCe6LUkqSSNJanVOpx7OGP
To: /content/amazon0601.txt
100% 47.9M/47.9M [00:00<00:00, 77.3MB/s]


### Creating a Directed graph from the given edgelist

In [24]:
Graphtype = nx.DiGraph()
filepath = '/content/amazon0601.txt'
G = nx.read_edgelist(
    filepath,
    comments='#',
    create_using=Graphtype,
    nodetype=int
)

#### Reading Metadata

In [25]:
df = pd.read_csv('/content/drive/MyDrive/GNN_datasets/finalpreprocesseddata.csv')

In [26]:
df.head()

Unnamed: 0,ID,ASIN,title,group,salesrank,num_categories,totalreviews,downloadedreviews,avg_rating
0,1,827229534,Patterns of Preaching,Book,396585,2,2,2,5.0
1,2,738700797,Candlemas,Book,168596,2,12,12,4.5
2,3,486287785,World War II Allied Fighter Planes Trading Cards,Book,1270652,1,1,1,5.0
3,4,842328327,Life Application Bible Commentary,Book,631289,5,1,1,4.0
4,5,1577943082,Prayers That Avail Much for Business,Book,455160,2,0,0,0.0


##### Checking and Dropping Null Values

In [27]:
df.isnull().sum()
df = df.dropna()

In [28]:
df['group'].value_counts()

Book            393559
Music           103144
Video            26131
DVD              19828
Toy                  8
Software             5
CE                   4
Video Games          1
Baby Product         1
Sports               1
Name: group, dtype: int64

#####Indexing the graph on the attribute 'ID'

In [29]:
df = df.set_index('ID')

#### Selecting Attributes that each nodes will have

In [30]:
nodeattrdf = df[['ASIN','title','group', 'num_categories','salesrank', 'totalreviews', 'downloadedreviews', 'avg_rating']].copy()

#### Combining the transaction data and the metadata into a networkX Directed Graph

In [31]:
nx.set_node_attributes(G, nodeattrdf.to_dict('index'))

In [32]:
G.nodes[8]

{'ASIN': '0231118597',
 'title': 'Losing Matt Shepard',
 'group': 'Book',
 'num_categories': 4,
 'salesrank': 277409,
 'totalreviews': 15,
 'downloadedreviews': 15,
 'avg_rating': 4.5}

####**Empty nodes/Disjoint nodes do not contribute significantly when we recommended products.So we have removed them**

In [33]:
remove = [nodes for nodes in dict(G.nodes()) if len(G.nodes[nodes]) == 0]

In [34]:
G.remove_nodes_from(remove)

Here, n is the array of all non-deleted nodes in the graph

In [35]:
n = list(G.nodes)

In [36]:
n =  np.array(n)
n = np.unique(n)

##### Helper function to retrive all target nodes from the edges starting from the source

In [37]:
def getclean(pro_id):
    l = str(list(G.edges(pro_id)))
    l = l.replace('[','')
    l = l.replace(']','')
    l = l.replace(',','')
    l = l.replace('(','')
    l = l.replace(')','')
    l = l.replace(str(pro_id)+" ", '')
    b = l.split()
    b = np.array(b)
    b = b.astype(int)
    return b

In [38]:
G.edges(5)

OutEdgeDataView([(5, 6), (5, 44), (5, 46), (5, 47), (5, 48), (5, 49), (5, 50), (5, 51), (5, 52), (5, 53)])

In [39]:
getclean(5)

array([ 6, 44, 46, 47, 48, 49, 50, 51, 52, 53])

### Adding edge weights

In [40]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

def gethighestjaccard(pro_id, finalresult):
  jaccdict = {}
  for i in range(0,len(finalresult)):
    if(pro_id != i):
      tempneighbours = []
      resarray = getclean(finalresult[i])
      jaccdict[finalresult[i]] = jaccard_similarity(finalresult, resarray)
  return jaccdict

for i in n:
  pro_id = i
  finalresult = []
  resultarray = np.unique(getclean(pro_id))
  for i in range(1, len(resultarray)):
      if(resultarray[i] in n):
          finalresult.append(resultarray[i])

  finaldictjaccard = gethighestjaccard(pro_id, finalresult)
  finaldictjaccard = dict(sorted(finaldictjaccard.items(), key=lambda item: item[1], reverse = True))

  for key in finaldictjaccard.keys():
    G.add_edge(pro_id, key, weight= finaldictjaccard[key])


### Saving graph into json file

In [139]:
from networkx.readwrite import json_graph
import json

graph_json = json_graph.node_link_data(G)

with open("/content/graph_with_weights.json", "w") as outfile:
    json.dump(graph_json, outfile, default = str)

### Non parameterized_beam_search

In [130]:
import networkx as nx

def beam_search(graph, start_node, beam_width, max_depth ):
    # Initialize the beam with the starting node and its weight as a path
    beam = [([start_node], 0)]
    best_paths = []

    # Perform beam search until the maximum depth is reached
    for depth in range(max_depth):
        next_beam = []
        for path, path_weight in beam:
            current_node = path[-1]
            # Get the outgoing edges and their weights for the current node
            outgoing_edges = graph.out_edges(current_node, data=True)
            for edge in outgoing_edges:
                if len(edge[2]) == 0:
                  continue
                to_node, edge_weight = edge[1], edge[2]['weight']
                new_path = path + [to_node]
                new_weight = path_weight + edge_weight
                next_beam.append((new_path, new_weight))

        # Sort the next_beam based on path weight and select the top beam_width paths
        next_beam.sort(key=lambda x: x[1])
        beam = next_beam[:beam_width]

        best_paths.extend(beam)

    return [path[0] for path in best_paths[-5:]]

In [136]:
beam_search(G, 1, 5, 5)

[[1, 185, 239, 186, 49, '71210'],
 [1, 185, 239, 186, 2516, 7606],
 [1, 185, 239, 186, 2516, 9867],
 [1, 185, 239, 186, 2516, 25429],
 [1, 185, 239, 186, 2518, 49]]