In [33]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import itertools
import sys
sys.path.insert(0, '..')
import fashion.preprocessing as prep
# display all columns
pd.set_option('display.max_columns', None)

# show all results of the notebook not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import networkx as nx
from node2vec import Node2Vec

from gensim.models import Word2Vec

import warnings
warnings.filterwarnings('ignore')

# This first set of cells generates a graph. Hyperparameters that need to be in main() are in the cell below.

In [58]:
# Hyperparameters for the graph-building stage.
weeks = 2  # how many weekly sales rows are kept per (Product, StoreKey) pair
edgelist_name = 'list_of_edges.csv'  # file the weighted edgelist is written to

# Raw inputs; the sales loader receives the shops table as its second argument.
shops = prep.load_shops('../data/20200120_filiali.csv', extra_info=False)
sales = prep.load_sales('../data/20200120_sales17.csv', shops)


In [59]:
print('adding product and week columns to data')

# Product id = the EAN string with its last three characters dropped.
sales['Product'] = sales.EAN.astype(str).str[:-3]

# Week-of-year of every sale date. `DatetimeIndex.week` was deprecated in
# pandas 1.1 and removed in pandas 2.0, so use `isocalendar().week` when it
# exists and only fall back to the old attribute on older pandas.
sale_dates = pd.DatetimeIndex(sales.Date.astype(str))
if hasattr(sale_dates, 'isocalendar'):
    # `.array` strips the DatetimeIndex so the assignment is positional
    # instead of being aligned against the index of `sales`.
    sales['Week'] = sale_dates.isocalendar().week.array
else:
    sales['Week'] = sale_dates.week

# NOTE(review): the week number carries no year, so identical week numbers
# from different years would be pooled by the later groupby on 'Week' -
# confirm the sales file covers a single year.

print('done')

adding product and week columns to data
done


In [62]:
print('generating product-2-store edgelist')

# Total sales volume per (Product, StoreKey, Week).
Sales_per_store_sums = sales.groupby(['Product', 'StoreKey', 'Week'])['Volume'].sum()

# Keep the first `weeks` weekly rows of every (Product, StoreKey) pair; the
# groupby above sorts its keys, so these are the lowest week numbers present.
first_n_weeks = Sales_per_store_sums.groupby(['Product', 'StoreKey']).head(weeks)

# Sum the kept weeks per pair. This line used to reference the undefined name
# `first_two_weeks`, which raises NameError on a fresh kernel.
total_sales = first_n_weeks.groupby(['Product', 'StoreKey']).sum()

# Divide by the largest pair total so the biggest edge weight becomes 1
# (assumes that maximum is positive).
max_val = total_sales.max()

# Flatten to one row per edge: Product, StoreKey, Volume (the scaled weight).
Product2store_graph = total_sales.reset_index()
Product2store_graph['Volume'] = Product2store_graph['Volume'] / max_val

print('done')

generating product-2-store edgelist
done


In [76]:
print('saving product-2-store edgelist')

# Written without header or index, so each CSV row is just the three columns
# of Product2store_graph for one edge.
Product2store_graph.to_csv(str(edgelist_name), index=False, header=False)

print('done')

saving product-2-store edgelist
done


# This second set of cells turns the edgelist into embeddings. This should be a separate Python file! Hyperparameters that need to be in main() are in the cell below.

In [92]:
# Hyperparameters for the embedding stage.
edgelist_name = 'list_of_edges.csv'       # edgelist file the graph is read from
walk_length = 30                          # transitions taken in each random walk
vector_length = 10                        # length of every output embedding vector
no_epochs = 20                            # epochs of skip-gram training
model_name = 'DeepWalk_embeddings.model'  # path the trained model is saved under

In [78]:

# Create a graph
print('loading graph from edgelist')

# Each CSV row is "node,node,weight" and node ids are kept as strings.
# The context manager closes the file handle; the previous bare open() was
# never closed.
with open(edgelist_name, 'r') as fh:
    graph = nx.read_weighted_edgelist(fh, delimiter=',', nodetype=str)

# NOTE(review): both columns share one string namespace, so a value appearing
# in both columns would become a single node - confirm the ids cannot collide.

print('done')


loading graph from edgelist
done


In [80]:
# Create an adjacency matrix

print('creating adjacency matrix')

A = nx.adjacency_matrix(graph)     # sparse weighted adjacency matrix of the graph
matrix_lengths = A.shape[0]        # number of rows, i.e. one per node

A_array = A.toarray()              # dense copy of the matrix
A_cumsum = A_array.cumsum(axis=1)  # running sum of the weights along every row

print('done')

creating adjacency matrix
done


In [81]:
print('creating dictionaries for row probabilities and matrix identities')

# Total weight of every row, taken from the last column of the running sums.
# `transition` compares its random draw against A_cumsum, so using the very
# same numbers guarantees a draw below the total always hits some column; it
# also avoids slicing the sparse matrix once per row in a Python loop.
row_total_probabilities = list(A_cumsum[:, -1]) if matrix_lengths else []
items = list(range(matrix_lengths))

# matrix row index -> total edge weight of that row
probability_dictionary = dict(zip(items, row_total_probabilities))

# matrix row index -> node label, in graph.nodes() order
prod_store_dict = dict(enumerate(graph.nodes()))
    

def transition(x):
    """Take one weighted random step away from matrix row ``x``.

    Reads two module-level objects: ``probability_dictionary`` (row index ->
    total weight of that row) and ``A_cumsum`` (row-wise running sums of the
    adjacency matrix).

    Returns ``x`` unchanged when the row's total weight is 0. Otherwise draws
    a value uniformly from [0, total) and returns the index of the first
    column whose running sum is greater than the draw.
    """
    row_total = probability_dictionary[x]
    if row_total == 0:
        # No weight to follow: the walk stays where it is.
        return x

    draw = np.random.uniform(high=row_total)
    return np.flatnonzero(A_cumsum[x] > draw)[0]

print('done')


creating dictionaries for row probabilities and matrix identities
done


In [83]:
# Every node starts `walks_per_node` walks; each walk takes `walk_length` steps.

print('generating random walks - may take a few minutes if long walks are chosen')

walks_per_node = 2  # was hard-coded by concatenating range(matrix_lengths) twice

# Row 0 holds the start nodes and row k the node reached after k steps.
# The array is allocated once instead of being re-copied by np.vstack on every
# step, and it stays 2-D even when walk_length is 0 (the old code then tried
# to call `.T` on a plain list).
start_nodes = list(range(matrix_lengths)) * walks_per_node
random_walks = np.empty((walk_length + 1, len(start_nodes)), dtype=np.int64)
random_walks[0] = start_nodes

# NOTE(review): `transition` draws from NumPy's global RNG, which this cell
# does not seed, so the walks differ from run to run.
for step in range(1, walk_length + 1):
    random_walks[step] = [transition(node) for node in random_walks[step - 1]]

# One walk per row, as plain Python ints ...
walk_list = random_walks.T.tolist()

# ... then translated from matrix indices to node labels for Word2Vec.
str_walk_list = [[prod_store_dict[idx] for idx in walk] for walk in walk_list]

print('done')

generating random walks - may take a few minutes if long walks are chosen
done


In [85]:
# Train a skip-gram Word2Vec model on the random walks

print('running word2vec')

# sg=1 selects skip-gram; hs=0 together with negative=10 selects negative sampling.
# min_count=1 keeps every node in the vocabulary: gensim's default of 5 drops
# any token seen fewer than 5 times, and a node is only guaranteed to appear
# as the start of its own walks.
# NOTE(review): `size` is the gensim 3.x keyword (4.x renamed it to
# `vector_size`); `seed` alone does not make training deterministic unless
# the model is also limited to a single worker thread.
model = Word2Vec(size = vector_length, window = 4, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 min_count = 1,
                 seed = 14)

# Scan the walks once to build the vocabulary ...
model.build_vocab(str_walk_list, progress_per=2)

# ... then train for `no_epochs` passes over the same walks.
model.train(str_walk_list, total_examples = model.corpus_count, epochs=no_epochs, report_delay=1)

print('done')

running word2vec


(11279972, 11431560)

done


In [93]:
print('saving model')

# Persist the trained model under the configured file name.
model.save(str(model_name))

print('done')


saving model
done


In [89]:
# Load the model back from the file it was saved to.
test = Word2Vec.load(model_name)

In [91]:
# Nearest neighbours of one node label by embedding similarity. Query through
# `.wv`: calling most_similar directly on the model is deprecated in gensim 3
# and removed in gensim 4, while `.wv.most_similar` works in both.
test.wv.most_similar('2025443001')

[('2089434001', 0.9555003046989441),
 ('2087127002', 0.9542059898376465),
 ('2088273004', 0.9537729620933533),
 ('2087847002', 0.949905514717102),
 ('2088979001', 0.9473111033439636),
 ('2094439002', 0.9453172087669373),
 ('2090127001', 0.9433228969573975),
 ('2094639001', 0.9402214288711548),
 ('2089429002', 0.9385861754417419),
 ('2095030001', 0.9385299682617188)]