Copyright (C) 2021 Textualization Software Ltd. Distributed under the terms of the [Apache Software License 2.0](http://www.apache.org/licenses/LICENSE-2.0).
---

<a href="https://colab.research.google.com/github/Textualization/riiaa21_ws11_ml_over_encrypted_data/blob/main/notebooks/4_Palisade_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



In [1]:
!rm -rf riiaa21_ws11_ml_over_encrypted_data
!git clone https://github.com/Textualization/riiaa21_ws11_ml_over_encrypted_data

Cloning into 'riiaa21_ws11_ml_over_encrypted_data'...
remote: Enumerating objects: 83, done.[K
remote: Counting objects: 100% (83/83), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 83 (delta 41), reused 63 (delta 23), pack-reused 0[K
Unpacking objects: 100% (83/83), done.


In [2]:
!cp /content/riiaa21_ws11_ml_over_encrypted_data/dependencies/palisade-install.tar.gz /content; cd /content; tar -xzf palisade-install.tar.gz
!cd /content/riiaa21_ws11_ml_over_encrypted_data/data; tar -xzf synthetic-invoices.tar.gz

In [3]:
!mkdir -p /content/riiaa21_ws11_ml_over_encrypted_data/build/lib
!cd /content/riiaa21_ws11_ml_over_encrypted_data; c++ -fopenmp -fPIC -std=gnu++11 `python3.7-config --cflags` `python3.7-config --includes` -I/content/palisade-install/include/palisade{,/core,/pke,/third-party/inclue} -o build/lib/ckks_wrapper.o -c src/ckks_wrapper.cpp 
!cd /content/riiaa21_ws11_ml_over_encrypted_data; c++ -fopenmp -fPIC -std=gnu++11 `python3.7-config --cflags` `python3.7-config --includes` -I/content/palisade-install/include/palisade{,/core,/pke,/third-party/inclue} -o build/lib/pycrypto.o -c src/pycrypto.cpp 
!cd /content/riiaa21_ws11_ml_over_encrypted_data; /usr/bin/c++ -fPIC  -Wall -Werror -O3  `python3.7-config --ldflags` -DPALISADE_VERSION=1.11.4 -Wno-parentheses -fopenmp -shared -Wl,-soname,pycrypto.so.1 -lpython3.7m -o build/lib/pycrypto.so.1 build/lib/ckks_wrapper.o build/lib/pycrypto.o   -L/content/palisade-install/lib  -Wl,-rpath,/content/palisade-install/lib:  /content/palisade-install/lib/libPALISADEcore.so.1 /content/palisade-install/lib/libPALISADEbinfhe.so.1 /content/palisade-install/lib/libPALISADEpke.so.1  /usr/lib/x86_64-linux-gnu/libpython3.7m.so /usr/lib/x86_64-linux-gnu/libboost_python3-py36.so 
!cd /content/riiaa21_ws11_ml_over_encrypted_data/build/lib; ln -s pycrypto.so.1 pycrypto.so

In [4]:
import sys
sys.path.append('/content/riiaa21_ws11_ml_over_encrypted_data/build/lib')

import pycrypto

In [5]:
import os
import os.path
import random
import math

import xml.etree.ElementTree as ET

import numpy as np
import timeit

In [11]:
DATA_DIR='/content/riiaa21_ws11_ml_over_encrypted_data/data/invoices'

matrix           = list() # rows = [ rfc, pref dict id->count ]
clients          = dict() # rfc -> [ row in matrix, name ]
clients_by_row   = list() # row -> [ rfc, name ]
products         = dict() # id -> [ col in matrix, name ]
products_by_col  = list() # col -> [ id, name ]

for invoice in os.listdir(DATA_DIR):
    if '.xml' not in invoice:
        continue
    tree = ET.parse(os.path.join(DATA_DIR, invoice))
    root = tree.getroot()
    rfc = root[1].attrib['Rfc'] # Receptor
    if rfc not in clients:
        # new row
        # FOR PRESENTATION
        if len(matrix) > 400:
          break
        clients[rfc] = [ len(matrix), root[1].attrib['Nombre'] ]
        matrix.append( [ rfc, dict() ] )
        clients_by_row.append( [ rfc, root[1].attrib['Nombre'] ] )
    row = clients[rfc][0]

    for concept in root[2]:
        _id = concept.attrib['NoIdentificacion']
        name = concept.attrib['Descripcion']

        if _id not in products:
            # new col
            products[_id] = [ len(products), name ]
            products_by_col.append( [ _id, name ] )
        col = products[_id][0]

        matrix[row][1][col] = matrix[row][1].get(col, 0) + 1

print("Loaded ", len(clients), " clients and ", len(products), " products")

Loaded  401  clients and  4771  products


In [12]:
m = np.zeros((len(clients), len(products)))

for r in range(len(matrix)):
    for c, v in matrix[r][1].items():
        m[r][c] = v

random.seed(210826)

#chosen = random.choice(list(clients.keys()))
chosen = list(clients.keys())[67]
chosen_row = clients[chosen][0]

print("Recommendations for", chosen, clients[chosen][1])

products_for_client = list(matrix[clients[chosen][0]][1].items())
random.shuffle(products_for_client)
print("Sample from", len(products_for_client),"products purchased")
for pair in products_for_client[:20]:
    print("\t", products_by_col[pair[0]][1], 'purchased', pair[1], 'times')

# similarity to each client using raw inner product as we cannot take sqrt

client_simil = np.ndarray((len(clients),1))

minus_row = -1 * m[chosen_row]

for idx in range(len(clients)):
    #client_simil[idx] = math.sqrt(np.dot(m[idx], m[chosen_row]))
    diff = m[idx] + minus_row
    client_simil[idx] = np.sum(diff*diff)

print("Top similarities:")
print(client_simil[:10])

recos = np.zeros((len(products),))

for idx in range(len(clients)):
    weight = client_simil[idx]
    row = m[idx][:]
    recos = recos + row * weight
    #if idx % 100 == 0:
    #  print("{} {:,} {:,} {:,}".format(idx, recos[0], recos[idx], recos[8200]))

recos_indexed = sorted(list(map(lambda p: [ p[1], p[0] ], enumerate(recos))))

print("Recommended:")
printed = 0
for score, col in reversed(recos_indexed):
    if m[chosen_row][col] > 0:
        continue
    print("\t({}) {} score={:,}".format(col, products_by_col[col][1], score))
    printed += 1
    if printed > 10:
        break
old_recos = recos
old_recos_indexed = recos_indexed

Recommendations for SAH9306021Q5 SERVICIOS Y ABASTECIMIENTO HIDRAULICO SA DE CV
Sample from 13 products purchased
	 Lord of the Rings: The Two Towers, The (2002) purchased 1 times
	 Shawshank Redemption, The (1994) purchased 1 times
	 Saving Private Ryan (1998) purchased 1 times
	 Forrest Gump (1994) purchased 1 times
	 Schindler's List (1993) purchased 1 times
	 Lord of the Rings: The Fellowship of the Ring, The (2001) purchased 1 times
	 Matrix, The (1999) purchased 1 times
	 Terminator 2: Judgment Day (1991) purchased 1 times
	 Incredibles, The (2004) purchased 1 times
	 Samsara (2011) purchased 1 times
	 Toy Story 3 (2010) purchased 1 times
	 Terminator, The (1984) purchased 1 times
	 Indiana Jones and the Last Crusade (1989) purchased 1 times
Top similarities:
[[218.]
 [259.]
 [ 53.]
 [202.]
 [257.]
 [106.]
 [ 42.]
 [104.]
 [ 84.]
 [206.]]
Recommended:
	(21) Pulp Fiction (1994) score=18,796.0
	(91) Star Wars: Episode IV - A New Hope (1977) score=16,201.0
	(213) Silence of the Lamb

In [13]:
def next_power_of_2(x):
    return 1 if x == 0 else 2**(x - 1).bit_length()

# CKKS related parameters
max_depth=4
scale_factor=30
batch_size=next_power_of_2(max(len(products), len(clients)) // 2 + 2)
print("batch size:", batch_size)

print("Initializing ckks wrapper")
crypto=pycrypto.CKKSwrapper()
print("Initialized wrapper")

print("Generating keys")
start_time = timeit.default_timer()
crypto.KeyGen(max_depth, scale_factor, batch_size)
print("Keys generated in", timeit.default_timer() - start_time)

print("Encrypting")
start_time = timeit.default_timer()
m = list()

for r in range(len(matrix)):
    row = [ 0.0 for _ in range(len(products)) ]
    for c, v in matrix[r][1].items():
        row[c] =  v * 1.0
    m.append( [ crypto.Encrypt( row[:batch_size] ), crypto.Encrypt( row[batch_size:] ) ] )

print("Encrypted in", timeit.default_timer() - start_time)

random.seed(210826)

# use chosen from before
#chosen = random.choice(list(clients.keys()))
#chosen_row = clients[chosen][0]

# similarity to each client using raw inner product as we cannot take sqrt

print("Negating row")
start_time = timeit.default_timer()
minus_row = [ crypto.EvalMultConst(m[chosen_row][0], [ -1.0 for _ in range(batch_size) ]),
              crypto.EvalMultConst(m[chosen_row][1], [ -1.0 for _ in range(len(products) - batch_size) ]) ]
print("Minus row in", timeit.default_timer() - start_time)

print("Calculating client similarities")
start_time = timeit.default_timer()
client_simil = list()

for idx in range(len(clients)):
    to_sum = list()
    for batch in range(2):
        diff = crypto.EvalAdd(m[idx][batch], minus_row[batch])
        to_sum.append( crypto.EvalSum( crypto.EvalMult(diff, diff),
                                       batch_size if batch == 0 else next_power_of_2(len(products) - batch_size) ) )
    dot = crypto.EvalAdd(to_sum[0], to_sum[1])
    # EvalSum does not zero the other components in batch
    dot = crypto.EvalMultConst(dot, [1 if _ == 0 else 0 for _ in range(batch_size)])
    if(idx<10):
      sim = crypto.Decrypt(dot)
      print(sim[:5])
    client_simil.append( dot )
print("Client similarities in", timeit.default_timer() - start_time)

# now broadcast
print("Broadcasting similarities")
start_time = timeit.default_timer()
for idx in range(len(clients)):
    broadcast = client_simil[idx]
    blen = 1
    while blen < batch_size:
        broadcast = crypto.EvalAdd(broadcast, crypto.EvalAtIndex(broadcast, -1 * blen))
        blen *= 2
    client_simil[idx] = broadcast
print("Broadcast in", timeit.default_timer() - start_time)

sim0 = crypto.Decrypt(client_simil[0])
print(sim0[:10])

recos = [ crypto.Encrypt( [0. ]), crypto.Encrypt( [0. ]) ]

print("Computing recommendations")
start_time = timeit.default_timer()
for idx in range(len(clients)):
    #print("client:", idx)
    weight = client_simil[idx]
    weighted = m[idx]
    #print("weight: {:,}".format(crypto.Decrypt(weight)[0]))
    #print("base pref: {:,} {:,}".format(crypto.Decrypt(weighted[0])[0], crypto.Decrypt(weighted[1])[0]))
    for batch in range(2):
        mult = crypto.EvalMult(weighted[batch], weight)
        #if batch == 0:
        #  print("mult:", crypto.Decrypt(mult)[0])
        recos[batch] = crypto.EvalAdd(recos[batch], mult)
    #print("reco: {:,} {:,}".format(crypto.Decrypt(recos[0])[0], crypto.Decrypt(recos[1])[0]) )
    #if idx % 100 == 0:
    #  r = crypto.Decrypt(recos[0]) + crypto.Decrypt(recos[1]) 
    #  print("{} {:,} {:,} {:,}".format(idx, r[0], r[idx], r[8200]))
print("Recos in", timeit.default_timer() - start_time)

# got the recommendations, now decrypt
print("Decrypting")
start_time = timeit.default_timer()
recos_decrypted = crypto.Decrypt(recos[0])[:batch_size] + crypto.Decrypt(recos[1])[:(len(products)-batch_size)]
print("Decrypted in", timeit.default_timer() - start_time)
        
recos_indexed = sorted(list(map(lambda p: [ p[1], p[0] ], enumerate(recos_decrypted))))

print("Recommendations for", chosen, clients[chosen][1])

products_for_client = list(matrix[clients[chosen][0]][1].items())
random.shuffle(products_for_client)
print("Sample from", len(products_for_client),"products purchased")
for pair in products_for_client[:20]:
    print("\t", products_by_col[pair[0]][1], 'purchased', pair[1], 'times')

print("Recommended:")
printed = 0
for score, col in reversed(recos_indexed):
    if col in matrix[chosen_row][1]:
        continue
    print("\t({}) {} score={:,}".format(col, products_by_col[col][1], score))        
    printed += 1
    if printed > 10:
        break

batch size: 4096
Initializing ckks wrapper
Initialized wrapper
Generating keys
Keys generated in 1.98000945900003
Encrypting
Encrypted in 26.604011954000043
Negating row
Minus row in 0.023708063999947626
Calculating client similarities
[218.00002845586565, -1.1095056383686995e-06, -6.521090353022298e-06, 1.646711536181897e-05, 2.218659103911474e-05]
[259.0015988085203, 1.8458553864497846e-05, -3.0381243461272766e-06, 2.1504791932246735e-05, 3.662876710731617e-05]
[52.99927212523427, 4.91285038563258e-06, -8.448083291480004e-07, 8.069994773233134e-06, 1.4445040774299998e-05]
[201.9995410181425, 1.224326758569803e-05, 4.489184716035984e-06, 5.034165573473328e-06, 1.6404809786238127e-05]
[257.00053321043475, 5.005021044064684e-05, 1.856178797484285e-05, -7.50515137159526e-06, 1.2247655477585526e-05]
[105.9983025141432, 7.297278355461417e-06, -6.173807963080904e-07, 5.101778513376939e-06, 5.844522523103289e-06]
[42.00141579194573, 1.9995676825060753e-06, -1.7600219732409166e-06, 4.49279530