Copyright (C) 2021 Textualization Software Ltd. Distributed under the terms of the [Apache Software License 2.0](http://www.apache.org/licenses/LICENSE-2.0).
---

<a href="https://colab.research.google.com/github/Textualization/riiaa21_ws11_ml_over_encrypted_data/blob/main/notebooks/4_Palisade_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



In [None]:
# Start from a clean checkout of the workshop repository.
!rm -rf riiaa21_ws11_ml_over_encrypted_data
!git clone https://github.com/Textualization/riiaa21_ws11_ml_over_encrypted_data
# Unpack the PALISADE install tree shipped with the repository under /content.
!cp /content/riiaa21_ws11_ml_over_encrypted_data/dependencies/palisade-install.tar.gz /content; cd /content; tar -xzf palisade-install.tar.gz



In [None]:
# Extract the synthetic invoices inside the checkout; the absolute path matches
# the one the other cells use, and `&&` keeps tar from running if the cd fails.
!cd /content/riiaa21_ws11_ml_over_encrypted_data/data && tar -xzf synthetic-invoices.tar.gz



In [None]:
# Build the pycrypto Python extension (CKKS wrapper around PALISADE).
!mkdir -p /content/riiaa21_ws11_ml_over_encrypted_data/build/lib
# Compile the two translation units against the PALISADE headers.
!cd /content/riiaa21_ws11_ml_over_encrypted_data; c++ -fopenmp -fPIC -std=gnu++11 `python3.7-config --cflags` `python3.7-config --includes` -I/content/palisade-install/include/palisade{,/core,/pke,/third-party/include} -o build/lib/ckks_wrapper.o -c src/ckks_wrapper.cpp 
!cd /content/riiaa21_ws11_ml_over_encrypted_data; c++ -fopenmp -fPIC -std=gnu++11 `python3.7-config --cflags` `python3.7-config --includes` -I/content/palisade-install/include/palisade{,/core,/pke,/third-party/include} -o build/lib/pycrypto.o -c src/pycrypto.cpp 
# Link the shared object against PALISADE, libpython and Boost.Python.
!cd /content/riiaa21_ws11_ml_over_encrypted_data; /usr/bin/c++ -fPIC  -Wall -Werror -O3  `python3.7-config --ldflags` -DPALISADE_VERSION=1.11.4 -Wno-parentheses -fopenmp -shared -Wl,-soname,pycrypto.so.1 -lpython3.7m -o build/lib/pycrypto.so.1 build/lib/ckks_wrapper.o build/lib/pycrypto.o   -L/content/palisade-install/lib  -Wl,-rpath,/content/palisade-install/lib:  /content/palisade-install/lib/libPALISADEcore.so.1 /content/palisade-install/lib/libPALISADEbinfhe.so.1 /content/palisade-install/lib/libPALISADEpke.so.1  /usr/lib/x86_64-linux-gnu/libpython3.7m.so /usr/lib/x86_64-linux-gnu/libboost_python3-py36.so 
# Expose the library under the name `import pycrypto` looks for; -f keeps the cell re-runnable.
!cd /content/riiaa21_ws11_ml_over_encrypted_data/build/lib; ln -sf pycrypto.so.1 pycrypto.so
# Extract the synthetic invoices used by the data-loading cell.
!cd /content/riiaa21_ws11_ml_over_encrypted_data/data; tar -xzf synthetic-invoices.tar.gz



In [None]:
import sys
# Add the directory holding the pycrypto.so built by the compile cell to the
# module search path so the import below can find it.
sys.path.append('/content/riiaa21_ws11_ml_over_encrypted_data/build/lib')

import pycrypto

In [None]:
import os
import os.path
import random
import math

import xml.etree.ElementTree as ET

import numpy as np
import timeit

In [None]:
DATA_DIR='/content/riiaa21_ws11_ml_over_encrypted_data/data/invoices'

# Sparse client x product purchase counts, filled from the invoice XML files.
matrix           = list() # rows = [ rfc, pref dict col->count ]
clients          = dict() # rfc -> [ row in matrix, name ]
clients_by_row   = list() # row -> [ rfc, name ]
products         = dict() # id -> [ col in matrix, name ]
products_by_col  = list() # col -> [ id, name ]

# os.listdir() returns entries in arbitrary order; sorting keeps the row/column
# numbering (and therefore any client picked by position or by seeded RNG)
# identical from one run or machine to the next.
for invoice in sorted(os.listdir(DATA_DIR)):
    # only real invoice files: a substring test would also accept e.g. 'x.xml.bak'
    if not invoice.endswith('.xml'):
        continue
    tree = ET.parse(os.path.join(DATA_DIR, invoice))
    root = tree.getroot()
    rfc = root[1].attrib['Rfc'] # Receptor
    if rfc not in clients:
        # new row
        clients[rfc] = [ len(matrix), root[1].attrib['Nombre'] ]
        matrix.append( [ rfc, dict() ] )
        clients_by_row.append( [ rfc, root[1].attrib['Nombre'] ] )
    row = clients[rfc][0]

    # every child of root[2] is one invoiced item
    for concept in root[2]:
        _id = concept.attrib['NoIdentificacion']
        name = concept.attrib['Descripcion']

        if _id not in products:
            # new col
            products[_id] = [ len(products), name ]
            products_by_col.append( [ _id, name ] )
        col = products[_id][0]

        # one more purchase of this product (column) by this client (row)
        matrix[row][1][col] = matrix[row][1].get(col, 0) + 1

print("Loaded ", len(clients), " clients and ", len(products), " products")

Loaded  609  clients and  8452  products


In [None]:
# Plaintext baseline: densify the sparse counts into a clients x products array.
m = np.zeros((len(clients), len(products)))

for r, (_rfc, counts) in enumerate(matrix):
    for c, v in counts.items():
        m[r, c] = v

random.seed(210826)

#chosen = random.choice(list(clients.keys()))
chosen = list(clients.keys())[67]
chosen_row = clients[chosen][0]

print("Recommendations for", chosen, clients[chosen][1])

# Show a random sample of what the chosen client already bought.
products_for_client = list(matrix[clients[chosen][0]][1].items())
random.shuffle(products_for_client)
print("Sample from", len(products_for_client),"products purchased")
for bought_col, times in products_for_client[:20]:
    print("\t", products_by_col[bought_col][1], 'purchased', times, 'times')

# similarity to each client: square root of the inner product with the chosen
# client's row (the commented-out line is the raw inner product variant)

client_simil = np.ndarray((len(clients),1))

for idx in range(len(clients)):
    inner = np.dot(m[idx], m[chosen_row])
    client_simil[idx] = math.sqrt(inner)
    #client_simil[idx] = np.dot(m[idx], m[chosen_row])

# Score every product as the similarity-weighted average of all client rows.
recos = np.zeros((len(products),))

for idx in range(len(clients)):
    recos = recos + m[idx] * client_simil[idx] / len(clients)

# [score, col] pairs in ascending score order
recos_indexed = sorted([ score, col ] for col, score in enumerate(recos))

print("Recommended:")
printed = 0
for score, col in recos_indexed[::-1]:
    # skip what the chosen client has already purchased
    if m[chosen_row][col] > 0:
        continue
    print("\t", products_by_col[col][1], "score", score)
    printed += 1
    if printed > 10:
        break

Recommendations for CON0610049C2 CONAFIP SC
Sample from 774 products purchased
	 Anniversary Party, The (2001) purchased 3 times
	 American Movie (1999) purchased 1 times
	 Sabrina (1954) purchased 3 times
	 Casablanca (1942) purchased 4 times
	 Minus Man, The (1999) purchased 2 times
	 Eyes Wide Shut (1999) purchased 1 times
	 Three Kings (1999) purchased 1 times
	 Withnail & I (1987) purchased 3 times
	 Star Maps (1997) purchased 2 times
	 Body Heat (1981) purchased 3 times
	 Four Days in September (O Que Ã Isso, Companheiro?) (1997) purchased 5 times
	 Mrs. Parker and the Vicious Circle (1994) purchased 3 times
	 Butcher Boy, The (1997) purchased 4 times
	 Romeo Must Die (2000) purchased 1 times
	 Straight Story, The (1999) purchased 2 times
	 Daughters of the Dust (1991) purchased 3 times
	 Flawless (1999) purchased 3 times
	 Emperor and the Assassin, The (Jing ke ci qin wang) (1999) purchased 1 times
	 Fistful of Dollars, A (Per un pugno di dollari) (1964) purchased 3 times
	 Red

In [None]:
def next_power_of_2(x):
    """Return the smallest power of two that is >= x (1 when x is 0).

    x is expected to be a non-negative int; the result comes from the bit
    length of x - 1, so exact powers of two map to themselves.
    """
    if x == 0:
        return 1
    return 1 << (x - 1).bit_length()

# CKKS related parameters
max_depth=3
scale_factor=50
# Every client row is stored as two ciphertexts (see the Encrypt calls below),
# so a batch is sized from half of the larger matrix dimension, rounded up to
# a power of two.
widest = max(len(products), len(clients))
batch_size=next_power_of_2(widest // 2 + 2)
print("batch size:", batch_size)

print("Initializing ckks wrapper")
crypto=pycrypto.CKKSwrapper()
print("Initialized wrapper")

print("Generating keys")
start_time = timeit.default_timer()
crypto.KeyGen(max_depth, scale_factor, batch_size)
print("Keys generated in", timeit.default_timer() - start_time)

print("Encrypting")
start_time = timeit.default_timer()
# m[r] = [ ciphertext of the first batch_size columns, ciphertext of the rest ]
m = list()

for _rfc, counts in matrix:
    row = [ 0.0 ] * len(products)
    for c, v in counts.items():
        row[c] =  v * 1.0
    head = crypto.Encrypt( row[:batch_size] )
    tail = crypto.Encrypt( row[batch_size:] )
    m.append( [ head, tail ] )

print("Encrypted in", timeit.default_timer() - start_time)

random.seed(210826)

# NOTE(review): the plaintext cell picks list(clients.keys())[67] instead of a
# seeded random.choice, so the two cells may be reporting on different clients.
chosen = random.choice(list(clients.keys()))
chosen_row = clients[chosen][0]

# The negated row is added to every client row further down and the result is
# squared and summed, i.e. a squared Euclidean distance to the chosen client
# rather than an inner product.
# NOTE(review): confirm that weighting by a distance is what is intended.

print("Negating row")
start_time = timeit.default_timer()
minus_one = [ -1.0 for _ in range(batch_size) ]
# negate BOTH halves of the chosen row (batch 0 and batch 1)
minus_row = [ crypto.EvalMultConst(m[chosen_row][batch], minus_one) for batch in range(2) ]
print("Minus row in", timeit.default_timer() - start_time)


print("Calculating client similarities")
start_time = timeit.default_timer()
client_simil = list()

for idx in range(len(clients)):
    to_sum = list()
    for batch in range(2):
        diff = crypto.EvalAdd(m[idx][batch], minus_row[batch])
        to_sum.append( crypto.EvalSum( crypto.EvalMult(diff, diff),
                                       batch_size if batch == 0 else next_power_of_2(len(products) - batch_size) ) )
    dot = crypto.EvalAdd(to_sum[0], to_sum[1])
    client_simil.append( dot )
print("Client similarities in", timeit.default_timer() - start_time)

# now broadcast
print("Broadcasting similarities")
start_time = timeit.default_timer()
for idx in range(len(clients)):
    broadcast = client_simil[idx]
    blen = 1
    while blen < batch_size:
        broadcast = crypto.EvalAdd(broadcast, crypto.EvalAtIndex(broadcast, -1 * blen))
        blen *= 2
    client_simil[idx] = broadcast
print("Broadcast in", timeit.default_timer() - start_time)

# Encrypted accumulators, one per half of the product columns.
recos = [ crypto.Encrypt( [0. ]), crypto.Encrypt( [0. ]) ]

print("Computing recommendations")
start_time = timeit.default_timer()
for idx in range(len(clients)):
    weight = client_simil[idx]
    for batch in range(2):
        # weight this client's purchases and add them to the running total
        mult = crypto.EvalMult(m[idx][batch], weight)
        recos[batch] = crypto.EvalAdd(recos[batch], mult)
print("Recos in", timeit.default_timer() - start_time)

# got the recommendations, now decrypt

print("Decrypting")
start_time = timeit.default_timer()
# decrypt inside the timed section so the figure printed below covers the
# decryption itself and not just the sort
recos_decrypted = crypto.Decrypt(recos[0])[:batch_size] + crypto.Decrypt(recos[1])[:(len(products)-batch_size)]
# [score, col] pairs in ascending score order
recos_indexed = sorted(list(map(lambda p: [ p[1], p[0] ], enumerate(recos_decrypted))))
print("Decrypted in", timeit.default_timer() - start_time)

print("Recommendations for", chosen, clients[chosen][1])

# Show a random sample of what the chosen client already bought.
products_for_client = list(matrix[clients[chosen][0]][1].items())
random.shuffle(products_for_client)
print("Sample from", len(products_for_client),"products purchased")
for pair in products_for_client[:20]:
    print("\t", products_by_col[pair[0]][1], 'purchased', pair[1], 'times')

print("Recommended:")
printed = 0
# walk the scores from highest to lowest
for score, col in reversed(recos_indexed):
    # the per-client counts are keyed by column index, not by product id, so
    # look the column up directly to skip products already purchased
    if matrix[chosen_row][1].get(col, 0) > 0:
        continue
    print("\t", products_by_col[col][1], "score", score)
    printed += 1
    if printed > 10:
        break

batch size: 8192
Initializing ckks wrapper
Initialized wrapper
Generating keys
Keys generated in 0.807337939000007
Encrypting
Encrypted in 34.515163927
Negating row
Minus row in 0.01962456100000054
Calculating client similarities


RuntimeError: ignored