# Designing a Pipeline

To get set up, install the recommended packages from the `requirements.txt` file.

In [None]:
#!pip install -r requirements.txt

Refer to the book for the chapters that cover ingesting the data into the Neo4J instance. This notebook focuses on the Python elements of the process.

## Simulating customer interactions

In [2]:
from faker import Faker

fake = Faker()
Faker.seed(0)

In [3]:
name = fake.name()
address = fake.address()

print(name)
print(address)

Norma Fisher
4759 William Haven Apt. 194
West Corey, CA 90152


Here we will create a custom class that generates some fake customer information for a given customerID.

In [4]:
from datetime import datetime

class Customer:
    """A simulated customer whose personal details are generated with Faker.

    Attributes set at construction time:
        customerID: identifier supplied by the caller.
        name: value returned by ``fake.name()``.
        address: value returned by ``fake.address()``.
        datetime: creation time formatted as ``%d/%m/%Y-%H:%M:%S``.
    """

    def __init__(self, customerID):
        self.customerID = customerID
        # `fake` is the Faker instance created in an earlier cell; name is drawn
        # before address so the seeded sequence stays in the same order.
        self.name = fake.name()
        self.address = fake.address()
        self.datetime = datetime.now().strftime("%d/%m/%Y-%H:%M:%S")

    def __str__(self):
        """Return the multi-line summary shown when a customer is printed."""
        return (
            f'The customer ID is: {self.customerID}\n'
            f'Customer Name is: {self.name}\n'
            f'Customer address: {self.address}\n'
            f'Created on: {self.datetime}'
        )
   

In [5]:
new_customer = Customer(customerID=9999)
print(new_customer)

The customer ID is: 9999
Custoner Name is: Justin Gomez
Customer address: 778 Brown Plaza
North Jenniferfurt, VT 88077
Created on: 19/10/2023-16:30:11


### Create connection to Neo4J instance

In [6]:
from neo4j import GraphDatabase

class Neo4jConnect:
    """Thin wrapper around a Neo4j driver for running Cypher queries.

    Each call to :meth:`query` opens a new session and returns the result of
    ``session.run``. The session is left open at that point because callers
    read the result afterwards (for example with ``.data()``); every session
    opened this way is remembered and released by :meth:`close`.

    The class can also be used as a context manager, in which case
    :meth:`close` is called on exit.
    """

    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        # Sessions opened by query(); tracked so close() can release them.
        self._sessions = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow the caller's exception

    def close(self):
        """Close every session opened by :meth:`query`, then the driver."""
        try:
            while self._sessions:
                self._sessions.pop().close()
        finally:
            # The driver is closed even if closing a session fails.
            self.driver.close()

    def query(self, query, parameters=None):
        """Run a Cypher statement and return the driver's result object.

        Args:
            query: Cypher text to execute.
            parameters: optional mapping of Cypher parameters (``$name``
                placeholders), forwarded to ``session.run``. Defaults to
                ``None`` so existing single-argument calls are unchanged.

        Returns:
            Whatever ``session.run`` returns.
        """
        session = self.driver.session()
        # Track before running so the session is released even if run() raises.
        self._sessions.append(session)
        return session.run(query, parameters)


For convenience - this has also been saved into the supporting package we have created. 

In [7]:
from graphtastic.database.neo4j import Neo4jConnect

Next we will add customers to our Neo4J instance using the faked data:

In [8]:
def add_customer(c, connection):
    """MERGE a ``Customer`` node for ``c`` into the graph.

    Args:
        c: object exposing ``customerID``, ``name`` and ``address``.
        connection: object exposing ``query(cypher_text)``.

    The name and address are written as escaped Cypher string literals, so
    values containing ``"``, ``\\`` or line breaks cannot break out of the
    literal or change the statement.
    """
    import json  # local import keeps this definition self-contained

    def _quote(value):
        # JSON string escaping (\" \\ \n \t \uXXXX) is used to produce a
        # double-quoted Cypher literal; ensure_ascii=False leaves non-ASCII
        # characters as they are.
        return json.dumps(str(value), ensure_ascii=False)

    query = (
        f'MERGE (:Customer {{customerID: toInteger({c.customerID}), '
        f'name: {_quote(c.name)}, '
        f'address: {_quote(c.address)}}})'
    )
    connection.query(query)


Test this function out with our faked data. 

In [9]:
test_customer = Customer(9999)
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
add_customer(test_customer, connection)
connection.close()

Run this code in Neo4J Browser:

In [10]:
%%writefile cypher/return_customers.cql
MATCH (c:Customer) RETURN c

Overwriting cypher/return_customers.cql


In [11]:
def add_purchase(c, productID, time, connection):
    """Record that customer ``c`` purchased ``productID`` at ``time``.

    Matches the existing Customer and Product nodes by their integer IDs and
    MERGEs a ``PURCHASED`` relationship carrying ``time`` as its ``datetime``
    property. The statement is sent through ``connection.query``.
    """
    clauses = (
        f'MATCH (c:Customer {{customerID: toInteger({c.customerID})}})',
        f'MATCH (p:Product {{productID: toInteger({productID})}})',
        f'MERGE (c)-[:PURCHASED {{datetime:"{time}"}}]->(p)',
    )
    connection.query(' '.join(clauses))


In [12]:
from datetime import datetime

connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
add_purchase(test_customer, 1010, datetime.now(), connection)
connection.close()


In [13]:
%%writefile cypher/edge_bt_cust_and_purch_nodes.cql
MATCH (c:Customer)-[:PURCHASED]->(p:Product) RETURN c, p

Overwriting cypher/edge_bt_cust_and_purch_nodes.cql


Add another purchase to the graph database:

In [14]:
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
add_purchase(test_customer, 1010, datetime.now(), connection)
connection.close()

This code will save the script we will run in Neo4J Browser:

In [15]:
%%writefile cypher/get_cust_and_prod.cql
MATCH (c:Customer)-[purchase:PURCHASED]->(p:Product) 
RETURN c.name, p.name, purchase.datetime

Overwriting cypher/get_cust_and_prod.cql


We will purge our database data with the Cypher below, which will be executed in the Neo4J Browser:

In [17]:
%%writefile cypher/purge_data.cql
MATCH (c:Customer) DETACH DELETE c

Overwriting cypher/purge_data.cql


In [18]:
def get_product_ids(connection):
    """Return the ``productID`` of every ``Product`` node as a list.

    ``connection.query`` must return an object whose ``.data()`` yields rows
    that can be indexed by ``'productID'``.
    """
    rows = connection.query('MATCH (p:Product) RETURN p.productID as productID').data()
    return [row['productID'] for row in rows]

In [19]:
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
product_ids = get_product_ids(connection)
connection.close()
print(product_ids)
print(len(product_ids))

[1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027]
27


Generate lots of customers:

In [20]:
customers = [Customer(customerID) for customerID in range(10000, 10100)]

In [21]:
# Insert every generated customer, using a fresh connection per customer.
for customer in customers:
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        add_customer(customer, connection)
    finally:
        # Release the connection even if the insert fails part-way through.
        connection.close()

In [22]:
import random

# Give every customer one purchase of a randomly chosen product.
for customer in customers:
    # Print the customer followed by a separator line so records are easy to tell apart
    print(customer, '\n----------------------------------------------------')
    product = random.choice(product_ids)
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        add_purchase(customer, product, datetime.now(), connection)
    finally:
        # Release the connection even if the purchase cannot be written.
        connection.close()

The customer ID is: 10000
Custoner Name is: Gary Nielsen
Customer address: 1965 Kelly Field Apt. 094
Jonesberg, FM 20102
Created on: 19/10/2023-16:34:13 
----------------------------------------------------
The customer ID is: 10001
Custoner Name is: Sarah Villanueva
Customer address: PSC 8684, Box 8339
APO AA 76482
Created on: 19/10/2023-16:34:13 
----------------------------------------------------
The customer ID is: 10002
Custoner Name is: Johnathan Davidson
Customer address: 59179 Bruce Gardens Apt. 413
Lauramouth, NE 08652
Created on: 19/10/2023-16:34:13 
----------------------------------------------------
The customer ID is: 10003
Custoner Name is: Charles Wolfe
Customer address: 13991 Davis Village
North Catherineborough, HI 94625
Created on: 19/10/2023-16:34:13 
----------------------------------------------------
The customer ID is: 10004
Custoner Name is: Daniel Owen
Customer address: USS Stephens
FPO AP 13804
Created on: 19/10/2023-16:34:13 
-------------------------------

In [23]:
from time import sleep
sleep(1)

# Making Recommendations

In [None]:
def rec_by_brand(c, connection):
    """Return product IDs that share a brand with something ``c`` has bought.

    Products the customer has already purchased are excluded by the
    ``WHERE NOT (c)-[:PURCHASED]->(r)`` predicate.

    Args:
        c: object exposing ``customerID``.
        connection: object whose ``query(text)`` returns a result with a
            ``.data()`` method yielding rows indexable by ``'productID'``.

    Returns:
        list of the distinct ``productID`` values returned by the query.
    """
    # Every clause ends with a space so the concatenated statement never
    # produces run-together tokens such as ")MATCH".
    query = (
        f'MATCH (c:Customer {{customerID: toInteger({c.customerID})}})'
        '-[:PURCHASED]->(p:Product) '
        'MATCH (p)-[:HAS_BRAND]->(b:Brand) '
        'MATCH (b)<-[:HAS_BRAND]-(r:Product) '
        'WHERE NOT (c)-[:PURCHASED]->(r) '
        'RETURN DISTINCT r.productID as productID'
    )
    rows = connection.query(query).data()
    return [row['productID'] for row in rows]

In [None]:
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
print(connection)

In [None]:
brand_recommendations = rec_by_brand(customers[10], connection)
connection.close()
print(brand_recommendations)

Use the Cypher in Neo4J Browser to get the results:

In [None]:
%%writefile cypher/cust_brand_prod.cql
MATCH (c:Customer {customerID:10010})-[:PURCHASED]->(p:Product)
MATCH (p)-[:HAS_BRAND]->(b:Brand)
MATCH (b)<-[:HAS_BRAND]-(r:Product)
WHERE NOT (c)-[:PURCHASED]->(r)
RETURN c, p, b, r

In [None]:
# For each customer, buy one randomly chosen same-brand recommendation (if any).
for customer in customers:
    # One connection serves both the lookup and the write for this customer.
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        brand_recommendations = rec_by_brand(customer, connection)
        if brand_recommendations:
            product = random.choice(brand_recommendations)
            add_purchase(customer, product, datetime.now(), connection)
    finally:
        # Release the connection even if one of the queries fails.
        connection.close()


Execute cypher to see if this has worked correctly:

In [None]:
%%writefile cypher/distinct_product_match.cql
MATCH (c:Customer {customerID:10010})-[:PURCHASED]->(p:Product)
MATCH (p)-[:HAS_TYPE]->(st:Type)
MATCH (p)-[:HAS_BRAND]->(b:Brand)
MATCH (b)<-[:HAS_BRAND]-(r:Product)
WHERE NOT (c)-[:PURCHASED]->(r) AND (r)-[:HAS_TYPE]->(st)
RETURN DISTINCT r


## Drawing on other customers' purchases

In [None]:
def rec_by_copurchase(c, connection):
    """Return product IDs bought by customers who share a purchase with ``c``.

    For every product ``p`` bought by ``c``, other customers who bought ``p``
    are found and their other purchases ``r`` are collected. Products that
    ``c`` has already purchased are excluded, so a recommendation is never
    something the customer already owns.

    Args:
        c: object exposing ``customerID``.
        connection: object whose ``query(text)`` returns a result with a
            ``.data()`` method yielding rows indexable by ``'productID'``.

    Returns:
        list of the distinct ``productID`` values returned by the query.
    """
    # `p <> r` alone only compares r with the single product on the current
    # row; the extra NOT (c)-[:PURCHASED]->(r) predicate removes anything the
    # customer has bought at all. Every clause ends with a space so clauses
    # never run together in the concatenated statement.
    query = (
        f'MATCH (c:Customer {{customerID: toInteger({c.customerID})}})'
        '-[:PURCHASED]->(p:Product) '
        'MATCH (p)<-[:PURCHASED]-(c2:Customer) '
        'WHERE c2 <> c '
        'MATCH (c2)-[:PURCHASED]->(r:Product) '
        'WHERE p <> r AND NOT (c)-[:PURCHASED]->(r) '
        'RETURN DISTINCT r.productID as productID'
    )
    rows = connection.query(query).data()
    return [row['productID'] for row in rows]

In [None]:
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
copurchase_recommendations = rec_by_copurchase(customers[10], connection)
connection.close()
print(copurchase_recommendations)

In [None]:
# For each customer, buy one randomly chosen co-purchase recommendation (if any).
for customer in customers:
    # One connection serves both the lookup and the write for this customer.
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        copurchase_recommendations = rec_by_copurchase(customer, connection)
        if copurchase_recommendations:
            product = random.choice(copurchase_recommendations)
            add_purchase(customer, product, datetime.now(), connection)
    finally:
        # Release the connection even if one of the queries fails.
        connection.close()


## Using similarity scores to recommend products

In [None]:
def get_customer_purchases(c_id, connection):
    """Return the distinct product IDs purchased by customer ``c_id``.

    Args:
        c_id: customer identifier, interpolated into ``toInteger(...)``.
        connection: object whose ``query(text)`` returns a result with a
            ``.data()`` method yielding rows indexable by ``'productID'``.

    Returns:
        list of ``productID`` values; empty when the query returns no rows.
    """
    # The pattern ends with a space so RETURN is not glued to the closing
    # parenthesis of the node pattern.
    query = (
        f'MATCH (c:Customer {{customerID: toInteger({c_id})}})'
        '-[:PURCHASED]->(p:Product) '
        'RETURN DISTINCT p.productID as productID'
    )
    rows = connection.query(query).data()
    return [row['productID'] for row in rows]

In [None]:
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
purchases = get_customer_purchases(customers[10].customerID, connection)
connection.close()
print(purchases)

This code is to be run in `Neo4J Browser`:

In [None]:
%%writefile cypher/cust_purchases.cql
// Distinct products purchased by customer 10010 (directed Customer -> Product).
MATCH (c:Customer {customerID:10010})-[:PURCHASED]->(p:Product)
RETURN DISTINCT p.productID as productID

In [None]:
# Map every customer ID to the list of product IDs that customer has purchased.
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
try:
    all_purchases = {customer.customerID: get_customer_purchases(customer.customerID, connection)
                     for customer in customers}
finally:
    # This connection was previously left open; close it once the lookups finish.
    connection.close()

In [None]:
print(all_purchases)

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections.

    Both inputs are converted to sets, so duplicates and ordering are ignored.
    The score is ``len(intersection) / len(union)``.

    Args:
        list1: iterable of hashable items.
        list2: iterable of hashable items.

    Returns:
        A value between 0 and 1. When both inputs are empty the union is
        empty too; ``0.0`` is returned in that case instead of dividing by
        zero, so two customers with no purchases are not treated as similar.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

In [None]:
list1 = [1, 2, 3, 4]
list2 = [5, 6, 7, 8]
list3 = [1, 2]
assert jaccard_similarity(list1, list1) == 1
assert jaccard_similarity(list1, list2) == 0
assert jaccard_similarity(list1, list3) == 0.5

In [None]:
import itertools
customer_ids = list(all_purchases.keys())
print(customer_ids)
customer_pairs = list(itertools.combinations(customer_ids, 2))
print(customer_pairs[:10])

In [None]:
similarity = {pair: jaccard_similarity(all_purchases[pair[0]], all_purchases[pair[1]]) for pair in customer_pairs}
print(similarity)

In [None]:
from collections import Counter
grouped_similiarities = Counter(similarity.values())
print(grouped_similiarities)

In [None]:
def add_purchase_id(c_id, productID, time, connection):
    """Record a purchase of ``productID`` by the customer with ID ``c_id``.

    Takes the customer ID directly rather than a customer object. The Customer
    and Product nodes are matched by integer ID and a ``PURCHASED``
    relationship with ``time`` as its ``datetime`` property is MERGEd between
    them via ``connection.query``.
    """
    match_customer = f'MATCH (c:Customer {{customerID: toInteger({c_id})}}) '
    match_product = f'MATCH (p:Product {{productID: toInteger({productID})}}) '
    merge_purchase = f'MERGE (c)-[:PURCHASED {{datetime:"{time}"}}]->(p)'
    connection.query(match_customer + match_product + merge_purchase)

In [None]:
threshold = 0.7

In [None]:
def rec_by_similarity(c1, c2, threshold, connection):
    """Cross-recommend purchases between two sufficiently similar customers.

    The purchase lists of customer IDs ``c1`` and ``c2`` are fetched and
    scored with ``jaccard_similarity``. When the score is at least
    ``threshold`` but not exactly 1, each customer is given (as a new
    purchase) every product the other one has that they lack.
    """
    purchases_1 = get_customer_purchases(c1, connection)
    purchases_2 = get_customer_purchases(c2, connection)
    score = jaccard_similarity(purchases_1, purchases_2)

    # Nothing to do when the pair is too dissimilar, or identical (score of 1
    # means neither customer has anything the other lacks).
    if not (score >= threshold) or score == 1:
        return

    # Products only the second customer has are added for the first ...
    for product in purchases_2:
        if product not in purchases_1:
            add_purchase_id(c1, product, datetime.now(), connection)
    # ... and products only the first customer has are added for the second.
    for product in purchases_1:
        if product not in purchases_2:
            add_purchase_id(c2, product, datetime.now(), connection)

In [None]:
# Apply similarity-based recommendations to every pair of customers.
for first_id, second_id in customer_pairs:
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        # Use the `threshold` defined above instead of repeating the literal.
        rec_by_similarity(first_id, second_id, threshold, connection)
    finally:
        # Release the connection even if a query fails for this pair.
        connection.close()

## Complete Python script

In [None]:
%%writefile pipeline_dev.py
from faker import Faker
from datetime import datetime
from graphtastic.database.neo4j import Neo4jConnect
import random

# Fake some data
fake = Faker()
Faker.seed(0)
# Get faker attributes
name = fake.name()
address = fake.address()
print(name)
print(address)

# Create a class we will use to generate new customers
class Customer:
    """A simulated customer whose personal details are generated with Faker.

    Attributes set at construction time:
        customerID: identifier supplied by the caller.
        name: value returned by ``fake.name()``.
        address: value returned by ``fake.address()``.
        datetime: creation time formatted as ``%d/%m/%Y-%H:%M:%S``.
    """

    def __init__(self, customerID):
        self.customerID = customerID
        # `fake` is the Faker instance created at the top of this script; name
        # is drawn before address so the seeded sequence stays in the same order.
        self.name = fake.name()
        self.address = fake.address()
        self.datetime = datetime.now().strftime("%d/%m/%Y-%H:%M:%S")

    def __str__(self):
        """Return the multi-line summary shown when a customer is printed."""
        return (
            f'The customer ID is: {self.customerID}\n'
            f'Customer Name is: {self.name}\n'
            f'Customer address: {self.address}\n'
            f'Created on: {self.datetime}'
        )
   

# Instantiate customer
new_customer = Customer(customerID=9999)
print(new_customer)

# Create add customer function
def add_customer(c, connection):
    """MERGE a ``Customer`` node for ``c`` into the graph.

    Args:
        c: object exposing ``customerID``, ``name`` and ``address``.
        connection: object exposing ``query(cypher_text)``.

    The name and address are written as escaped Cypher string literals, so
    values containing ``"``, ``\\`` or line breaks cannot break out of the
    literal or change the statement.
    """
    import json  # local import keeps this definition self-contained

    def _quote(value):
        # JSON string escaping (\" \\ \n \t \uXXXX) is used to produce a
        # double-quoted Cypher literal; ensure_ascii=False leaves non-ASCII
        # characters as they are.
        return json.dumps(str(value), ensure_ascii=False)

    query = (
        f'MERGE (:Customer {{customerID: toInteger({c.customerID}), '
        f'name: {_quote(c.name)}, '
        f'address: {_quote(c.address)}}})'
    )
    connection.query(query)

# Create customer in Neo4J
test_customer = Customer(9999)
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
add_customer(test_customer, connection)
connection.close()

# Create function to add a purchase to Neo4J
def add_purchase(c, productID, time, connection):
    """Record that customer ``c`` purchased ``productID`` at ``time``.

    Matches the existing Customer and Product nodes by their integer IDs and
    MERGEs a ``PURCHASED`` relationship carrying ``time`` as its ``datetime``
    property. The statement is sent through ``connection.query``.
    """
    clauses = (
        f'MATCH (c:Customer {{customerID: toInteger({c.customerID})}})',
        f'MATCH (p:Product {{productID: toInteger({productID})}})',
        f'MERGE (c)-[:PURCHASED {{datetime:"{time}"}}]->(p)',
    )
    connection.query(' '.join(clauses))

# Use add purchase to add purchases for our test customer
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
add_purchase(test_customer, 1010, datetime.now(), connection)
connection.close()

# Add another purchase
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
add_purchase(test_customer, 1010, datetime.now(), connection)
connection.close()

# Create function to get product ids
def get_product_ids(connection):
    """Return the ``productID`` of every ``Product`` node as a list.

    ``connection.query`` must return an object whose ``.data()`` yields rows
    that can be indexed by ``'productID'``.
    """
    rows = connection.query('MATCH (p:Product) RETURN p.productID as productID').data()
    return [row['productID'] for row in rows]

# Get product IDS from Neo4J
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
product_ids = get_product_ids(connection)
connection.close()
print(product_ids)
print(len(product_ids))

# Generate lots of customers
customers = [Customer(customerID) for customerID in range(10000, 10100)]
# Insert every generated customer, using a fresh connection per customer.
for customer in customers:
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        add_customer(customer, connection)
    finally:
        # Release the connection even if the insert fails part-way through.
        connection.close()

# Create purchases at random for our customers
for customer in customers:
    # Print the customer followed by a separator line so records are easy to tell apart
    print(customer, '\n----------------------------------------------------')
    product = random.choice(product_ids)
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        add_purchase(customer, product, datetime.now(), connection)
    finally:
        # Release the connection even if the purchase cannot be written.
        connection.close()

# MAKING RECOMMENDATIONS
def rec_by_brand(c, connection):
    """Return product IDs that share a brand with something ``c`` has bought.

    Products the customer has already purchased are excluded by the
    ``WHERE NOT (c)-[:PURCHASED]->(r)`` predicate.

    Args:
        c: object exposing ``customerID``.
        connection: object whose ``query(text)`` returns a result with a
            ``.data()`` method yielding rows indexable by ``'productID'``.

    Returns:
        list of the distinct ``productID`` values returned by the query.
    """
    # Every clause ends with a space so the concatenated statement never
    # produces run-together tokens such as ")MATCH".
    query = (
        f'MATCH (c:Customer {{customerID: toInteger({c.customerID})}})'
        '-[:PURCHASED]->(p:Product) '
        'MATCH (p)-[:HAS_BRAND]->(b:Brand) '
        'MATCH (b)<-[:HAS_BRAND]-(r:Product) '
        'WHERE NOT (c)-[:PURCHASED]->(r) '
        'RETURN DISTINCT r.productID as productID'
    )
    rows = connection.query(query).data()
    return [row['productID'] for row in rows]

connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
print(connection)

# Use function to get recommendations by brand
brand_recommendations = rec_by_brand(customers[10], connection)
connection.close()
print(brand_recommendations)

# For each customer, buy one randomly chosen same-brand recommendation (if any).
for customer in customers:
    # One connection serves both the lookup and the write for this customer.
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        brand_recommendations = rec_by_brand(customer, connection)
        if brand_recommendations:
            product = random.choice(brand_recommendations)
            add_purchase(customer, product, datetime.now(), connection)
    finally:
        # Release the connection even if one of the queries fails.
        connection.close()

# Drawing on other customers purchases
def rec_by_copurchase(c, connection):
    """Return product IDs bought by customers who share a purchase with ``c``.

    For every product ``p`` bought by ``c``, other customers who bought ``p``
    are found and their other purchases ``r`` are collected. Products that
    ``c`` has already purchased are excluded, so a recommendation is never
    something the customer already owns.

    Args:
        c: object exposing ``customerID``.
        connection: object whose ``query(text)`` returns a result with a
            ``.data()`` method yielding rows indexable by ``'productID'``.

    Returns:
        list of the distinct ``productID`` values returned by the query.
    """
    # `p <> r` alone only compares r with the single product on the current
    # row; the extra NOT (c)-[:PURCHASED]->(r) predicate removes anything the
    # customer has bought at all. Every clause ends with a space so clauses
    # never run together in the concatenated statement.
    query = (
        f'MATCH (c:Customer {{customerID: toInteger({c.customerID})}})'
        '-[:PURCHASED]->(p:Product) '
        'MATCH (p)<-[:PURCHASED]-(c2:Customer) '
        'WHERE c2 <> c '
        'MATCH (c2)-[:PURCHASED]->(r:Product) '
        'WHERE p <> r AND NOT (c)-[:PURCHASED]->(r) '
        'RETURN DISTINCT r.productID as productID'
    )
    rows = connection.query(query).data()
    return [row['productID'] for row in rows]

# Look at recommending products based on other customers recommendations
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
copurchase_recommendations = rec_by_copurchase(customers[10], connection)
connection.close()
print(copurchase_recommendations)

# For each customer, buy one randomly chosen co-purchase recommendation (if any).
for customer in customers:
    # One connection serves both the lookup and the write for this customer.
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        copurchase_recommendations = rec_by_copurchase(customer, connection)
        if copurchase_recommendations:
            product = random.choice(copurchase_recommendations)
            add_purchase(customer, product, datetime.now(), connection)
    finally:
        # Release the connection even if one of the queries fails.
        connection.close()

# Use similarity scores to recommend products
def get_customer_purchases(c_id, connection):
    """Return the distinct product IDs purchased by customer ``c_id``.

    Args:
        c_id: customer identifier, interpolated into ``toInteger(...)``.
        connection: object whose ``query(text)`` returns a result with a
            ``.data()`` method yielding rows indexable by ``'productID'``.

    Returns:
        list of ``productID`` values; empty when the query returns no rows.
    """
    # The pattern ends with a space so RETURN is not glued to the closing
    # parenthesis of the node pattern.
    query = (
        f'MATCH (c:Customer {{customerID: toInteger({c_id})}})'
        '-[:PURCHASED]->(p:Product) '
        'RETURN DISTINCT p.productID as productID'
    )
    rows = connection.query(query).data()
    return [row['productID'] for row in rows]

connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
purchases = get_customer_purchases(customers[10].customerID, connection)
connection.close()
print(purchases)

# Map every customer ID to the list of product IDs that customer has purchased.
connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
try:
    all_purchases = {customer.customerID: get_customer_purchases(customer.customerID, connection)
                     for customer in customers}
finally:
    # This connection was previously left open; close it once the lookups finish.
    connection.close()

print(all_purchases)

# Create Jaccard similarity to look at similar purchases
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections.

    Both inputs are converted to sets, so duplicates and ordering are ignored.
    The score is ``len(intersection) / len(union)``.

    Args:
        list1: iterable of hashable items.
        list2: iterable of hashable items.

    Returns:
        A value between 0 and 1. When both inputs are empty the union is
        empty too; ``0.0`` is returned in that case instead of dividing by
        zero, so two customers with no purchases are not treated as similar.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

list1 = [1, 2, 3, 4]
list2 = [5, 6, 7, 8]
list3 = [1, 2]
assert jaccard_similarity(list1, list1) == 1
assert jaccard_similarity(list1, list2) == 0
assert jaccard_similarity(list1, list3) == 0.5

import itertools
customer_ids = list(all_purchases.keys())
print(customer_ids)
customer_pairs = list(itertools.combinations(customer_ids, 2))
print(customer_pairs[:10])

# Get similarity
similarity = {pair: jaccard_similarity(all_purchases[pair[0]], all_purchases[pair[1]]) for pair in customer_pairs}
print(similarity)

from collections import Counter
grouped_similiarities = Counter(similarity.values())
print(grouped_similiarities)

def add_purchase_id(c_id, productID, time, connection):
    """Record a purchase of ``productID`` by the customer with ID ``c_id``.

    Takes the customer ID directly rather than a customer object. The Customer
    and Product nodes are matched by integer ID and a ``PURCHASED``
    relationship with ``time`` as its ``datetime`` property is MERGEd between
    them via ``connection.query``.
    """
    match_customer = f'MATCH (c:Customer {{customerID: toInteger({c_id})}}) '
    match_product = f'MATCH (p:Product {{productID: toInteger({productID})}}) '
    merge_purchase = f'MERGE (c)-[:PURCHASED {{datetime:"{time}"}}]->(p)'
    connection.query(match_customer + match_product + merge_purchase)

threshold = 0.7

def rec_by_similarity(c1, c2, threshold, connection):
    """Cross-recommend purchases between two sufficiently similar customers.

    The purchase lists of customer IDs ``c1`` and ``c2`` are fetched and
    scored with ``jaccard_similarity``. When the score is at least
    ``threshold`` but not exactly 1, each customer is given (as a new
    purchase) every product the other one has that they lack.
    """
    purchases_1 = get_customer_purchases(c1, connection)
    purchases_2 = get_customer_purchases(c2, connection)
    score = jaccard_similarity(purchases_1, purchases_2)

    # Nothing to do when the pair is too dissimilar, or identical (score of 1
    # means neither customer has anything the other lacks).
    if not (score >= threshold) or score == 1:
        return

    # Products only the second customer has are added for the first ...
    for product in purchases_2:
        if product not in purchases_1:
            add_purchase_id(c1, product, datetime.now(), connection)
    # ... and products only the first customer has are added for the second.
    for product in purchases_1:
        if product not in purchases_2:
            add_purchase_id(c2, product, datetime.now(), connection)

# Apply similarity-based recommendations to every pair of customers.
for first_id, second_id in customer_pairs:
    connection = Neo4jConnect('bolt://localhost:7687', 'admin', 'testpython')
    try:
        # Use the `threshold` defined above instead of repeating the literal.
        rec_by_similarity(first_id, second_id, threshold, connection)
    finally:
        # Release the connection even if a query fails for this pair.
        connection.close()