In [None]:
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

# Creating a Scalable Volunteer-Project Recommender for WeServe

In this notebook, you will create a recommendation engine using Spark and Elasticsearch. Volunteer-Project's weighted score data is used to train a collaborative filtering model in Spark and export the trained model to Elasticsearch. Once exported, you can test your recommendations by querying Elasticsearch and displaying the results.

### _Prerequisites_

The notebook assumes you have installed Elasticsearch, the Elasticsearch vector-scoring plugin, Apache Spark and the Elasticsearch Spark connector detailed in the [setup steps](https://github.com/MLnick/elasticsearch-spark-recommender-demo/tree/master#steps).

## Overview

You will work through the following steps

1. Prepare the data
2. Use the Elasticsearch Spark connector to save it to Elasticsearch
3. Load weighted score data and train a collaborative filtering recommendation model using Spark MLlib
3. Save the model to Elasticsearch
4. Show recommendations using Elasticsearch vector scoring plugin

In [1]:
# first import a few utility methods that we'll use later on
from IPython.display import Image, HTML, display
# check PySpark is running
spark

### ES date needs to be in epoch + millisec only

In [2]:
PATH_TO_DATA = "/media/sf_mac_downloads"
# load ratings data
score = spark.read.csv(PATH_TO_DATA + "/WeightedScore.csv", header=True, inferSchema=True)
score.cache()
print("Number of scores: %i" % score.count())
print("Sample of scores:")
score.show(5)

Number of scores: 14080
Sample of scores:
+------+---------+-----+-------------+
|userId|projectId|score|    epochtime|
+------+---------+-----+-------------+
|     1|        1| 0.59|1489770485000|
|     1|        2| 3.93|1434876168000|
|     1|        3| 4.42|1456560157000|
|     1|        4| 3.68|1510301386000|
|     1|        5|  0.1|1483925429000|
+------+---------+-----+-------------+
only showing top 5 rows



In [3]:
projectInfo = spark.read.csv("/home/thol/Project-Team-13/mysql_bkp/projects.csv", header=True, inferSchema=True)
projectInfo.cache()
print("Number of Projects: %i" % projectInfo.count())
print("Sample of projects:")
projectInfo.show(5)

Number of Projects: 128
Sample of projects:
+---+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|ngoUserId|             ngoName|              region|                name|               scope|                need|       beneficiaries|             funding|             contact|            imageUrl|
+---+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  1|        7|Jayaprakash Naray...|Noida, Uttar Prad...|Vidya & Child: Ar...|Exposure to art i...|The JP Narayan Tr...|                250 |       INR 4,94,000 |jawahar.k@cii.in ...|https://source.un...|
|  2|       87| Deep Griha Society |3 urban slums: Pu...|Creches for worki...|This project is d...|The benefits of t...|300 families; ie

In [4]:
users = spark.read.csv(PATH_TO_DATA + "/Users.csv", header=True, inferSchema=True)
users.cache()
print("Number of Users: %i" % users.count())
print("Sample of Users:")
users.show(5)

Number of Users: 117
Sample of Users:
+------+----------+--------+-----------+--------------------+---------+--------+--------------------+--------------------+----------+------------+-----------+----------+
|userID|  userType|username|   password|               email|firstName|lastName|               image|         description|    region|  attribute0| attribute1|attribute2|
+------+----------+--------+-----------+--------------------+---------+--------+--------------------+--------------------+----------+------------+-----------+----------+
|     1|consultant|AGAJLZ69|XTU11ROA3MK|magna.sed@quislec...|      Lev|  Wilcox|https://source.un...|lectus rutrum urn...|        Z.|    children|   disaster|disability|
|     2|       ngo|BUBGNC71|SXP10DDC8HX|fringilla@consect...| Germaine|   Hayes|https://source.un...|luctus. Curabitur...|        AB|   education|environment|      lgbt|
|     3|consultant|XLZNRZ45|GEV31LTC7IE|Curabitur.ut@fauc...|      Leo|    Huff|https://source.un...|commodo hen

In [5]:
from elasticsearch import Elasticsearch

# test your ES instance is running
es = Elasticsearch()
es.info(pretty=True)

{u'cluster_name': u'elasticsearch',
 u'cluster_uuid': u'vtmKKHjSRn2fYuFt6gyrzQ',
 u'name': u'TTp40ht',
 u'tagline': u'You Know, for Search',
 u'version': {u'build_date': u'2017-03-23T03:31:50.652Z',
  u'build_hash': u'3adb13b',
  u'build_snapshot': False,
  u'lucene_version': u'6.4.1',
  u'number': u'5.3.0'}}

In [6]:
es.indices.delete(index="weserve")

{u'acknowledged': True}

In [7]:
create_index = {
    "settings": {
        "analysis": {
            "analyzer": {
                # this configures the custom analyzer we need to parse vectors such that the scoring
                # plugin will work correctly
                "payload_analyzer": {
                    "type": "custom",
                    "tokenizer":"whitespace",
                    "filter":"delimited_payload_filter"
                }
            }
        }
    },
    "mappings": {
        "score": {
          # this mapping definition sets up the fields for the rating events
          "properties": {
                "userId": {
                    "type": "integer"
                },
                "projectId": {
                    "type": "integer"
                },
                "score": {
                    "type": "double"
                },
                "epochtime": {
                    "type": "date"
                }
            }  
        },
        "users": {
            # this mapping definition sets up the metadata fields for the users
            "properties": {
                "@model": {
                    # this mapping definition sets up the fields for user factor vectors of our model
                    "properties": {
                        "factor": {
                            "type": "text",
                            "term_vector": "with_positions_offsets_payloads",
                            "analyzer" : "payload_analyzer"
                        },
                        "version": {
                            "type": "keyword"
                        },
                        "timestamp": {
                            "type": "date"
                        }
                    }
                }
            }
        },
        "projects": {
          # this mapping definition sets up the fields for the rating events
          "properties": {
                "@model": {
                    # this mapping definition sets up the fields for project factor vectors of our model
                    "properties": {
                        "factor": {
                            "type": "text",
                            "term_vector": "with_positions_offsets_payloads",
                            "analyzer" : "payload_analyzer"
                        },
                        "version": {
                            "type": "keyword"
                        },
                        "timestamp": {
                            "type": "date"
                        }
                    }
                }
            }  
        }
    }
}
# create index with the settings and mappings above
es.indices.create(index="weserve", body=create_index)

{u'acknowledged': True, u'shards_acknowledged': True}

In [8]:
# write ratings data
score.write.format("es").save("weserve/score")
# check write went ok
print("Dataframe count: %d" % score.count())
print("ES index count:  %d" % es.count(index="weserve", doc_type="score")['count'])

Dataframe count: 14080
ES index count:  14080


In [9]:
# test things out by retrieving a few rating event documents from Elasticsearch
es.search(index="weserve", doc_type="score", q="*", size=3)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'AWMjo_RVKiw5AfXZ8gux',
    u'_index': u'weserve',
    u'_score': 1.0,
    u'_source': {u'epochtime': 1438601747000,
     u'projectId': 7,
     u'score': 2.59,
     u'userId': 1},
    u'_type': u'score'},
   {u'_id': u'AWMjo_RVKiw5AfXZ8guy',
    u'_index': u'weserve',
    u'_score': 1.0,
    u'_source': {u'epochtime': 1446743909000,
     u'projectId': 8,
     u'score': 4.44,
     u'userId': 1},
    u'_type': u'score'},
   {u'_id': u'AWMjo_RVKiw5AfXZ8guz',
    u'_index': u'weserve',
    u'_score': 1.0,
    u'_source': {u'epochtime': 1479663225000,
     u'projectId': 9,
     u'score': 1.04,
     u'userId': 1},
    u'_type': u'score'}],
  u'max_score': 1.0,
  u'total': 14080},
 u'timed_out': False,
 u'took': 77}

In [10]:
es.count(index="weserve", doc_type="score", q="epochtime:[2016-01-01 TO 2016-02-01]")
#es.search(index="weserve", doc_type="score", q="projectId:14", size=10)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, u'count': 415}

In [12]:
# write projectInfo data
projectInfo.write.format("es").option("es.mapping.id", "id").save("weserve/projects")
# check load went ok
print("projectInfo DF count: %d" % projectInfo.count())
print("ES index count: %d" % es.count(index="weserve", doc_type="projects")['count'])

projectInfo DF count: 128
ES index count: 128


In [13]:
es.search(index="weserve", doc_type="projects", q="id:[8 TO 10]")

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'8',
    u'_index': u'weserve',
    u'_score': 1.0,
    u'_source': {u'beneficiaries': u'Primary beneficiaries: 150 in 2011, 550 in 2012, and 1000 in 2013 ',
     u'contact': u'/ sarbani.chakravarty@cii.in / 0124-401 4535 (D), ',
     u'funding': u'INR 30,40,000 (in 2012) ',
     u'id': 8,
     u'imageUrl': u'https://source.unsplash.com/ViEBSoZH6M4',
     u'name': u'Broadening access to tertiary education ',
     u'need': u'There is a systemic bias against poor students gaining admission to and succeeding at Indias premier universities. Students from poor homes have limited access to additional coaching for competitive exams due to the high fees charged for such tutoring, a',
     u'ngoName': u'Avanti Fellows ',
     u'ngoUserId': 106,
     u'region': u'Mumbai, Chennai, Delhi, Kanpur ',
     u'scope': u'Avanti Fellows identifies bright high school students from poor homes and provides them mentor

### Load Elastic Search data to Spark DataFrame

In [14]:
#This data already in score DF, but still trying to create DF by reading from ES
score_from_es = spark.read.format("es").load("weserve/score")
score_from_es.show(5)

+-------------------+---------+-----+------+
|          epochtime|projectId|score|userId|
+-------------------+---------+-----+------+
|2015-06-21 01:42:48|        2| 3.93|     1|
|2015-12-23 02:33:44|       24| 4.68|     1|
|2017-08-10 02:29:32|       25| 1.41|     1|
|2018-01-31 01:06:31|       27| 3.46|     1|
|2015-09-09 06:06:36|       29| 3.71|     1|
+-------------------+---------+-----+------+
only showing top 5 rows



In [15]:
#This data already in score DF, but still trying to create DF by reading from ES
proj_from_es = spark.read.format("es").load("weserve/projects")
proj_from_es.show(5)

+------+--------------------+--------------------+--------------------+---+--------------------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|@model|       beneficiaries|             contact|             funding| id|            imageUrl|                name|                need|             ngoName|ngoUserId|              region|               scope|
+------+--------------------+--------------------+--------------------+---+--------------------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+
|  null|              1200+ |jawahar.k@cii.in ...|      INR 20,65,400 |  5|https://source.un...|Seed your Dreams ...|The backbone of t...|New Resolution In...|       66|        Maharashtra |This programme ho...|
|  null|Primary beneficia...|/ sarbani.chakrav...|INR 30,40,000 (in...|  8|https://source.un...|Broadening access...|There is a system...|     Avanti Fe

In [16]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
als = ALS(userCol="userId", itemCol="projectId", ratingCol="score", regParam=0.01, rank=20, seed=12)
model = als.fit(score_from_es)
model.userFactors.show(5)
model.itemFactors.show(5)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[0.10417445, 0.31...|
| 20|[0.14441033, 0.44...|
| 30|[-0.13521, 0.2008...|
| 40|[-0.14175962, -0....|
| 50|[0.62669754, 0.46...|
+---+--------------------+
only showing top 5 rows

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[0.7050928, -0.61...|
| 20|[0.81325096, -0.0...|
| 30|[0.47546232, 0.25...|
| 40|[1.1653336, 0.027...|
| 50|[0.38668287, 0.49...|
+---+--------------------+
only showing top 5 rows



In [17]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf, lit, current_timestamp, unix_timestamp

def convert_vector(x):
    '''Convert a list or numpy array to delimited token filter format'''
    return " ".join(["%s|%s" % (i, v) for i, v in enumerate(x)])

def reverse_convert(s):
    '''Convert a delimited token filter format string back to list format'''
    return  [float(f.split("|")[1]) for f in s.split(" ")]

def vector_to_struct(x, version, ts):
    '''Convert a vector to a SparkSQL Struct with string-format vector and version fields'''
    return (convert_vector(x), version, ts)

vector_struct = udf(vector_to_struct, \
                    StructType([StructField("factor", StringType(), True), \
                                StructField("version", StringType(), True),\
                                StructField("timestamp", LongType(), True)]))

In [18]:
# test out the vector conversion function
test_vec = model.userFactors.select("features").first().features
print(test_vec)
print()
print(convert_vector(test_vec))

[0.10417445003986359, 0.3149499297142029, -0.06789375841617584, 0.26035580039024353, 0.22476743161678314, 0.25926652550697327, -0.28272002935409546, -0.6264508366584778, 0.1813039630651474, -0.7453020811080933, -0.4947234094142914, -0.20996421575546265, -0.16069133579730988, 0.14927813410758972, 0.242600217461586, -0.3454575836658478, -1.3579528331756592, -0.0057459017261862755, -0.15920588374137878, 0.14256320893764496]
()
0|0.10417445004 1|0.314949929714 2|-0.0678937584162 3|0.26035580039 4|0.224767431617 5|0.259266525507 6|-0.282720029354 7|-0.626450836658 8|0.181303963065 9|-0.745302081108 10|-0.494723409414 11|-0.209964215755 12|-0.160691335797 13|0.149278134108 14|0.242600217462 15|-0.345457583666 16|-1.35795283318 17|-0.00574590172619 18|-0.159205883741 19|0.142563208938


In [19]:
ver = model.uid
ts = unix_timestamp(current_timestamp())
project_vectors = model.itemFactors.select("id", vector_struct("features", lit(ver), ts).alias("@model"))
project_vectors.select("id", "@model.factor", "@model.version", "@model.timestamp").show(5)
user_vectors = model.userFactors.select("id", vector_struct("features", lit(ver), ts).alias("@model"))
user_vectors.select("id", "@model.factor", "@model.version", "@model.timestamp").show(5)

+---+--------------------+--------------------+----------+
| id|              factor|             version| timestamp|
+---+--------------------+--------------------+----------+
| 10|0|0.705092787743 ...|ALS_4c8f80826b319...|1525311449|
| 20|0|0.81325095892 1...|ALS_4c8f80826b319...|1525311449|
| 30|0|0.475462317467 ...|ALS_4c8f80826b319...|1525311449|
| 40|0|1.16533362865 1...|ALS_4c8f80826b319...|1525311449|
| 50|0|0.386682868004 ...|ALS_4c8f80826b319...|1525311449|
+---+--------------------+--------------------+----------+
only showing top 5 rows

+---+--------------------+--------------------+----------+
| id|              factor|             version| timestamp|
+---+--------------------+--------------------+----------+
| 10|0|0.10417445004 1...|ALS_4c8f80826b319...|1525311450|
| 20|0|0.144410327077 ...|ALS_4c8f80826b319...|1525311450|
| 30|0|-0.135210007429...|ALS_4c8f80826b319...|1525311450|
| 40|0|-0.141759619117...|ALS_4c8f80826b319...|1525311450|
| 50|0|0.626697540283 ...|ALS_4

In [20]:
project_vectors.show(12)

+---+--------------------+
| id|              @model|
+---+--------------------+
| 10|[0|0.705092787743...|
| 20|[0|0.81325095892 ...|
| 30|[0|0.475462317467...|
| 40|[0|1.16533362865 ...|
| 50|[0|0.386682868004...|
| 60|[0|0.304642319679...|
| 70|[0|0.19795666635 ...|
| 80|[0|0.505135059357...|
| 90|[0|0.354219198227...|
|100|[0|1.75060594082 ...|
|110|[0|0.856565415859...|
|120|[0|0.946621417999...|
+---+--------------------+
only showing top 12 rows



In [21]:
projectInfo.show(12)

+---+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|ngoUserId|             ngoName|              region|                name|               scope|                need|       beneficiaries|             funding|             contact|            imageUrl|
+---+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  1|        7|Jayaprakash Naray...|Noida, Uttar Prad...|Vidya & Child: Ar...|Exposure to art i...|The JP Narayan Tr...|                250 |       INR 4,94,000 |jawahar.k@cii.in ...|https://source.un...|
|  2|       87| Deep Griha Society |3 urban slums: Pu...|Creches for worki...|This project is d...|The benefits of t...|300 families; ie:...|      INR 18,72,433 |/ sarbani.chakrav.

In [22]:
# write data to ES, use:
# - "id" as the column to map to ES project id
# - "update" write mode for ES, since you want to update new fields only
project_vectors.write.format("es") \
    .option("es.mapping.id", "id") \
    .option("es.write.operation", "update") \
    .save("weserve/projects", mode="append")

In [23]:
# write data to ES, use:
# - "id" as the column to map to ES project id
# - "index" write mode for ES, since you have not written to the user index previously
# - "append" write mode for Spark
user_vectors.write.format("es") \
    .option("es.mapping.id", "id") \
    .option("es.write.operation", "index") \
    .save("weserve/users", mode="append")

In [24]:
# search project
es.search(index="weserve", doc_type="projects", q="id:[11 TO 12]", size=2)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'12',
    u'_index': u'weserve',
    u'_score': 1.0,
    u'_source': {u'@model': {u'factor': u'0|1.21462976933 1|0.0590036734939 2|0.34807151556 3|0.204049572349 4|-0.172221601009 5|-0.0699132084846 6|-0.0970888361335 7|0.0661358982325 8|1.6382496357 9|0.630910098553 10|0.652237951756 11|-0.482777565718 12|-0.550349533558 13|-0.699180662632 14|0.257830500603 15|-0.964305818081 16|-2.44421744347 17|-0.0355373956263 18|0.59566283226 19|0.0523681528866',
      u'timestamp': 1525311470,
      u'version': u'ALS_4c8f80826b319208ec9e'},
     u'beneficiaries': u'100 + ',
     u'contact': u'/ sarbani.chakravarty@cii.in / 0124-401 4535 (D), ',
     u'funding': u'INR 32,22,022 ',
     u'id': 12,
     u'imageUrl': u'https://source.unsplash.com/z3jeIZHa3EY',
     u'name': u'Rehabilitation centre for street children ',
     u'need': u'This project is for the livelihood of children and to stint their wrong habi

In [25]:
# search user
es.search(index="weserve", doc_type="users", q="id:[11 TO 12]", size=2)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'12',
    u'_index': u'weserve',
    u'_score': 1.0,
    u'_source': {u'@model': {u'factor': u'0|-0.083082921803 1|-0.319834560156 2|0.0599472895265 3|-0.139686107635 4|0.419433891773 5|-0.0520416945219 6|-0.496550351381 7|-0.00555573543534 8|0.142739087343 9|0.0387709625065 10|-0.309721589088 11|-0.117304287851 12|0.107994966209 13|0.135474815965 14|-0.25818040967 15|0.195693925023 16|-0.867733597755 17|-0.438328832388 18|0.393337726593 19|1.44868993759',
      u'timestamp': 1525311476,
      u'version': u'ALS_4c8f80826b319208ec9e'},
     u'id': 12},
    u'_type': u'users'},
   {u'_id': u'11',
    u'_index': u'weserve',
    u'_score': 1.0,
    u'_source': {u'@model': {u'factor': u'0|0.0676947906613 1|0.518737137318 2|0.284130424261 3|0.911633372307 4|-0.506257295609 5|-0.0785790607333 6|0.712992608547 7|-0.151328489184 8|-0.0813689157367 9|0.317590296268 10|0.199215218425 11|0.319846510887 12|-0

# Get recommendations

In [26]:
from IPython.display import Image, HTML, display
 
def fn_query(query_vec, q="*", cosine=False):
    """
    Construct an Elasticsearch function score query.
    
    The query takes as parameters:
        - the field in the candidate document that contains the factor vector
        - the query vector
        - a flag indicating whether to use dot product or cosine similarity (normalized dot product) for scores
        
    The query vector passed in will be the user factor vector (if generating recommended projects for a user)
    or project factor vector (if generating similar projects for a given project)
    """
    return {
    "query": {
        "function_score": {
            "query" : { 
                "query_string": {
                    "query": q
                }
            },
            "script_score": {
                "script": {
                        "inline": "payload_vector_score",
                        "lang": "native",
                        "params": {
                            "field": "@model.factor",
                            "vector": query_vec,
                            "cosine" : cosine
                        }
                    }
            },
            "boost_mode": "replace"
        }
    }
}


def get_similar(the_id, q="*", num=10, index="weserve", dt="projects"):
    """
    Given a project id, execute the recommendation function score query to find similar projects, ranked by cosine similarity
    """
    response = es.get(index=index, doc_type=dt, id=the_id)
    src = response['_source']
    if '@model' in src and 'factor' in src['@model']:
        raw_vec = src['@model']['factor']
        # our script actually uses the list form for the query vector and handles conversion internally
        query_vec = reverse_convert(raw_vec)
        q = fn_query(query_vec, q=q, cosine=True)
        results = es.search(index, dt, body=q)
        hits = results['hits']['hits']
        return src, hits[1:num+1]
    
    
def get_user_recs(the_id, q="*", num=10, index="weserve"):
    """
    Given a user id, execute the recommendation function score query to find top projects, ranked by predicted rating
    """
    response = es.get(index=index, doc_type="users", id=the_id)
    src = response['_source']
    if '@model' in src and 'factor' in src['@model']:
        raw_vec = src['@model']['factor']
        # our script actually uses the list form for the query vector and handles conversion internally
        query_vec = reverse_convert(raw_vec)
        q = fn_query(query_vec, q=q, cosine=False)
        results = es.search(index, "projects", body=q)
        hits = results['hits']['hits']
        return src, hits[:num]

def get_projects_for_user(the_id, num=10, index="weserve"):
    """
    Given a user id, get the projects rated by that user, from highest- to lowest-rated.
    """
    response = es.search(index=index, doc_type="score", q="userId:%s" % the_id, size=num, sort=["score:desc"])
    hits = response['hits']['hits']
    ids = [h['_source']['projectId'] for h in hits]
    projects = es.mget(body={"ids": ids}, index=index, doc_type="projects", _source_include=['name'])
    project_hits = projects['docs']
    proj_names = [h['_source'] for h in project_hits]
    return proj_names

            
def display_user_recs(the_id, q="*", num=10, num_last=10, index="weserve"):
    user, recs = get_user_recs(the_id, q, num, index)
    user_projects = get_projects_for_user(the_id, num_last, index)
        
    # display the projcts that this user has rated highly
    display(HTML("<h2>Get recommended projects for user id %s</h2>" % the_id))
    display(HTML("<h4>The user has rated the following projects highly:</h4>"))
    user_html = "<table border=0>"
    i = 0
    for proj in user_projects:
        project_name = proj['name']
        user_html += "<td><h5>%s</h5></td>" % (project_name)
        i += 1
        if i % 5 == 0:
            user_html += "</tr><tr>"
    user_html += "</tr></table>"
    display(HTML(user_html))
    # now display the recommended projects for the user
    display(HTML("<br>"))
    display(HTML("<h2>User may also like:</h2>"))
    rec_html = "<table border=0>"
    i = 0
    for rec in recs:
        r_score = rec['_score']
        r_title = rec['_source']['name']
        rec_html += "<td><h5>%s</h5></td><td><h5>%2.3f</h5></td>" % (r_title, r_score)
        i += 1
        if i % 5 == 0:
            rec_html += "</tr><tr>"
    rec_html += "</tr></table>"
    display(HTML(rec_html))

    
def display_similar(the_id, q="*", num=10, index="weserve", dt="projects"):
    """
    Display projects, together with similar projects and similarity scores, in a table
    """
    project, recs = get_similar(the_id, q, num, index, dt)
        
    display(HTML("<h2>Get similar projects for:</h2>"))
    display(HTML("<h4>%s</h4>" % project['name']))
    display(HTML("<br>"))
    display(HTML("<h2>People who joined this project also joined these:</h2>"))
    sim_html = "<table border=0>"
    i = 0
    for rec in recs:
        r_score = rec['_score']
        r_title = rec['_source']['name']
        sim_html += "<td><h5>%s</h5></td><td><h5>%2.3f</h5></td>" % (r_title, r_score)
        i += 1
        if i % 5 == 0:
            sim_html += "</tr><tr>"
    sim_html += "</tr></table>"
    display(HTML(sim_html))

In [27]:
# find similar projects
display_similar(100, num=5)

In [29]:
display_similar(100, num=5, q="name:(NOT Psychotherapy)")

In [32]:
display_user_recs(12, num=5, num_last=5)

In [33]:
display_user_recs(12, num=5, num_last=5, q="id:[21 TO 30]")