In [1]:
################################################
# ReRank
#
# Step 1: Import two libries: cohere and weaviate
# Step 2: Apply Dense Retrieval to a query
# Step 3: Improving Keyword Search with ReRank
# Step 4: Improving Dense Retrieval with ReRank
################################################

# ## Setup
# 
# Load needed API keys and relevant Python 
# libaries.

# !pip install cohere 
# !pip install weaviate-client

In [2]:
import os


In [3]:
%env COHERE_API_KEY=<key>
%env WEAVIATE_API_KEY=<key>
%env OPENAI_APIKEY=<key>

env: COHERE_API_KEY=Iy35XHurcf43TF1BP4D1NBi3jnqPkGeXGxPYtMWi
env: WEAVIATE_API_KEY=2kyTi1kzwEM3xd3AWCoD51WkivBVpaCZIhWc
env: OPENAI_APIKEY=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


In [5]:
%env WEAVIATE_API_URL=<your-url>

env: WEAVIATE_API_URL=https://rerank-kew6y20k.weaviate.network


In [6]:
################################################
# 1. Import two libries
################################################


################################################
# 1.1 Import cohere
################################################
import cohere
co = cohere.Client(os.getenv('COHERE_API_KEY'))

In [7]:
################################################
# 1.2 Import weaviate
################################################

import weaviate
auth_config = weaviate.auth.AuthApiKey(
    api_key=os.getenv('WEAVIATE_API_KEY'))


In [8]:
# In[74]:


client = weaviate.Client(
    url="https://rerank-kew6y20k.weaviate.network",
    auth_client_secret=auth_config,
    additional_headers={
        "X-Cohere-Api-Key": 
        os.getenv('COHERE_API_KEY'),
    "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")
    }
)

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [None]:
# client.schema.delete_all()  # ⚠️ uncomment to start from scratch by deleting ALL data

# ===== Create Article class for the schema =====
article_class = {
    "class": "Article",
    "description": "An article from the Simple English Wikipedia data set",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        # Match how OpenAI created the embeddings for the `content` (`text`) field
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "vectorizeClassName": False
        }
    },
    "properties": [
        {
            "name": "title",
            "description": "The title of the article",
            "dataType": ["text"],
            # Don't vectorize the title
            "moduleConfig": {"text2vec-openai": {"skip": True}}
        },
        {
            "name": "content",
            "description": "The content of the article",
            "dataType": ["text"],
        }
    ]
}

# Add the Article class to the schema
client.schema.create_class(article_class)
print('Created schema');

In [None]:
# ===== Import data =====
# Settings for displaying the import progress
counter = 0
interval = 100  # print progress every this many records

# Create a pandas dataframe iterator with lazy-loading,
# so we don't load all records in RAM at once.
import pandas as pd
csv_iterator = pd.read_csv(
    'vector_database_wikipedia_articles_embedded.csv',
    usecols=['id', 'url', 'title', 'text', 'content_vector'],
    chunksize=100,  # number of rows per chunk
    # nrows=350  # optionally limit the number of rows to import
)

# Iterate through the dataframe chunks and add each CSV record to the batch
import ast
client.batch.configure(batch_size=100)  # Configure batch
with client.batch as batch:
  for chunk in csv_iterator:
      for index, row in chunk.iterrows():

          properties = {
              "title": row.title,
              "content": row.text,
              "url": row.url
          }

          # Convert the vector from CSV string back to array of floats
          vector = ast.literal_eval(row.content_vector)

          # Add the object to the batch, and set its vector embedding
          batch.add_data_object(properties, "Article", vector=vector)

          # Calculate and display progress
          counter += 1
          if counter % interval == 0:
              print(f"Imported {counter} articles...")
print(f"Finished importing {counter} articles.")

In [9]:
client.is_ready()

True

In [10]:
from utils import dense_retrieval, print_result, keyword_search

In [11]:
# pip install -U weaviate-client

In [12]:
################################################
# 2. Dense Retrieval
################################################
from utils import dense_retrieval


query = "What is the capital of India?"


################################################
# 2.1 Apply Dense Retrieval to a query
################################################
dense_retrieval_results = dense_retrieval(query,client)


In [13]:
from utils import print_result

################################################
# 2.2 Print the result of the Dense Retrieval to 
#     a query
################################################
print(dense_retrieval_results)

[{'content': 'New Delhi () is the capital of India and a union territory of the megacity of Delhi. It has a very old history and is home to several monuments where the city is expensive to live in. In traditional Indian geography, it falls under the North Indian zone. The city has an area of about . New Delhi has a population of about 9.4 Million people.\n\nReferences', 'title': 'New Delhi'}, {'content': 'The Republic of India is divided into twenty-eight States and eight union territories, including the National Capital Territory.\n\nStates and capitals \n\nUnion territories:\n\n \nIndia-related lists', 'title': 'States and union territories of India'}, {'content': "Delhi (;  Dillī;  Dillī;  Dēhlī), officially the National Capital Territory of Delhi (NCT), is a territory in India. It includes the country's capital New Delhi. It covers an area of . It is bigger than the Faroe Islands but smaller than Guadeloupe. Delhi is a part of the National Capital Region, which has 12.5 million res

In [14]:
################################################
# 3. Improving Keyword Search with ReRank
################################################

from utils import keyword_search


query_1 = "What is the capital of India?"

In [15]:
################################################
# 3.1 Keyword Search with 3 results
################################################

results = keyword_search(query_1,
  client,
  properties=["title", "content"],
  num_results=3
  )

for i, result in enumerate(results):
    print(f"i:{i}")
    print(result.get('title'))
    print(result.get('text'))

i:0
Itanagar
None
i:1
North India
None
i:2
India
None


In [27]:
################################################
# 3.2 Keyword Search with 500 results
################################################
query_1 = "What is the capital of India?"
results = keyword_search(query_1,
   client,
   properties=["title", "content"],
   num_results=500
   )

for i, result in enumerate(results):
    print(f"i:{i}")
    print(result.get('title'))
    print(result.get('content'))

i:0
Itanagar
Itanagar is the capital of the Indian  state of Arunachal Pradesh.

Ita Fort is one of the most important historical sites in the state of Arunachal Pradesh.

Capital cities in India
Arunachal Pradesh
i:1
North India
Northern India, also known as Hindustan, is a land and cultural region of India. In traditional Indian geography, India is pieced into five major zones: North India, North-east India, East India, West India and South India.

As a linguistic-cultural and political region, North India consists of six Indian states: Himachal Pradesh, Uttarakhand, Haryana, Punjab, Uttar Pradesh and Rajasthan also The National Capital Territory of Delhi is also a part of northern India. It shares some of its cultural, historical, musical, and linguistic heritage with neighboring Pakistan and Jammu and Kashmir, which was part of British India empire prior to the 1947 Independence. The North Indian Plain is a large part of it.

Related pages
Church of North India

Other websites
 Amp

In [28]:
################################################
# 3.3 ReRank of the Keyword Search results
################################################
def rerank_responses(query, responses,num_responses=10):
    reranked_responses = co.rerank(
        model = 'rerank-english-v2.0',
        query = query,
        documents = responses,
        top_n = num_responses,
        )
    return reranked_responses

In [29]:
texts = [result.get('content') for result in 
         results]
print(texts)
reranked_text = rerank_responses(query_1, 
         texts)

for i, rerank_result in enumerate(reranked_text):
    print(f"i:{i}")
    print(f"{rerank_result}")
    print()

['Itanagar is the capital of the Indian  state of Arunachal Pradesh.\n\nIta Fort is one of the most important historical sites in the state of Arunachal Pradesh.\n\nCapital cities in India\nArunachal Pradesh', 'Northern India, also known as Hindustan, is a land and cultural region of India. In traditional Indian geography, India is pieced into five major zones: North India, North-east India, East India, West India and South India.\n\nAs a linguistic-cultural and political region, North India consists of six Indian states: Himachal Pradesh, Uttarakhand, Haryana, Punjab, Uttar Pradesh and Rajasthan also The National Capital Territory of Delhi is also a part of northern India. It shares some of its cultural, historical, musical, and linguistic heritage with neighboring Pakistan and Jammu and Kashmir, which was part of British India empire prior to the 1947 Independence. The North Indian Plain is a large part of it.\n\nRelated pages\nChurch of North India\n\nOther websites\n Ampur: A Virtu

In [30]:
################################################
# 4. Improving Dense Retrieval with ReRank
################################################


from utils import dense_retrieval
query_2 = "Who is the tallest person in history?"

In [31]:
################################################
# 4.1 Dense Retrieval of a new query
################################################
results = dense_retrieval(query_2,client)


for i, result in enumerate(results):
    print(f"i:{i}")
    print(result.get('title'))
    print(result.get('content'))
    print()

i:0
Robert Wadlow
Robert Pershing Wadlow (February 22, 1918 – July 15, 1940) was the tallest person who ever lived.

Early life
Robert Pershing Wadlow was born to Addie Johnson and Harold Wadlow in Alton, Illinois on February 22, 1918, and was the oldest of five children. During elementary school, they had to make a special desk for him because of his size. In 1936, after graduating from Alton High School, he enrolled in Shurtleff College with the intention of studying law.

Height
Robert Wadlow was normal at birth but started growing abnormally when he was two, after a double hernia operation. He was six feet tall at age six. By the time he was 17, he was eight feet tall. Because he was so tall, he got lots of attention and became very famous, but his bones were very weak and he had to wear leg braces. In 1940, he was walking in a Fourth of July parade, when one of his braces made his ankle get infected; he died on the 15th of that month. When he died he was 8' 11.1". Robert was 6' 5"

In [32]:
################################################
# 4.2 ReRank the Dense Retrieval of a 
#     new query
################################################
texts = [result.get('content') for result 
         in results]
reranked_text = rerank_responses(query_2, 
         texts)
for i, rerank_result in enumerate(
        reranked_text):
    print(f"i:{i}")
    print(f"{rerank_result}")
    print()

i:0
('id', 'bd0a4ae5-299b-4f33-80be-b10aff6bcc0e')

i:1
('results', [RerankResponseResultsItem(document=None, index=0, relevance_score=0.95945925), RerankResponseResultsItem(document=None, index=1, relevance_score=0.8264318), RerankResponseResultsItem(document=None, index=2, relevance_score=0.5664982), RerankResponseResultsItem(document=None, index=3, relevance_score=0.5366772), RerankResponseResultsItem(document=None, index=4, relevance_score=0.030559862)])

i:2

