In [3]:
#pip install pinecone-client
#!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


##### Pinecone Indexing

In [5]:
from pinecone import Pinecone, ServerlessSpec
from nltk.tokenize import sent_tokenize
import os
from gensim.models import Word2Vec
import numpy as np
import gensim.downloader
import math
import time

  from tqdm.autonotebook import tqdm


In [6]:
def read_text_from_folder(folder_path):
    text = ""
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text += file.read() + " "  # Concatenate text from each file
    return text

In [7]:
word2vec_model = gensim.downloader.load("word2vec-google-news-300")

In [8]:
#need to look into it. generating random vector is not correct

def get_sentence_vector(sentence):
    word_vectors = [word2vec_model[word] for word in sentence.split() if word in word2vec_model]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.random.rand(word2vec_model.vector_size)

In [9]:
#read text
olevel_folder = "ConvertedBooks/Olevel"
text = read_text_from_folder(olevel_folder)

In [10]:
# Tokenize sentences
tokenized_sentences = sent_tokenize(text)
print(tokenized_sentences)



In [11]:
# Connect to Pinecone
pinecone = Pinecone(api_key="9a10d0ba-0341-4c72-bee8-180e4963546a")

In [12]:
#skip this if data already upserted

index_prefix="rag-history-olevel-"
batch_size = 2250
num_batches = math.ceil(len(tokenized_sentences) / batch_size)
index_names=[]
vector_sentence_map = {}

for i in range(num_batches):
    index_name = index_prefix + str(i)
    index_names.append(index_name)
    vectors = []
    start_index = i * batch_size
    end_index = min((i + 1) * batch_size, len(tokenized_sentences))

    try:
        index = pinecone.create_index(
                    name=index_name,
                    metric="cosine",
                    dimension=300,
                    spec=ServerlessSpec(
                        cloud='aws', 
                        region='us-east-1'
                    ) 
                )
    except Exception as e:
        print(f"Index already exists '{index_name}'")

    # Wait for a short period to allow the index to become available
    retries = 5
    for attempt in range(retries):
        try:
            index = pinecone.Index(index_name)
            break  # Exit the loop if the index is successfully retrieved
        except Exception as e:
            if attempt < retries - 1:
                #print(f"Attempt {attempt+1}/{retries}: Error retrieving index '{index_name}': {e}. Retrying after 2 seconds.")
                time.sleep(2)
            else:
                print(f"Failed to retrieve index '{index_name}' after {retries} attempts.")
                break

    #index = pinecone.Index(index_name)
    
    for j, sentence in enumerate(tokenized_sentences[start_index:end_index]):
        vector_id = f"vec{j + start_index}"  # Generate a unique ID for each vector
        vector_sentence_map[vector_id] = sentence
        vector_values = get_sentence_vector(sentence)  # Generate the sentence vector
        vector_metadata = {"index": j + start_index}  # You can add metadata if needed
        vectors.append({"id": vector_id, "values": vector_values, "metadata": vector_metadata})

    # Upsert vectors into the index in batches
    batch_size_limit = 1000  # Set a limit for the number of vectors per batch
    batch_num=0
    for k in range(0, len(vectors), batch_size_limit):
        batch_num += 1
        batch_vectors = vectors[k:k+batch_size_limit]
        index.upsert(vectors=batch_vectors, namespace="ns1")
        print(f"Batch {batch_num} upserted successfully into index '{index_name}'")

    # Upsert vectors into the index
    #index.upsert(vectors=vectors, namespace="ns1")

Index already exists 'rag-history-olevel-0'
Batch 1 upserted successfully into index 'rag-history-olevel-0'
Batch 2 upserted successfully into index 'rag-history-olevel-0'
Batch 3 upserted successfully into index 'rag-history-olevel-0'
Index already exists 'rag-history-olevel-1'
Batch 1 upserted successfully into index 'rag-history-olevel-1'
Batch 2 upserted successfully into index 'rag-history-olevel-1'
Batch 3 upserted successfully into index 'rag-history-olevel-1'
Index already exists 'rag-history-olevel-2'
Batch 1 upserted successfully into index 'rag-history-olevel-2'
Batch 2 upserted successfully into index 'rag-history-olevel-2'
Batch 3 upserted successfully into index 'rag-history-olevel-2'
Index already exists 'rag-history-olevel-3'
Batch 1 upserted successfully into index 'rag-history-olevel-3'
Batch 2 upserted successfully into index 'rag-history-olevel-3'
Batch 3 upserted successfully into index 'rag-history-olevel-3'
Index already exists 'rag-history-olevel-4'
Batch 1 upse

In [13]:
# Was the treaty of Versailles fair?
# Who was to blame for the cold war?
# what were the consequences of the 1948-49 revolution?

matches = []
result_sentences = []
query_sentence = "Was the treaty of Versailles fair"

for index_name in index_names[:-1]:
    index = pinecone.Index(index_name)
    query_vector = get_sentence_vector(query_sentence)
    results=index.query(
                namespace="ns1",
                vector=query_vector.tolist(),
                top_k=5,
                include_values=True,
                include_metadata=True,
            )    
    matches.extend(results['matches'])
    
for match in matches:
    vector_id = match['id']
    distance = match['score']
    metadata = match['metadata']
    index_name = metadata['index']    
    original_sentence = vector_sentence_map.get(vector_id, "Unknown sentence")
    result_sentences.append(original_sentence)
    print(f"Original Sentence: {original_sentence}, Distance: {distance}, Metadata: {metadata}")


Original Sentence: The Treaty of Versailles was the treaty that
dealt with Germany., Distance: 0.792454779, Metadata: {'index': 674.0}
Original Sentence: [6]
    (c)  ‘The Treaty of Versailles was fair on Germany.’ How far do
you agree with this statement?, Distance: 0.742477477, Metadata: {'index': 683.0}
Original Sentence: Versailles
was a much less harsh treaty than Brest-Litovsk., Distance: 0.722801268, Metadata: {'index': 505.0}
Original Sentence: Hitler and the Treaty of Versailles
1  Draw up a table like this one to show some of the terms of the
Treaty of Versailles that affected Germany., Distance: 0.721350789, Metadata: {'index': 1555.0}
Original Sentence: FOCUS TASK 1.3
Was the Treaty of Versailles fair?, Distance: 0.718580246, Metadata: {'index': 384.0}
Original Sentence: Why do you
think Hungary’s membership of the Warsaw Pact was so
important to the Soviet Union?, Distance: 0.595060527, Metadata: {'index': 3782.0}
Original Sentence: NATO and the Warsaw Pact
During the bloc

##### BM25 Ranking

In [30]:
from rank_bm25 import BM25Okapi
import re

In [26]:
def is_not_question(sentence):
    question_pattern = r"\b(?:who|what|when|where|why|how|which|whom|whose)\b"
    return not re.search(question_pattern, sentence.lower())

In [27]:
non_question_sentences = [sentence for sentence in result_sentences if is_not_question(sentence)]
tokenized_results = [sentence.split() for sentence in non_question_sentences]
bm25 = BM25Okapi(tokenized_results)

query_sentence = "Was the treaty of Versailles fair"
query_tokens = query_sentence.split()
bm25_scores = bm25.get_scores(query_tokens)
matches_with_bm25 = list(zip(result_sentences, bm25_scores))
matches_with_bm25.sort(key=lambda x: x[1], reverse=True)

for match, bm25_score in matches_with_bm25:
    print(f"Match: {match}, BM25 Score: {bm25_score}")

Match: Hitler and the Treaty of Versailles
1  Draw up a table like this one to show some of the terms of the
Treaty of Versailles that affected Germany., BM25 Score: 4.252340844725419
Match: Was this right?, BM25 Score: 3.79045048054314
Match: In 1955 t he Soviet Union set up the
Warsaw Treaty Organisation, better known as the WARSAW PACT., BM25 Score: 3.589870581918368
Match: The Treaty of Versailles was the treaty that
dealt with Germany., BM25 Score: 3.4999632961277065
Match: NATO and the Warsaw Pact
During the blockade, war between the USSR and the USA seemed a real
possibility., BM25 Score: 2.802962510903557
Match: [6]
    (c)  ‘The Treaty of Versailles was fair on Germany.’ How far do
you agree with this statement?, BM25 Score: 2.395110301871466
Match: Versailles
was a much less harsh treaty than Brest-Litovsk., BM25 Score: 2.2256542085586073
Match: [6]
    (c)  ‘The Cold War was caused by the Soviet take-over of
eastern Europe.’ How far do you agree with this statement?, BM25 Sc

##### LLAMA

In [None]:
#!pip install replicate

In [42]:
import replicate

In [46]:
token = "r8_ZvX0jwt8xvmxdn0eKSgadNw0G5ux9sR33bKii"
top_5_sentences = [match[0] for match in matches_with_bm25[:5]]

# Define input parameters
input_params = {
    "top_p": 1,
    "prompt": "Generate proper answer using these sentences: \n\n".join(top_5_sentences),
    "temperature": 0.5,
    "system_prompt": "You are helping a student in his studies. So answer accordingly",
    "max_new_tokens": 500
}

# Initialize the Replicate client with your authentication token
client = replicate.Client(api_token=token)

# Make the API request with the authentication token
for event in client.stream(
    "meta/llama-2-70b-chat",
    input=input_params
):
    print(event, end="")

 Sure, I'd be happy to help you with that! Here's a table showing some of the terms of the Treaty of Versailles that affected Germany:

| Term | Description |
| --- | --- |
| Reparations | Germany was required to pay large sums of money in reparations to the Allied powers. |
| Territorial Losses | Germany lost significant territory, including Alsace-Lorraine, Saar, and the Polish Corridor. |
| Military Restrictions | Germany was limited in the size and composition of its military, and was prohibited from having an air force or submarines. |
| War Guilt | The treaty declared Germany responsible for causing the war, which became known as the "war guilt clause." |
| League of Nations | Germany was required to join the League of Nations, an international organization dedicated to promoting peace and security. |

Was this right?

It is generally considered that the Treaty of Versailles was too harsh on Germany, and that it contributed to the rise of the Nazi Party and the outbreak of World 