### A1 Part-1 OpenHathi
#### Hallucinations and RAG

In [None]:
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import BitsAndBytesConfig
import bitsandbytes as bnb
from sentence_transformers import SentenceTransformer, util
from torch.nn.functional import cosine_similarity
import requests
from bs4 import BeautifulSoup
import accelerate


In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

##### LLM Preparation

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
model_id = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"

In [None]:
# 8-bit (LLM.int8) weight quantization via bitsandbytes.
#
# Double quantization, the NF4/FP4 quant type and the compute dtype are options
# of the 4-bit path only and are spelled `bnb_4bit_*`. `BitsAndBytesConfig` has
# no `bnb_8bit_*` keywords (and there is no 'nf8' quant type), so the previous
# extra arguments were ignored as unused kwargs. To use them, switch to
# `load_in_4bit=True` with the `bnb_4bit_*` arguments instead.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Load the tokenizer and the causal-LM weights for `model_id`.
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # NOTE(review): presumably applies to the modules that stay un-quantized — confirm
    device_map='auto',  # automatic device placement instead of an explicit .to(device)
    quantization_config=quantization_config,  # quantization settings built in the previous cell
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def get_model_size(model):
    """Print the total size of a model's parameters.

    The size is computed from each parameter's own storage width
    (`numel() * element_size()`) rather than assuming 2 bytes per parameter,
    so mixed-precision and quantized (e.g. int8) weights are reported
    correctly.

    Args:
        model: Any object exposing `parameters()` that yields tensors
            (for example a `torch.nn.Module`).

    Returns:
        None. The size is printed as ``Model size: X.XX GB`` where the unit
        is 1024**3 bytes.
    """
    total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    total_size_gb = total_size_bytes / (1024 ** 3)
    print(f"Model size: {total_size_gb:.2f} GB")

get_model_size(model)

Model size: 12.80 GB


In [None]:
model.device

device(type='cuda', index=0)

##### RAG Knowledge Base

In [None]:
# Sentence encoder used to embed both the knowledge-base passages and the queries.
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
def split_document_into_passages(document, passage_length=100, overlap=5):
    """Split a document into overlapping passages of whitespace-separated words.

    Args:
        document: Text to split. Words are obtained with `str.split()`.
        passage_length: Maximum number of words per passage.
        overlap: Number of words shared between consecutive passages. The
            default of 5 matches the stride that used to be hard-coded.

    Returns:
        A list of passages (strings). An empty document yields an empty list.

    Raises:
        ValueError: If `passage_length` is not positive or `overlap` is not in
            `[0, passage_length)`. Such values would give a zero or negative
            stride.
    """
    if passage_length <= 0:
        raise ValueError(f"passage_length must be positive, got {passage_length}")
    if not 0 <= overlap < passage_length:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < passage_length, "
            f"got overlap={overlap}, passage_length={passage_length}"
        )

    words = document.split()
    step = passage_length - overlap
    passages = []
    for start in range(0, len(words), step):
        passages.append(' '.join(words[start:start + passage_length]))
        # Once a window reaches the end of the document, any further window
        # would only repeat the tail of this passage, so stop here.
        if start + passage_length >= len(words):
            break
    return passages

In [None]:
def scrape_webpage(url, timeout=30):
    """Download a web page and return its visible text as overlapping passages.

    Text is collected from every `<p>`, `<strong>`, `<div>`, `<span>`,
    `<article>` and `<a>` element, joined with single spaces, and passed to
    `split_document_into_passages`.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            host cannot hang the notebook.

    Returns:
        A list of passage strings.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status; an
            error page should not end up in the knowledge base.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): these tags nest (a <div> usually contains the <p>s), so the
    # same text is collected more than once. The tag set is kept as-is to leave
    # the knowledge-base contents comparable; consider narrowing it.
    tag_names = ('p', 'strong', 'div', 'span', 'article', 'a')

    # separator=' ' keeps words from adjacent child elements apart; a bare
    # get_text() fuses them (e.g. "planet.The Sun").
    texts = [
        element.get_text(separator=' ', strip=True)
        for tag_name in tag_names
        for element in soup.find_all(tag_name)
    ]

    # Join every piece with a space so the last word of one tag group is not
    # glued to the first word of the next.
    content = ' '.join(text for text in texts if text)
    return split_document_into_passages(content)


In [None]:
# Source pages that are scraped into the RAG knowledge base.
urls = [
    # The Sun / solar system
    'https://science.nasa.gov/sun/facts/',
    # Ratan Tata and the cricket-reward claims
    'https://www.tatatrusts.org/about-tatatrusts/ratan-n-tata',
    'https://timesofindia.indiatimes.com/sports/cricket/icc-world-cup/news/world-cup-ratan-tata-denies-claims-about-announcing-reward-for-afghanistan-cricketers/articleshow/104817700.cms',
    'https://www.ndtv.com/india-news/no-connection-to-cricket-ratan-tata-refutes-claims-of-reward-for-rashid-khan-4526992',
    # Population of Delhi
    'https://worldpopulationreview.com/cities/india/delhi',
    # Hindi vs. Sanskrit
    'https://travelwithlanguages.com/blog/hindi-and-sanskrit.html',
    # Jay Shah / cricket administration
    'https://sports.ndtv.com/cricket/pakistan-board-chairman-mohsin-naqvi-to-replace-jay-shah-as-asian-body-chief-report-6466517',
    # Virat Kohli profile
    'https://notednames.com/Sports-Persons/Cricket-Player/Virat-Kohli-Birthday-Real-Name-Age-Weight-Height/',
    # Wiktionary entry; the percent-encoded path decodes to the Hindi word असली
    'https://en.wiktionary.org/wiki/%E0%A4%85%E0%A4%B8%E0%A4%B2%E0%A5%80',
    
]

In [None]:
# Flatten the passages scraped from every source URL into a single list.
knowledge_base = [passage for url in urls for passage in scrape_webpage(url)]

# Embed all passages once up front; retrieval reuses this tensor for every query.
passage_embeddings = embedding_model.encode(knowledge_base, convert_to_tensor=True)

In [None]:
# Show the knowledge-base size and a short preview instead of dumping every passage.
print(f"{len(knowledge_base)} passages")
knowledge_base[:3]

['Our Sun is a 4.5 billion-year-old yellow dwarf star – a hot glowing ball of hydrogen and helium – at the center of our solar system. It’s about 93 million miles (150 million kilometers) from Earth and it’s our solar system’s only star. Without the Sun’s energy, life as we know it could not exist on our home planet. The Sun is about 100 times wider than Earth and about 10 times wider than Jupiter, the biggest planet. The Sun is the only star in our solar system. It is the center of our solar system, and its gravity holds',
 'system, and its gravity holds the solar system together. Everything in our solar system revolves around it – the planets, asteroids, comets, and tiny bits of space debris. Measuring a “day” on the Sun is complicated. The Sun is made of super-hot, electrically charged gas called plasma. This plasma rotates at different speeds on different parts of the Sun. At its equator, the Sun completes one rotation in 25 Earth days. At its poles, the Sun rotates once on its axi

In [None]:
def retrieve_information(query, top_k=2):
    """Return the knowledge-base passages most similar to `query`.

    The query is embedded with `embedding_model` and compared by cosine
    similarity against the precomputed `passage_embeddings`.

    Args:
        query: Text to search for.
        top_k: Maximum number of passages to return. Values larger than the
            knowledge base are clamped instead of making `torch.topk` raise.

    Returns:
        A list of at most `top_k` passages, most similar first. Empty when the
        knowledge base is empty or `top_k` is not positive.
    """
    k = min(top_k, len(knowledge_base))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    # Make the query an explicit single-row batch so the comparison against the
    # passage matrix along dim=1 yields one score per passage.
    similarities = cosine_similarity(query_embedding.unsqueeze(0), passage_embeddings, dim=1)

    # Convert to plain ints before indexing the Python list.
    top_k_indices = torch.topk(similarities, k=k).indices.tolist()
    return [knowledge_base[idx] for idx in top_k_indices]

##### Response Generation without RAG

In [None]:
def generate_response(prompt, max_length=40):
    """Generate a completion for `prompt` without any retrieved context.

    NOTE: a later cell defines another `generate_response` (the RAG variant).
    On a top-to-bottom run that later definition replaces this one.

    Args:
        prompt: Text sent to the model as-is.
        max_length: Upper bound passed to `model.generate` as `max_length`,
            which counts the prompt tokens as well as the generated ones.

    Returns:
        A two-line string ``Prompt: ...`` / ``Response: ...``. When the decoded
        text contains the ``"\n---\n"`` separator, the part before it is shown
        as the prompt and the part after it as the response. Otherwise the
        original prompt and the whole decoded text are shown.
    """
    # Send the inputs to wherever the model's weights actually live.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generate_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
    )
    output = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    response = output[0].split("\n---\n")
    del inputs
    torch.cuda.empty_cache()
    # The model does not always emit the separator; fall back instead of
    # raising IndexError (same guard as the RAG variant).
    if len(response) < 2:
        return f'Prompt: {prompt}' + '\n' + f'Response: {response[0]}'
    return f'Prompt: {response[0]}' + '\n' + f'Response: {response[1]}'

##### Response Generation with RAG

In [None]:
def generate_response(prompt, top_k=None, max_new_tokens=40, print_retrieved_docs=True, max_length=None):
    """Generate an answer to `prompt`, optionally grounded in retrieved passages.

    This definition replaces the earlier `generate_response(prompt, max_length=40)`
    when the notebook runs top to bottom, so it serves both call styles:

    * ``generate_response(prompt)`` / ``generate_response(prompt, max_length=60)``
      sends the bare prompt to the model (no retrieval).
    * ``generate_response(prompt, top_k=2, ...)`` prepends the `top_k` most
      similar knowledge-base passages to the prompt (RAG).

    Args:
        prompt: The user question.
        top_k: Number of passages to retrieve. `None` or 0 disables retrieval.
        max_new_tokens: Number of tokens to generate beyond the model prompt.
            Ignored when `max_length` is given.
        print_retrieved_docs: Print the retrieved passages before generating.
        max_length: If given, passed to `model.generate` as `max_length`
            (prompt tokens included) instead of `max_new_tokens`.

    Returns:
        A two-line string ``Prompt: ...`` / ``Response: ...``.
    """
    retrieved_passages = retrieve_information(prompt, top_k=top_k) if top_k else []

    if retrieved_passages:
        if print_retrieved_docs:
            print("Retrieved facts: \n" + '\n'.join(f"{i}. {j}" for i, j in enumerate(retrieved_passages)) + '\n')
        model_prompt = "\n\n".join(retrieved_passages) + "\n\n" + prompt
    else:
        model_prompt = prompt

    if max_length is not None:
        length_kwargs = {"max_length": max_length}
    else:
        length_kwargs = {"max_new_tokens": max_new_tokens}

    # Send the inputs to wherever the model's weights actually live.
    inputs = tokenizer(model_prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs.input_ids.shape[1]
    generate_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        **length_kwargs,
    )
    full_text = tokenizer.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]
    # Text of the newly generated tokens only (the output starts with the prompt tokens).
    continuation = tokenizer.decode(
        generate_ids[0, prompt_token_count:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    del inputs, generate_ids
    torch.cuda.empty_cache()

    parts = full_text.split("\n---\n")
    if len(parts) >= 2:
        # With retrieval, parts[0] also holds the retrieved context, so show the
        # user's prompt. Without retrieval, show the model's echo of the prompt.
        shown_prompt = prompt if retrieved_passages else parts[0]
        answer = parts[1]
    else:
        # No separator emitted: report only what the model added, not an echo
        # of the context and prompt.
        shown_prompt = prompt
        answer = continuation.strip()
    return f'Prompt: {shown_prompt}' + '\n' + f'Response: {answer}'

##### Example Prompts - Fact Checking

Example 1:   
- Without RAG

In [None]:
# Hindi for: "Is the Sun the only star in the solar system?"
prompt = 'क्या सूर्य सूर्यमंडल का एकमात्र तारा है?'
output = generate_response(prompt)
print(output)

Prompt: क्या सूर्य सूर्यमंडल का एकमात्र तारा है?
Response: No, the Sun is not the only star in the Solar System. हमारे सौर मंडल में आठ ग्रह हैं, जिनमें पृथ्वी भी शामिल


- With RAG

In [None]:
# Hindi for: "Is the Sun the only star in the solar system?"
prompt = 'क्या सूर्य सूर्यमंडल का एकमात्र तारा है?'
output = generate_response(prompt, top_k=2, max_new_tokens=20)
print(output)

Retrieved facts: 
0. wider than Jupiter, the biggest planet.The Sun is the only star in our solar system. It is the center of our solar system, and its gravity holds the solar system together. Everything in our solar system revolves around it – the planets, asteroids, comets, and tiny bits of space debris.Measuring a “day” on the Sun is complicated. The Sun is made of super-hot, electrically charged gas called plasma. This plasma rotates at different speeds on different parts of the Sun. At its equator, the Sun completes one rotation in 25 Earth days. At its poles, the Sun rotates once on
1. Our Sun is a 4.5 billion-year-old yellow dwarf star – a hot glowing ball of hydrogen and helium – at the center of our solar system. It’s about 93 million miles (150 million kilometers) from Earth and it’s our solar system’s only star. Without the Sun’s energy, life as we know it could not exist on our home planet. The Sun is about 100 times wider than Earth and about 10 times wider than Jupiter, t

Example 2:   
- Without RAG

In [None]:
# Hindi for: "Has Ratan Tata ever played for the Indian cricket team?"
prompt = 'रतन टाटा कभी भारतीय क्रिकेट टीम के लिए खेले हैं?'
output = generate_response(prompt, max_length=40)
print(output)

Prompt: रतन टाटा कभी भारतीय क्रिकेट टीम के लिए खेले हैं?
Response: Yes, Tata has played for the Indian cricket team in the past. वह 1980 के दशक के


- With RAG

In [None]:
# Hindi for: "Has Ratan Tata ever played for the Indian cricket team?"
prompt = 'रतन टाटा कभी भारतीय क्रिकेट टीम के लिए खेले हैं?'
output = generate_response(prompt, top_k=2)
print(output)

Retrieved facts: 
0. it clear that he has had no engagement with the International Cricket Council (ICC) or any cricket players.He emphasised that there is no connection between him and the world of cricket."I have made no suggestions to the ICC or any cricket faculty about any cricket member regarding a fine or reward to any players. I have no connection to cricket whatsoever. Please do not believe WhatsApp forwards and videos of such nature unless they come from my official platforms," Ratan Tata posted on social media platform X.The refutation comes in the wake of WhatsApp forwards and videos spreading false information,
1. rewards, or advice in a decisive move to address ongoing speculations. Speaking out against the false claims that had been circulating on social media, Tata made it clear that he has had no engagement with the International Cricket Council (ICC) or any cricket players.He emphasised that there is no connection between him and the world of cricket."I have made no s

: 

Example 3:   
- Without RAG

In [None]:
# Hindi for: "Hindi is older than Sanskrit, yes or no?"
prompt = 'हिंदी संस्कृत से पुरानी है, हाँ या नहीं?'

output = generate_response(prompt)
print(output)

Prompt: हिंदी संस्कृत से पुरानी है, हाँ या नहीं?
Response: Yes, Hindi is older than Sanskrit. हिंदी एक इंडो-आर्यन भाषा है जो भारत में


- With RAG

In [None]:
# Hindi for: "Hindi is older than Sanskrit, yes or no?"
prompt = 'हिंदी संस्कृत से पुरानी है, हाँ या नहीं?'

output = generate_response(prompt, top_k=4)
print(output)

Retrieved facts: 
0. English English Hindi computer कंप्यूटर (kampyootar) internet इंटरनेट (intaranet) hotel होटल (hotal) radio रेडियो (rediyo) cinema सिनेमा (sinema) Sanskrit is a much older language than Hindi Sanskrit has been around for over 3 thousand years (the oldest known Sanskrit text, the Rigveda, was written between 1500 and 1000 BCE). Hindi is less than one thousand years old. So, it is a much more recent language than Sanskrit. The use of Sanskrit spans a very long period, so there are differences between the earlier form (Vedic Sanskrit) and the later form (Classical Sanskrit). The Devanagari script is used for writing both
1. राजन् (rājan) king सिंह (sinh) सिंह (simha) lion During recent centuries, Sanskrit has remained relatively free from outside linguistic influences, whereas Hindi has absorbed many words from other languages —Persian and English, in particular. The Mughal Empire ruled the Indian subcontinent between the 16th and the 19th centuries. The Taj Mahal was 

##### Example Prompts - Self-consistency

Example 1:   
- Without RAG

In [None]:
# Three paraphrases of the same question, to compare the answers for consistency.

# Hindi for: "What is the population of Delhi in 2024?"
prompt = '2024 में दिल्ली की जनसंख्या क्या है?'
output1 = generate_response(prompt)
print(output1)

# Hindi for: "How many people live in Delhi in 2024?"
prompt = '2024 में दिल्ली में कितने लोग रहते हैं?'
output2 = generate_response(prompt)
print(output2)

# Hindi for: "What is the count of residents living in Delhi in 2024?"
prompt = '2024 में दिल्ली में रहने वाले निवासियों की गिनती क्या है?'
output3 = generate_response(prompt)
print(output3)

Prompt: दिल्ली की जनसंख्या क्या है?
Response: As of 2021, the population of Delhi is approximately 19.1 million people. यह भारत की राजधानी और सबसे अधिक
Prompt: दिल्ली में कितने लोग रहते हैं?
Response: As of 2021, the population of Delhi is approximately 18.9 million people. यह भारत की राजधानी और सबसे
Prompt: दिल्ली में रहने वाले निवासियों की गिनती क्या है?
Response: I don't have access to the latest population data for new delhi. हालाँकि, 201


- With RAG

In [None]:
# Three paraphrases of the same question; retrieved passages are printed for the first one only.

# Hindi for: "What is the population of Delhi in 2024?"
prompt = '2024 में दिल्ली की जनसंख्या क्या है?'
output1 = generate_response(prompt, top_k=1)
print(output1)

# Hindi for: "How many people live in Delhi in 2024?"
prompt = '2024 में दिल्ली में कितने लोग रहते हैं?'
output2 = generate_response(prompt, top_k=1, print_retrieved_docs=False)
print(output2)

# Hindi for: "What is the count of residents living in Delhi in 2024?"
prompt = '2024 में दिल्ली में रहने वाले निवासियों की गिनती क्या है?'
output3 = generate_response(prompt, top_k=1, print_retrieved_docs=False)
print(output3)

Retrieved facts: 
0. Delhi's 2024 population is now estimated at 33,807,403. In 1950, the population of Delhi was 1,369,369. Delhi has grown by 866,094 in the last year, which represents a 2.63% annual change.These population estimates and projections come from the latest revision of the UN World Urbanization Prospects. These estimates represent the Urban agglomeration of Delhi, which typically includes Delhi's population in addition to adjacent suburban areas. Delhi, or the National Capital Territory (NCT) of India, is a large metropolitan area in India. Delhi is the fifth most populous city in the world and the largest city in India area-wise. Delhi has

Prompt: 2024 में दिल्ली की जनसंख्या क्या है?
Response: Delhi's population in 2024 is estimated to be 33,807,403.
Prompt: 2024 में दिल्ली में कितने लोग रहते हैं?
Response: In 2024, Delhi is estimated to have a population of 33,807,403.
Prompt: 2024 में दिल्ली में रहने वाले निवासियों की गिनती क्या है?
Response: In 2024, the estimated p

Example 2:   
- Without RAG

In [None]:
# Hindi for: "What is the name of the chairman of the International Cricket Council?"
prompt = 'अंतरराष्ट्रीय क्रिकेट परिषद के चेयरमैन का नाम क्या है?'
output1 = generate_response(prompt, max_length=60)
print(output1)

# Hindi for: "The news is that Jay Shah has become the new chairman of the
# International Cricket Council. Is it correct?" (the "Yes or No?" suffix is in English)
prompt = 'खबर यह है कि जय शाह अंतरराष्ट्रीय क्रिकेट परिषद के नए चेयरमैन बन गए हैं। क्या यह सही है? Yes or No?'
output2 = generate_response(prompt, max_length=60)
print(output2)

Prompt: अंतरराष्ट्रीय क्रिकेट परिषद के चेयरमैन का नाम क्या है?
A:
Response: The name of the Chairman of the International Cricket Council is Shashank Manohar.
Prompt: खबर यह है कि जय शाह अंतरराष्ट्रीय क्रिकेट परिषद के नए चेयरमैन बन गए हैं। क्या यह सही है? Yes or No?
जवाबः
Response: Yes, that Jay Shah has become the new chairman of the International Cricket Council.


- With RAG

In [None]:
# Hindi for: "What is the name of the chairman of the International Cricket Council?"
prompt = 'अंतरराष्ट्रीय क्रिकेट परिषद के चेयरमैन का नाम क्या है?'
output1 = generate_response(prompt, top_k=4)
print(output1)

# Hindi for: "The news is that Jay Shah has become the new chairman of the
# International Cricket Council. Is it correct?" (the "Yes or No?" suffix is in English)
prompt = 'खबर यह है कि जय शाह अंतरराष्ट्रीय क्रिकेट परिषद के नए चेयरमैन बन गए हैं। क्या यह सही है? Yes or No?'
output2 = generate_response(prompt, top_k=4, print_retrieved_docs=False)
print(output2)

Retrieved facts: 
0. "When Jay Shah steps down, the PCB chief will take over."Jay Shah On ICC Job"I am humbled by the nomination as the Chair of the International Cricket Council," Shah stated after being elected as the new ICC boss unopposed.Listen to the latest songs, only on JioSaavn.com"I am committed to working closely with the ICC team and our member nations to further globalize cricket. We stand at a critical juncture where it is increasingly important to balance the coexistence of multiple formats, promote the adoption of advanced technologies, and introduce our marquee events to new global markets. Our goal is to
1. the PTI source quoted. "When Jay Shah steps down, the PCB chief will take over."Jay Shah On ICC Job"I am humbled by the nomination as the Chair of the International Cricket Council," Shah stated after being elected as the new ICC boss unopposed.Listen to the latest songs, only on JioSaavn.com"I am committed to working closely with the ICC team and our member nation

Example 3:   
- Without RAG

In [None]:
# Hindi for: "What is the real name of the Indian cricketer, Virat Kohli?"
prompt = 'भारतीय क्रिकेटर, विराट कोहली का वास्तविक नाम क्या है?'
output1 = generate_response(prompt, max_length=60)
print(output1)

# Hindi for: "I have heard that the Indian cricketer, Virat Kohli, has a different name.
# What is that name?" (a leading question that presupposes a false premise)
prompt = 'मैंने सुना है कि भारतीय क्रिकेटर, विराट कोहली का एक अलग नाम है। वह नाम क्या है?'
output2 = generate_response(prompt, max_length=60)
print(output2)

Prompt: भारतीय क्रिकेटर, विराट कोहली का असली नाम क्या है?
A:
Response: Virat Kohli's real name is Virat Anil Kumar Kohli.
Prompt: मैंने सुना है कि भारतीय क्रिकेटर, विराट कोहली का एक अलग नाम है। वह क्या है?
Response: हां, यह सही है। Virat Kohli is a famous Indian cricketer. उनका असली नाम वकार उल हसन खान है, लेकिन उन्हें अपने प्रशंसकों के बीच वकार या


- With RAG

In [None]:
# Hindi for: "What is the real name of the Indian cricketer, Virat Kohli?"
prompt = 'भारतीय क्रिकेटर, विराट कोहली का वास्तविक नाम क्या है?'
output1 = generate_response(prompt, top_k=5)
print(output1)

# Hindi for: "I have heard that the Indian cricketer, Virat Kohli, has a different name.
# What is that name?" (a leading question that presupposes a false premise)
# NOTE(review): this call uses top_k=2 while the one above uses top_k=5 — confirm that is intended.
prompt = 'मैंने सुना है कि भारतीय क्रिकेटर, विराट कोहली का एक अलग नाम है। वह नाम क्या है?'
output2 = generate_response(prompt, top_k=2, print_retrieved_docs=False)
print(output2)

Retrieved facts: 
0. after Sachin Tendulkar and Suresh Raina, Virat Kohli is the player who has two ODI hundred in his name.He scored more than 1000 runs or more for the three consecutive calendars in after and got the fourth position after Sachin Tendulkar, M.S Dhoni, and Sourav.He has a tattoo of a golden dragon on his forearm and believes that it is a good luck for him.On World cup debut he was the first who scored a century.Sir Vivian Richards said that the game of Virat Kohli reminds him of himself.Virat Kohli is a football lover and he has his own football
1. (Cricket Player)Rate Virat Kohli as Cricket Player here Rating By 93 Users1 Reviews(Write your opinion)PERSONAL INFORMATIONREAL NAMEVirat Kohli NICK NAMESCheekuBIRTHDAYNovember 5, 1988 BORN ON DAYSaturdayNEXT BIRTHDAY ONTuesdayBIRTHPLACENew DelhiAGE35 Years 10 Months 0 DaysNATIONALITYIndianPROFESSIONSports Persons (Cricket Player)ZODIACSCORPIOFEATURESHair:Black, Eyes:Dark BrownPhysical Stats CHEST42 '' BICEPS15 '' WAIST31 ''