In [None]:
!pip install transformers accelerate datasets sentence-transformers




In [None]:
import pandas as pd

# Load the two CSV files
file1 = 'bhagavad_gita.csv'
file2 = 'patanjali_yoga.csv'

# Read CSV files into dataframes
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Combine the dataframes
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv('combined_dataset.csv', index=False)

print("Files combined successfully into 'combined_scriptures.csv'")


Files combined successfully into 'combined_scriptures.csv'


In [None]:
!pip install faiss-gpu




In [None]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: read)

In [None]:
!pip install --upgrade transformers sentencepiece


In [None]:
from transformers import LlamaTokenizer

model_name = "meta-llama/Llama-2-7b-hf"
token = "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"  # Replace with your token

tokenizer = LlamaTokenizer.from_pretrained(
    model_name,
    use_auth_token=token,
    trust_remote_code=True
)
print("Tokenizer loaded successfully!")




Tokenizer loaded successfully!


In [None]:
from transformers import LlamaTokenizer, LlamaForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import pandas as pd
import torch


# Load and preprocess the dataset
def preprocess_dataset(file_path):
    dataset = load_dataset("csv", data_files=file_path)

    # Preprocess the data
    def preprocess_function(examples):
        inputs = [f"Question: {q}" for q in examples["question"]]
        outputs = [
            f"""Translation: {t}
ID: {i}
Chapter: {c}
Verse: {v}
Sanskrit: {s}"""
            for t, i, c, v, s in zip(
                examples["translation"],
                examples["verse"],
                examples["chapter"],
                examples["verse"],
                examples["sanskrit"],
            )
        ]
        return {"input_text": inputs, "target_text": outputs}

    processed_dataset = dataset.map(preprocess_function, batched=True)
    return processed_dataset


# Fine-tune LLaMA model
def fine_tune_model(dataset, model_name, output_dir):
    tokenizer = LlamaTokenizer.from_pretrained(model_name)

    # Ensure tokenizer has a padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token
        # Alternatively, add a new pad token:
        # tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Tokenize the data
    def tokenize_function(examples):
        model_inputs = tokenizer(
            examples["input_text"], max_length=512, truncation=True, padding="max_length"
        )
        labels = tokenizer(
            examples["target_text"], max_length=512, truncation=True, padding="max_length"
        )["input_ids"]
        model_inputs["labels"] = labels
        return model_inputs

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Load the model
    model = LlamaForCausalLM.from_pretrained(model_name)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        fp16=True,
        push_to_hub=False,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    return model, tokenizer


# Build FAISS index for similarity search
def build_faiss_index(questions, embedding_model_name):
    sbert_model = SentenceTransformer(embedding_model_name)
    embeddings = sbert_model.encode(questions, convert_to_tensor=True).cpu().detach().numpy()

    # Build FAISS index
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return index, sbert_model


# Inference
def perform_inference(question, index, sbert_model, df, model, tokenizer):
    # Query FAISS index
    query_embedding = sbert_model.encode([question], convert_to_tensor=True).cpu().detach().numpy()
    distances, indices = index.search(query_embedding, 1)
    closest_row = df.iloc[indices[0][0]]

    # Prepare input for the model
    input_text = f"Question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt")

    # Generate response
    outputs = model.generate(**inputs, max_length=512)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {
        "closest_match": closest_row.to_dict(),
        "model_response": response,
    }


# Main pipeline
if __name__ == "__main__":
    # File paths and model names
    dataset_file = "combined_dataset.csv"
    llama_model_name = "meta-llama/Llama-2-7b-hf"
    embedding_model_name = "all-MiniLM-L6-v2"
    output_dir = "./fine_tuned_llama"

    # Step 1: Preprocess dataset
    dataset = preprocess_dataset(dataset_file)
    df = pd.read_csv(dataset_file)
    questions = df["question"].tolist()

    # Step 2: Fine-tune the model
    model, tokenizer = fine_tune_model(dataset, llama_model_name, output_dir)

    # Step 3: Build FAISS index
    index, sbert_model = build_faiss_index(questions, embedding_model_name)

    # Step 4: Inference
    test_question = "What is the essence of karma?"
    result = perform_inference(test_question, index, sbert_model, df, model, tokenizer)

    # Print results
    print("Closest Match:")
    print(result["closest_match"])
    print("\nModel Response:")
    print(result["model_response"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# try 2

In [None]:
!pip install transformers datasets pandas torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Book1.csv')  # Replace with your file name

# Split questions separated by '?'
def split_questions(row):
    questions = row['question'].split('?')  # Split by '?'
    questions = [q.strip() + '?' for q in questions if q.strip()]  # Add '?' back and remove empty entries
    return [(q, row['answer']) for q in questions]

# Expand the dataset
expanded_data = []
for _, row in df.iterrows():
    expanded_data.extend(split_questions(row))

# Create a new DataFrame
preprocessed_df = pd.DataFrame(expanded_data, columns=['Question', 'Answer'])

# Save the preprocessed data for fine-tuning
preprocessed_df.to_csv('preprocessed_qa_data.csv', index=False)

print("Preprocessed dataset saved as 'preprocessed_qa_data.csv'")


Preprocessed dataset saved as 'preprocessed_qa_data.csv'


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load the dataset
df = pd.read_csv('p.csv')  # Replace with your file name

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Preprocessing function
def preprocess_function(examples):
    inputs = ["question: " + q for q in examples['Question']] # This line is changed
    targets = examples['Answer']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length") # This line is changed
    with tokenizer.as_target_tokenizer(): # This line is added
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length").input_ids # This line is changed
    model_inputs["labels"] = labels
    return model_inputs

# Load the tokenizer and model
model_name = "t5-small"  # You can replace this with "t5-base" or "google/flan-t5-small" for better results
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocess the dataset
tokenized_data = dataset.map(preprocess_function, batched=True)

# Split the dataset into train and validation sets
split_dataset = tokenized_data.train_test_split(test_size=0.1)  # 90% train, 10% validation
train_data = split_dataset['train']
val_data = split_dataset['test']

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save checkpoints
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    logging_dir="./logs",
    #predict_with_generate=True,
    logging_steps=10,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")


Map:   0%|          | 0/891 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.9386,0.414727
2,0.474,0.402366
3,0.4496,0.397387


('./fine_tuned_t5/tokenizer_config.json',
 './fine_tuned_t5/special_tokens_map.json',
 './fine_tuned_t5/spiece.model',
 './fine_tuned_t5/added_tokens.json',
 './fine_tuned_t5/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model
model_path = "./fine_tuned_t5"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Function for inference
def answer_question(question):
    inputs = tokenizer(f"question: {question}", return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs.input_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
question = "Which warriors from the Pandava army did Duryodhana first talk about?"
print("Answer:", answer_question(question))


Answer: 


# test3

In [None]:
import pandas as pd

# Load the dataset, adjust delimiter if needed
df = pd.read_csv('Book1.csv')  # Or delimiter=';', delimiter=' ', etc.

# Check the actual column names in your DataFrame
print(df.columns)  # Print the column names

# Replace 'answer' with the actual column name if it's different
# For example, if the column name is 'answer', use:
df['answer'] = df['answer'].str.strip()  # Assuming the column name is 'answer'
df['question'] = df['question'].str.strip()

# Save the cleaned dataset
df.to_csv('cleaned_qa_data.csv', index=False)
print("Dataset cleaned and saved as 'cleaned_qa_data.csv'")

ParserError: Error tokenizing data. C error: Expected 2 fields in line 3, saw 4


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset

# Load the cleaned dataset
df = pd.read_csv('cleaned_qa_data.csv')

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Preprocessing function
def preprocess_function(examples):
    inputs = ["question: " + q for q in examples['question']]  # Iterate through the list of questions
    targets = examples['answer']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs


In [None]:
# Load the tokenizer and model
model_name = "t5-small"  # Replace with "t5-base" or "google/flan-t5-small" for better results
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocess the dataset
tokenized_data = dataset.map(preprocess_function, batched=True)

# Split the dataset into train and validation sets
split_dataset = tokenized_data.train_test_split(test_size=0.1)  # 90% train, 10% validation
train_data = split_dataset['train']
val_data = split_dataset['test']


Map:   0%|          | 0/700 [00:00<?, ? examples/s]



In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    logging_dir="./logs",
    #predict_with_generate=True,
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine1_tuned_t5")
tokenizer.save_pretrained("./fine1_tuned_t5")
print("Model fine-tuned and saved!")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.6047,0.434593
2,0.6317,0.412707
3,0.512,0.411199


Model fine-tuned and saved!


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model
model_path = "./fine1_tuned_t5"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Function for inference
def answer_question(question):
    inputs = tokenizer(f"question: {question}", return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs.input_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
question = "How does the Gita start?"
print("Answer:", answer_question(question))


Answer: 


# TEs1

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Book1.csv")  # Adjust delimiter if needed

# Combine multiple questions for each answer into a single string
df['Combined_Questions'] = df['question'].apply(lambda x: " ".join(x.split("?")).strip())

# Save the preprocessed data
df.to_csv("preprocessed_qa_data.csv", index=False)
print("Preprocessed dataset saved!")


Preprocessed dataset saved!


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pretrained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Use 'all-MiniLM-L6-v2' or similar for lightweight embedding

# Load the preprocessed dataset
df = pd.read_csv("preprocessed_qa_data.csv")

# Generate embeddings for combined questions and answers
df['Question_Embedding'] = df['Combined_Questions'].apply(lambda x: model.encode(x, convert_to_numpy=True))
df['Answer_Embedding'] = df['answer'].apply(lambda x: model.encode(x, convert_to_numpy=True))

# Save embeddings to a file
np.save('question_embeddings.npy', np.stack(df['Question_Embedding'].values))
np.save('answer_embeddings.npy', np.stack(df['Answer_Embedding'].values))
print("Embeddings saved!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings saved!


In [None]:
!pip install faiss-cpu # or !pip install faiss-gpu for GPU support

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
import faiss
import numpy as np

# Load question embeddings
question_embeddings = np.load('question_embeddings.npy')

# Initialize a FAISS index
d = question_embeddings.shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(d)  # L2 similarity
index.add(question_embeddings)  # Add embeddings to the index

# Save the index for future use
faiss.write_index(index, "faiss_index")
print("FAISS index created and saved!")


FAISS index created and saved!


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load FAISS index and embeddings
index = faiss.read_index("faiss_index")
question_embeddings = np.load('question_embeddings.npy')
df = pd.read_csv("preprocessed_qa_data.csv")

# Load the same SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to find the most similar question and corresponding answer
def retrieve_answer(input_question, top_k=5):
    # Generate embedding for the input question
    input_embedding = model.encode(input_question, convert_to_numpy=True)
    input_embedding = np.expand_dims(input_embedding, axis=0)  # FAISS expects 2D input

    # Search the vector database
    distances, indices = index.search(input_embedding, top_k)

    # Retrieve the corresponding answers
    results = []
    for idx in indices[0]:
        question = df.iloc[idx]['Combined_Questions']
        answer = df.iloc[idx]['answer']
        results.append((question, answer))
    return results

# Test the function
question = "How many kinds of foods are there"
results = retrieve_answer(question)

# Display results
for q, a in results:
    print(f"Similar Question: {q}\nAnswer: {a}\n")


Similar Question: How many kinds of foods are there
Answer: The food that is dear to each is threefold, as well as sacrifice, austerity, and almsgiving. Hear the distinction of these.

Similar Question: Which foods are considered to be sattvic  Which foods are considered to be pure  Are sweets sattvic, rajasic or tamasic
Answer: The foods that increase life, purity, strength, health, joy, and cheerfulness (good appetite), which are savory, oily, substantial, and agreeable, are dear to the Sattvic (pure) people.

Similar Question: Which foods are considered to be rajasic  Which foods are considered to generate energy  Are sour foods sattvic, rajasic or tamasic
Answer: The foods that are bitter, sour, salty, overly hot, pungent, dry, and burning are liked by the Rajasic and are productive of pain, grief, and disease.

Similar Question: What is the importance of food
Answer: From food come forth beings; from rain, food is produced; from sacrifice arises rain, and sacrifice is born of acti

In [None]:
!pip install pinecone-client sentence-transformers pandas


Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [None]:
import os
from pinecone import Pinecone, Index, ServerlessSpec

# Initialize Pinecone
pc = Pinecone(
    api_key="pcsk_6E9B6o_DHFJaybC7zzr4QT9i1tZo1vExTxji5j1syULud17p1HXAzrZPN7Zv4fs9H83L98"  # Replace with your Pinecone API key
)

# Check if the index exists, create it if not
index_name = "nyd"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Adjust based on your embedding dimension
        metric='cosine',  # Use cosine similarity
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Connect to the index using the Pinecone client
index = pc.Index(index_name)  # Access the index using the Pinecone client (pc)
print(f"Connected to index: {index_name}")

Connected to index: nyd


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the dataset
df = pd.read_csv("Book1.csv")

# Combine questions for each answer
df['Combined_Questions'] = df['question'].apply(lambda x: " ".join(x.split("?")).strip())

# Generate embeddings for questions
embeddings = model.encode(df['Combined_Questions'].tolist(), convert_to_numpy=True)

# Upload data to Pinecone
for i, (embedding, answer) in enumerate(zip(embeddings, df['answer'].tolist())):
    index.upsert([(str(i), embedding.tolist(), {"answer": answer})])

print("Embeddings uploaded to Pinecone!")


Embeddings uploaded to Pinecone!


In [None]:
def retrieve_answer(input_question, top_k=1):
    query_embedding = model.encode(input_question, convert_to_numpy=True)
    result = index.query(
        vector=query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True
    )

    answers = []
    for match in result['matches']:
        answers.append((match['score'], match['metadata']['answer']))
    return answers

input_question = "Who were the warriors on the Kaurava "
results = retrieve_answer(input_question)


for score, answer in results:
    print(f"Score: {score}\nAnswer: {answer}\n")


Score: 0.718727887
Answer: "Thou thyself, Bhishma, Karna, Kripa, the victorious in war, Asvatthama, Vikarna, and Bhurisrava, the son of Somadatta—all these are ready for battle."



# further refining

In [None]:
import pandas as pd

# Load the first CSV file
file1 = "b.csv"
df1 = pd.read_csv(file1)

# Load the second CSV file
file2 = "p.csv"
df2 = pd.read_csv(file2)

# Combine the two DataFrames by appending df2 to df1
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined DataFrame to a new CSV file
output_file = "combined_file.csv"
combined_df.to_csv(output_file, index=False)

print(f"Combined CSV file saved as '{output_file}'")


Combined CSV file saved as 'combined_file.csv'


# combined

In [None]:
import os
from pinecone import Pinecone, ServerlessSpec

# Initialize Pinecone
pc = Pinecone(
    api_key="pcsk_6E9B6o_DHFJaybC7zzr4QT9i1tZo1vExTxji5j1syULud17p1HXAzrZPN7Zv4fs9H83L98"  # Replace with your Pinecone API key
)

# Check if the index exists, create it if not
index_name = "nyd"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Adjust based on your embedding dimension
        metric='cosine',  # Use cosine similarity
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Connect to the index using the Pinecone client
index = pc.Index(index_name)
print(f"Connected to index: {index_name}")

Connected to index: nyd


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:


# Load the dataset
df = pd.read_csv("combined_file.csv")

# Combine questions for each answer
df['Combined_Questions'] = df['question'].apply(lambda x: " ".join(x.split("?")).strip())

# Generate embeddings for questions
embeddings = model.encode(df['Combined_Questions'].tolist(), convert_to_numpy=True)

# Upload data to Pinecone with additional metadata
for i, row in df.iterrows():
    metadata = {
        "answer": row['answer'],
        "chapter": row['chapter'],
        "verse": row['verse'],
        "sanskrit": row['sanskrit']
    }
    index.upsert([(str(i), embeddings[i].tolist(), metadata)])

print("Embeddings with metadata uploaded to Pinecone!")




Connected to index: nyd
Embeddings with metadata uploaded to Pinecone!
Score: 0.717323601
Answer: "Thou thyself, Bhishma, Karna, Kripa, the victorious in war, Asvatthama, Vikarna, and Bhurisrava, the son of Somadatta—all these are ready for battle."
Chapter: 1.0
Verse: 8.0
Sanskrit: भवान्भीष्मश्च कर्णश्च कृपश्च समितिञ्जयः| अश्वत्थामा विकर्णश्च सौमदत्तिस्तथैव च  || 1.8 || 




In [None]:
# Function to retrieve answer and metadata
def retrieve_answer(input_question, top_k=1):
    query_embedding = model.encode(input_question, convert_to_numpy=True)
    result = index.query(
        vector=query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True
    )

    answers = []
    for match in result['matches']:
        metadata = match['metadata']
        answers.append({
            "score": match['score'],
            "answer": metadata['answer'],
            "chapter": metadata.get('chapter', 'N/A'),
            "verse": metadata.get('verse', 'N/A'),
            "sanskrit": metadata.get('sanskrit', 'N/A')
        })
    return answers



In [None]:
# Query the database
input_question = "What did Duryodhana say"
results = retrieve_answer(input_question)

# Display results
for result in results:
    print(f"Score: {result['score']}")
    print(f"Answer: {result['answer']}")
    print(f"Chapter: {result['chapter']}")
    print(f"Verse: {result['verse']}")
    print(f"Sanskrit: {result['sanskrit']}")
    print("\n")

Score: 0.843865156
Answer: Behold, O Teacher! This mighty army of the sons of Pandu, arrayed by the son of Drupada, thy wise disciple.
Chapter: 1.0
Verse: 3.0
Sanskrit: पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम्| व्यूढां द्रुपदपुत्रेण तव शिष्येण धीमता  || 1.3 || 




# Agentic rag

In [1]:
!pip install pinecone-client sentence-transformers pandas
!pip install groq




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import pandas as pd
from groq import Groq

  from tqdm.autonotebook import tqdm





In [3]:
# Initialize Pinecone
pc = Pinecone(api_key="pcsk_6E9B6o_DHFJaybC7zzr4QT9i1tZo1vExTxji5j1syULud17p1HXAzrZPN7Zv4fs9H83L98")  # Replace with your Pinecone API key

index_name = "nyd"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Adjust based on your embedding dimension
        metric='cosine',  # Use cosine similarity
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )
index = pc.Index(index_name)
print(f"Connected to index: {index_name}")

Connected to index: nyd


In [4]:
# Load Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
# Load and preprocess the dataset
df = pd.read_csv("combined_file.csv")
df['Combined_Questions'] = df['question'].apply(lambda x: " ".join(x.split("?")).strip())
embeddings = model.encode(df['Combined_Questions'].tolist(), convert_to_numpy=True)

# Upload embeddings and metadata to Pinecone
for i, row in df.iterrows():
    metadata = {
        "answer": row['answer'],
        "chapter": row['chapter'],
        "verse": row['verse'],
        "sanskrit": row['sanskrit']
    }
    index.upsert([(str(i), embeddings[i].tolist(), metadata)])
print("Embeddings with metadata uploaded to Pinecone!")

Embeddings with metadata uploaded to Pinecone!


In [7]:
# Function to retrieve answers and metadata from Pinecone
def retrieve_answer(input_question, top_k=1):
    query_embedding = model.encode(input_question, convert_to_numpy=True)
    result = index.query(
        vector=query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True
    )
    answers = []
    for match in result['matches']:
        metadata = match['metadata']
        answers.append({
            "score": match['score'],
            "answer": metadata['answer'],
            "chapter": metadata.get('chapter', 'N/A'),
            "verse": metadata.get('verse', 'N/A'),
            "sanskrit": metadata.get('sanskrit', 'N/A')
        })
    return answers

In [8]:
# Initialize Groq client for Llama
client = Groq(api_key="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")  # Replace with your Groq API key

In [9]:
def answer_query_from_llama(query):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"You are an assistant specialized in answering questions strictly based on the Bhagavad Gita and Patanjali Yoga Sutra. Provide the chapter, verse, Sanskrit text, and a detailed answer to the following question: {query}."
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content

In [10]:
# Llama Query Refinement
def refine_query_with_llama(query, retrieved_info):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"You are an assistant specializing in refining queries for better retrieval. "
                           f"Original query: '{query}'\n"
                           f"Retrieved information:\n{retrieved_info}\n"
                           "Refine the query to include specific details for improved results. "
                           "If the query is already precise, return it unchanged. Refined query:"
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content.strip()

In [11]:
# Llama Final Response Generation
def generate_final_response_with_llama(query, retrieved_info, llm_retrieved):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"You are an expert at combining information to generate detailed answers. "
                           f"Original query: '{query}'\n"
                           f"Retrieved information from semantic search:\n{retrieved_info}\n"
                           f"Retrieved information from Llama:\n{llm_retrieved}\n"
                            "Provide chapter, verse, sanskrit, traslation if the query is directly belongs to sanskrit\n"
                            "Don't say how you process this context in the answer"
                            "Using all the provided context, generate a complete, accurate, and concise answer."
            }
        ],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content.strip()

In [22]:
# Main workflow
user_query = input("Enter your query: ")

# Step 1: Retrieve answer from Pinecone
semantic_results = retrieve_answer(user_query, top_k=3)

retrieved_info = "\n".join([
    f"Score: {item['score']}, Answer: {item['answer']}, Chapter: {item['chapter']}, Verse: {item['verse']}, Sanskrit: {item['sanskrit']}"
    for item in semantic_results
])

# Step 2: Retrieve answer from Llama
llm_result = answer_query_from_llama(user_query)

# Step 3: Refine the query with Llama
refined_query = refine_query_with_llama(user_query, retrieved_info)

# Step 4: Generate final response using Llama
final_response = generate_final_response_with_llama(refined_query, retrieved_info, llm_result)

# Display the results
print("=====================================================")
print(f"Refined Query: {refined_query}")
print("-----------------------------------------------------")
print(f"Final Response:\n{final_response}")
print("=====================================================")


Enter your query: Can Karma also cease to exist?
Refined Query: To refine the query for better retrieval, it would be beneficial to include specific details related to the context of "Karma" and its cessation. The original query is somewhat broad and does not specify the tradition or philosophical context (e.g., Buddhist, Hindu, etc.) in which "Karma" is being considered. However, given the retrieved information, it appears the context is likely from Hindu scriptures, possibly the Bhagavad Gita, given the chapter and verse references.

Refined query: "Can Karma cease to exist according to Hindu or Yoga philosophy, particularly in the context of the Bhagavad Gita or similar scriptures?" 

This refined query includes:
1. Specific philosophical context (Hindu or Yoga philosophy).
2. Reference to a particular scripture (the Bhagavad Gita) based on the retrieved information.
3. Maintains the core question about the cessation of Karma, ensuring the search remains focused on the original inqu