<a href="https://colab.research.google.com/github/Pratibhamore01/Prompt-Engineering-Class/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Part 1: Word Embedding Arithmetic
!pip install torch transformers scipy numpy



In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from scipy.spatial.distance import cosine

# Load pre-trained BERT model and tokenizer
# Both objects are created once here and read as globals by get_embedding()
# below, so this cell must run before any embedding is requested.
# NOTE(review): from_pretrained presumably downloads and caches the
# 'bert-base-uncased' files on first use — confirm network access is available.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get word embeddings
def get_embedding(word):
    """Return one vector for ``word`` by mean-pooling BERT's last hidden state.

    The text is encoded with the module-level ``tokenizer``, run through the
    module-level ``model`` with gradient tracking disabled, and the resulting
    ``last_hidden_state`` is averaged over ``dim=1``.

    Args:
        word: Text handed directly to ``tokenizer``.

    Returns:
        The pooled hidden state, squeezed and converted to a NumPy array.
    """
    # NOTE(review): the tokenizer presumably adds [CLS]/[SEP] by default, in
    # which case those positions are part of the average — confirm intended.
    encoded = tokenizer(word, return_tensors='pt')
    with torch.no_grad():
        last_layer = model(**encoded).last_hidden_state
        pooled = last_layer.mean(dim=1).squeeze()
    return pooled.numpy()

# Function to perform word arithmetic
def word_arithmetic(word1, word2, word3):
    """Compute ``emb(word1) - emb(word2) + emb(word3)``.

    Each argument is embedded with ``get_embedding`` (in argument order) and
    the three vectors are combined element-wise.

    Args:
        word1: Term whose embedding is the starting point.
        word2: Term whose embedding is subtracted.
        word3: Term whose embedding is added.

    Returns:
        The combined embedding, of the same type ``get_embedding`` returns.
    """
    first, second, third = (
        get_embedding(term) for term in (word1, word2, word3)
    )
    return first - second + third

# Function to find the most similar word from a list
def find_most_similar(embedding, word_list):
    """Pick the candidate whose embedding is closest to ``embedding``.

    Every word in ``word_list`` is embedded with ``get_embedding`` and scored
    as ``1 - cosine(embedding, candidate_embedding)``, where ``cosine`` is
    SciPy's cosine distance, so a higher score means more similar.

    Args:
        embedding: Query vector to compare against.
        word_list: Candidate words; the answer is always one of these.

    Returns:
        A ``(word, score)`` tuple for the highest-scoring candidate. When
        several candidates tie, the one that appears first wins.

    Raises:
        ValueError: If ``word_list`` is empty (raised by ``max``).
    """
    # Embed all candidates first, then score them, keyed by word so repeated
    # candidates collapse into a single entry.
    candidate_vectors = {}
    for candidate in word_list:
        candidate_vectors[candidate] = get_embedding(candidate)

    scores = {}
    for candidate, vector in candidate_vectors.items():
        scores[candidate] = 1 - cosine(embedding, vector)

    best_word, best_score = max(scores.items(), key=lambda pair: pair[1])
    return best_word, best_score

# Define examples for word arithmetic
# Each entry is (word1, word2, word3, candidates): the loop below evaluates
# word1 - word2 + word3 and reports which of the candidates is closest.
examples = [
    ('paris', 'france', 'italy', ['rome', 'romaine', 'romania', 'ronnie', 'random']),
    ('man', 'woman', 'queen', ['king', 'prince', 'duke', 'lord', 'gentleman']),
    ('car', 'vehicle', 'plane', ['train', 'boat', 'rocket', 'bike', 'scooter']),
    ('apple', 'fruit', 'orange', ['banana', 'grape', 'mango', 'lemon', 'pear']),
    ('dog', 'animal', 'cat', ['fox', 'rabbit', 'wolf', 'bear', 'lion'])
]

# Run word arithmetic and find the most similar word
# The search is restricted to each example's own candidate list, so the
# printed word is the best of those candidates, not a vocabulary-wide
# nearest neighbour.
for word1, word2, word3, word_list in examples:
    result_emb = word_arithmetic(word1, word2, word3)
    most_similar, similarity = find_most_similar(result_emb, word_list)
    print(f"{word1} - {word2} + {word3} is most similar to: {most_similar} (similarity: {similarity:.4f})")


paris - france + italy is most similar to: rome (similarity: 0.8295)
man - woman + queen is most similar to: king (similarity: 0.8782)
car - vehicle + plane is most similar to: boat (similarity: 0.8585)
apple - fruit + orange is most similar to: lemon (similarity: 0.7888)
dog - animal + cat is most similar to: rabbit (similarity: 0.8461)


In [2]:
#Part 2: RAG System Implementation
!pip install langchain transformers sentence-transformers faiss-cpu wikipedia

Collecting langchain
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_core-0.3.0-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.121-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (fro

In [3]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.0-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloa

In [33]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from sentence_transformers import SentenceTransformer
import wikipedia

# Step 1: Initialize the embedding model
# embedding_model is shared by the rest of this cell: it embeds the article
# texts and is also handed to FAISS when the vector store is built.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# Step 2: Load Wikipedia articles
def get_wikipedia_article(title):
    """Fetch the plain-text content of a Wikipedia page, best effort.

    Args:
        title: Page title passed to ``wikipedia.page``.

    Returns:
        The page's ``content`` on success. If the lookup fails because the
        page does not exist or the title is ambiguous, a short placeholder
        sentence naming ``title`` is returned instead of raising.

    NOTE(review): the placeholder is an ordinary string, so a caller that
    embeds or indexes the return value will index the placeholder too —
    callers may want to check for it.
    """
    try:
        return wikipedia.page(title).content
    except wikipedia.exceptions.PageError:
        return f"Article {title} not found."
    except wikipedia.exceptions.DisambiguationError:
        # An ambiguous title raises rather than resolving to a page; handle
        # it the same best-effort way as a missing page so one bad title does
        # not abort the whole cell.
        return f"Article {title} is ambiguous."

# Map each display label to the fetched article text. Pairs are
# (label used as the dict key, title requested from Wikipedia); the dict
# keeps this order.
articles = {
    label: get_wikipedia_article(page_title)
    for label, page_title in (
        ("Artificial Intelligence", "Artificial intelligence"),
        ("History of the Internet", "History of the Internet"),
        ("Quantum Computing", "Quantum computing"),
        ("Climate Change", "Climate change"),
        ("Evolutionary Biology", "Evolutionary biology"),
    )
}

# Step 3: Create embeddings for each article
article_titles = list(articles.keys())
article_contents = list(articles.values())

# Embed every article exactly once; the vectors are reused for the index below.
# NOTE(review): each article is embedded as a single document. The embedding
# model likely truncates long inputs, so only the start of each article may be
# represented — consider splitting articles into chunks before embedding.
article_embeddings = embedding_model.embed_documents(article_contents)

# Step 4: Store embeddings in a FAISS vector database
# Build the index from the vectors computed above instead of FAISS.from_texts,
# which would embed every article a second time. The title metadata lets each
# search hit be traced back to the article it came from.
vector_db = FAISS.from_embeddings(
    text_embeddings=list(zip(article_contents, article_embeddings)),
    embedding=embedding_model,
    metadatas=[{"title": title} for title in article_titles],
)


# Step 5: Formulate queries
# One question per article loaded above, listed in the same order as the
# entries of the articles dict.
queries = [
    "What are the main applications of Artificial Intelligence?",
    "What were the significant milestones in the history of the Internet?",
    "How does quantum computing differ from classical computing?",
    "What are the primary effects of climate change?",
    "Describe the theory of natural selection in evolutionary biology."
]


