<a href="https://colab.research.google.com/github/Prajwal011/LLM-s/blob/main/Rags_from_scratch_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
pip install -q -U sentence-transformers scikit-learn

##Using SentenceTransformers for Embedding

In [71]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from warnings import filterwarnings
filterwarnings("ignore")

# 1. Initialize the model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# 2. List of sentences to encode (the "database")
sentences = [
    "I love machine learning.",
    "Artificial intelligence is fascinating.",
    "Python is a great programming language.",
    "I enjoy building models with data.",
    "Natural language processing is a part of AI."
]

# 3. Convert sentences into embeddings (vectors)
sentence_embeddings = model.encode(sentences)

# 4. Function to find similar sentences using different similarity metrics
def find_similar_sentences(query, metric='cosine', top_n=3):
    # Encode the query sentence
    query_embedding = model.encode([query])

    # Select similarity metric
    if metric == 'cosine':
        similarities = cosine_similarity(query_embedding, sentence_embeddings)
    elif metric == 'dot_product':
        # Compute dot product similarity
        similarities = np.dot(query_embedding, sentence_embeddings.T)
    elif metric == 'euclidean':
        # Compute Euclidean distance and convert to similarity
        similarities = [1 / (1 + np.sum((sentence_embeddings - query_embedding)**2,axis=1))]
    elif metric == 'manhattan':
        # Compute Euclidean distance and convert to similarity
        similarities = [1 / (1 + np.sum(np.abs(sentence_embeddings - query_embedding),axis=1))]
    else:
        print("Unsupported similarity metric. Choose from 'cosine', 'dot_product', 'euclidean'")

    # Get the top_n most similar sentences
    similar_indices = np.argsort(similarities[0])[::-1][:top_n]
    similar_sentences = [(sentences[i], similarities[0][i]) for i in similar_indices]

    return similar_sentences

# Example query
query_sentence = "I enjoy building models"
similar_sentences_cosine = find_similar_sentences(query_sentence, metric='cosine', top_n=2)
similar_sentences_dot = find_similar_sentences(query_sentence, metric='dot_product', top_n=2)
similar_sentences_euclidean = find_similar_sentences(query_sentence, metric='euclidean', top_n=2)
similar_sentences_manhattan = find_similar_sentences(query_sentence, metric='manhattan', top_n=2)

# Output the results
print("Cosine Similarity:")
for sentence, score in similar_sentences_cosine:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

print("\nDot Product Similarity:")
for sentence, score in similar_sentences_dot:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

print("\nEuclidean Distance (converted to similarity):")
for sentence, score in similar_sentences_euclidean:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

print("\nManhattan Distance (converted to similarity):")
for sentence, score in similar_sentences_manhattan:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

Cosine Similarity:
Sentence: I enjoy building models with data., Similarity Score: 0.7920
Sentence: I love machine learning., Similarity Score: 0.3700

Dot Product Similarity:
Sentence: I enjoy building models with data., Similarity Score: 38.3291
Sentence: I love machine learning., Similarity Score: 20.3586

Euclidean Distance (converted to similarity):
Sentence: I enjoy building models with data., Similarity Score: 0.0449
Sentence: I love machine learning., Similarity Score: 0.0142

Manhattan Distance (converted to similarity):
Sentence: I enjoy building models with data., Similarity Score: 0.0136
Sentence: I love machine learning., Similarity Score: 0.0077


##Creating embedding from scratch

In [72]:
sentences = [
    "I love machine learning.",
    "Artificial intelligence is fascinating.",
    "Python is a great programming language.",
    "I enjoy building models with data.",
    "Natural language processing is a part of AI."
]

# Step 1: Preprocess sentences
# Remove periods and convert all sentences into lowercase
processed_sentences = [sentence.lower().replace('.', '') for sentence in sentences]

# Step 2: Create a list of all unique words in the corpus
word_list = ' '.join(processed_sentences).split()

unique_words = sorted(set(word_list))

# Len of sentence with max length we'll need it for padding
max_words = max(map(len,[i.split() for i in sentences]))

# Step 3: Create encoding and decoding dictionaries for the words
encode = {w: i for i, w in enumerate(word_list)}
decode = {i: w for i, w in enumerate(word_list)}

# Step 4: Convert sentences into embeddings (vectors)
def sentence_to_embedding(sentence):
    # As all our sentences are not of same length we'll convert them in same length using padding
    embedding = [0] * max_words

    # Split the sentence into words
    words_in_sentence = sentence.split()

    # Set corresponding positions to 1 for each word in the sentence
    for i,word in enumerate(words_in_sentence):
        if word in encode:
            index = encode[word]
            # print(i,word,index)
            embedding[i] = index

    return embedding

# Step 5: Convert each sentence into an embedding
sentence_embeddings = [sentence_to_embedding(sentence) for sentence in processed_sentences]

# Output the results
print("Unique Words (Vocabulary):", unique_words)
print("\nSentence Embeddings:")
for i, embedding in enumerate(sentence_embeddings):
    print(f"Sentence {i+1}: {sentences[i]}")
    print(f"Embedding: {embedding}\n")

Unique Words (Vocabulary): ['a', 'ai', 'artificial', 'building', 'data', 'enjoy', 'fascinating', 'great', 'i', 'intelligence', 'is', 'language', 'learning', 'love', 'machine', 'models', 'natural', 'of', 'part', 'processing', 'programming', 'python', 'with']

Sentence Embeddings:
Sentence 1: I love machine learning.
Embedding: [14, 1, 2, 3, 0, 0, 0, 0]

Sentence 2: Artificial intelligence is fascinating.
Embedding: [4, 5, 23, 7, 0, 0, 0, 0]

Sentence 3: Python is a great programming language.
Embedding: [8, 23, 24, 11, 12, 21, 0, 0]

Sentence 4: I enjoy building models with data.
Embedding: [14, 15, 16, 17, 18, 19, 0, 0]

Sentence 5: Natural language processing is a part of AI.
Embedding: [20, 21, 22, 23, 24, 25, 26, 27]



using Custom Embedding with Custom Vector search index

In [73]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

sentences = [
    "I love machine learning.",
    "Artificial intelligence is fascinating.",
    "Python is a great programming language.",
    "I enjoy building models with data.",
    "Natural language processing is a part of AI."
]

# Step 1: Preprocess sentences
# Remove periods and convert all sentences into lowercase
processed_sentences = [sentence.lower().replace('.', '') for sentence in sentences]

# Step 2: Create a list of all unique words in the corpus
word_list = ' '.join(processed_sentences).split()

unique_words = sorted(set(word_list))

# Len of sentence with max length we'll need it for padding
max_words = max(map(len,[i.split() for i in sentences]))

# Step 3: Create encoding and decoding dictionaries for the words
encode = {w: i for i, w in enumerate(word_list)}
decode = {i: w for i, w in enumerate(word_list)}

# Step 4: Convert sentences into embeddings (vectors)
def sentence_to_embedding(sentence):
    # As all our sentences are not of same length we'll convert them in same length using padding
    embedding = [0] * max_words

    # Split the sentence into words
    words_in_sentence = sentence.split()

    # Set corresponding positions to 1 for each word in the sentence
    for i,word in enumerate(words_in_sentence):
        if word in encode:
            index = encode[word]
            # print(i,word,index)
            embedding[i] = index

    return np.array(embedding)

# Step 5: Convert each sentence into an embedding
sentence_embeddings = np.array([sentence_to_embedding(sentence) for sentence in processed_sentences])

# 4. Function to find similar sentences using different similarity metrics
def find_similar_sentences(query, metric='cosine', top_n=3):
    # Encode the query sentence
    query_embedding = [sentence_to_embedding(query)]
    # print(sentence_embeddings,query_embedding)
    # Select similarity metric
    if metric == 'cosine':
        similarities = cosine_similarity(query_embedding, sentence_embeddings)
    elif metric == 'dot_product':
        # Compute dot product similarity
        similarities = np.dot(query_embedding, sentence_embeddings.T)
    elif metric == 'euclidean':
        # Compute Euclidean distance and convert to similarity
        similarities = [1 / (1 + np.sum((sentence_embeddings - query_embedding)**2,axis=1))]
    elif metric == 'manhattan':
        # Compute Euclidean distance and convert to similarity
        similarities = [1 / (1 + np.sum(np.abs(sentence_embeddings - query_embedding),axis=1))]
    else:
        print("Unsupported similarity metric. Choose from 'cosine', 'dot_product', 'euclidean'")

    # Get the top_n most similar sentences
    similar_indices = np.argsort(similarities[0])[::-1][:top_n]
    similar_sentences = [(sentences[i], similarities[0][i]) for i in similar_indices]

    return similar_sentences

# Example query
query_sentence = "I enjoy building models"
similar_sentences_cosine = find_similar_sentences(query_sentence, metric='cosine', top_n=2)
similar_sentences_dot = find_similar_sentences(query_sentence, metric='dot_product', top_n=2)
similar_sentences_euclidean = find_similar_sentences(query_sentence, metric='euclidean', top_n=2)
similar_sentences_manhattan = find_similar_sentences(query_sentence, metric='manhattan', top_n=2)

# Output the results
print("Cosine Similarity:")
for sentence, score in similar_sentences_cosine:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

print("\nDot Product Similarity:")
for sentence, score in similar_sentences_dot:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

print("\nEuclidean Distance (converted to similarity):")
for sentence, score in similar_sentences_euclidean:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

print("\nManhattan Distance (converted to similarity):")
for sentence, score in similar_sentences_manhattan:
    print(f"Sentence: {sentence}, Similarity Score: {score:.4f}")

Cosine Similarity:
Sentence: Artificial intelligence is fascinating., Similarity Score: 0.8140
Sentence: Python is a great programming language., Similarity Score: 0.7623

Dot Product Similarity:
Sentence: Natural language processing is a part of AI., Similarity Score: 1058.0000
Sentence: Python is a great programming language., Similarity Score: 916.0000

Euclidean Distance (converted to similarity):
Sentence: Artificial intelligence is fascinating., Similarity Score: 0.0038
Sentence: I love machine learning., Similarity Score: 0.0013

Manhattan Distance (converted to similarity):
Sentence: Artificial intelligence is fascinating., Similarity Score: 0.0312
Sentence: I enjoy building models with data., Similarity Score: 0.0192
