<a href="https://colab.research.google.com/github/SAIRAALMAS88/SAIRAALMAS88/blob/main/SearchEngine_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

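# Project: Custom Search Engine Using Fine-Tuned LLMs

This notebook implements a keyword-based search engine: documents and queries are converted to TF-IDF vectors and ranked by cosine similarity. No fine-tuned LLM is used in this notebook.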


In [1]:
!pip install scikit-learn nltk # Installing essential python packages



In [2]:
# Loading Libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import string

# Download the NLTK stopwords corpus; it is read later via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Prepare the corpus: a small in-memory list of example sentences.
# Each string is one "document"; the search function looks documents up in
# this list by position, so the list order defines the document indices.
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
    "To be or not to be, that is the question.",
    "Artificial intelligence is the future of technology.",
    "The sun rises in the east and sets in the west.",
    "Practice makes perfect.",
    "The early bird catches the worm.",
    "All that glitters is not gold.",
    "Actions speak louder than words.",
    "Knowledge is power."
]

In [4]:
# Pre-processing the text

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily populated cache of the NLTK English stop-word list.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess(text, stop_words=None):
    """Normalize a piece of text before TF-IDF vectorization.

    The text is lowercased, ASCII punctuation is removed, the result is split
    on whitespace, stop words are dropped and the remaining words are joined
    back together with single spaces.

    Args:
        text: The string to normalize.
        stop_words: Optional container of lowercase words to remove (anything
            supporting ``in``; a set is fastest). When ``None`` (the default),
            the NLTK English stop-word list is used.

    Returns:
        The normalized string; empty if no words remain.
    """
    if stop_words is None:
        # The default list is cached so it is not rebuilt for every document/query.
        stop_words = _english_stop_words()
    # Lowercase first so the comparison against the lowercase stop words matches.
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove stop words
    return ' '.join(word for word in text.split() if word not in stop_words)

# Normalize every document in the corpus
processed_docs = list(map(preprocess, documents))

In [5]:
# Compute TF-IDF vectors
# Fit the vectorizer on the preprocessed corpus and keep the resulting matrix;
# the search function reuses both `vectorizer` (to transform the query) and
# `tfidf_matrix` (to compare the query against the documents).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_docs)

In [6]:
# Define the search function
def search(query, top_k=3, min_score=None):
    """Rank the corpus documents against a free-text query.

    The query is normalized with ``preprocess``, converted to a TF-IDF vector
    by the already-fitted ``vectorizer`` and compared with ``tfidf_matrix``
    using cosine similarity.

    Args:
        query: The query string.
        top_k: Maximum number of results to return. Must be >= 0.
        min_score: Optional similarity threshold. When given, results scoring
            below it are dropped, so fewer than ``top_k`` results may be
            returned. With ``None`` (the default) no threshold is applied, and
            documents with a similarity of 0 can appear in the results when
            fewer than ``top_k`` documents match the query.

    Returns:
        A list of ``(document, similarity)`` tuples, most similar first.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative value would silently slice from the wrong end of the ranking.
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Preprocess the query the same way the documents were preprocessed
    processed_query = preprocess(query)
    # Convert query to TF-IDF vector
    query_vector = vectorizer.transform([processed_query])
    # Compute cosine similarity between query and documents
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # Sort documents by similarity score, highest first; the stable sort makes
    # the order of equally-scored documents deterministic.
    ranked_indices = np.argsort(similarities, kind='stable')[::-1]

    # Collect the top-k results
    results = []
    for i in ranked_indices[:top_k]:
        if min_score is not None and similarities[i] < min_score:
            # Scores are in descending order, so no later document can qualify.
            break
        results.append((documents[i], similarities[i]))
    return results

In [7]:
# Try the search engine on a sample query
query = "What is the future of technology?"
results = search(query, top_k=3)

print("Query:", query)
print("\nTop Results:")
# Number the results starting from 1 for display
for rank, (doc, score) in enumerate(results, start=1):
    print(f"{rank}. Document: {doc}\n   Similarity Score: {score:.4f}\n")

Query: What is the future of technology?

Top Results:
1. Document: Artificial intelligence is the future of technology.
   Similarity Score: 0.7071

2. Document: Knowledge is power.
   Similarity Score: 0.0000

3. Document: Actions speak louder than words.
   Similarity Score: 0.0000

