<a href="https://colab.research.google.com/github/PramodGuvvala07/Sitafal-Task/blob/main/Sitafal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss  # No longer gives an error

class DataIngestion:
    def __init__(self, urls):
        self.urls = urls
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = []
        self.metadata = []

    def crawl_and_scrape(self):
        for url in self.urls:
            result = requests.get(url)
            soup = BeautifulSoup(result.text, 'html.parser')
            text = soup.get_text()
            self.process_content(text, url)

    def process_content(self, text, url):
        chunks = self.segment_content(text)
        for chunk in chunks:
            embedding = self.model.encode(chunk)
            self.embeddings.append(embedding)
            self.metadata.append(url)

    def segment_content(self, text):
        return text.split('\n\n')  # Simple segmentation by paragraphs

    def store_embeddings(self):
        embedding_matrix = np.array(self.embeddings).astype('float32')
        index = faiss.IndexFlatL2(embedding_matrix.shape[1])
        index.add(embedding_matrix)
        faiss.write_index(index, 'embeddings.index')

class QueryHandler:
    def __init__(self, index, model):
        self.index = index
        self.model = model

    def handle_query(self, query):
        query_embedding = self.model.encode(query)
        D, I = self.index.search(np.array([query_embedding]).astype('float32'), k=5)
        return I

from transformers import pipeline

class ResultGenerator:
    def __init__(self):
        self.llm = pipeline('text-generation', model='gpt2')

    def generate_result(self, relevant_chunks, user_query):
        context = " ".join(relevant_chunks)
        prompt = f"Context: {context}\nQuestion: {user_query}\nAnswer:"
        result = self.llm(prompt, max_length=150)
        return result[0]['generated_text']

if __name__ == "__main__":
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/"
    ]

    ingestion = DataIngestion(urls)
    ingestion.crawl_and_scrape()
    ingestion.store_embeddings()

    index = faiss.read_index('embeddings.index')
    query_handler = QueryHandler(index, ingestion.model)
    user_query = "What is the focus of research at the University of Chicago?"
    relevant_indices = query_handler.handle_query(user_query)

    result_generator = ResultGenerator()
    relevant_chunks = [ingestion.metadata[i] for i in relevant_indices[0]]
    result = result_generator.generate_result(relevant_chunks, user_query)

    print(result)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Context: https://www.stanford.edu/ https://www.stanford.edu/ https://www.stanford.edu/ https://www.stanford.edu/ https://und.edu/
Question: What is the focus of research at the University of Chicago?
Answer:
The Department of Physics and Astronomy manages and manages the University's "Science, Engineering, and Math" Department at the University of Chicago, focusing on the fundamental physics and engineering disciplines. The Department is run by the College of Medicine and Dentistry, and is one of the first departments to recognize the emerging technology economy of the 21st century. The College recently announced that it would establish the Center for the Study of Technology (CSP
