In [None]:
!pip install sentence-transformers faiss-gpu rank_bm25 transformers nltk torcha
!pip install huggingface_hub

In [2]:
import json
import re
import nltk
from nltk.tokenize import sent_tokenize
import torch
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from rank_bm25 import BM25Okapi  # BM25 for hybrid search
import logging
from huggingface_hub import login
import os

nltk.download('punkt', quiet=True)
login(token='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')
# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Set PYTORCH_CUDA_ALLOC_CONF to allow expandable segments
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

class Hogragger:
    def __init__(self, corpus_path, model_name='sentence-transformers/all-MiniLM-L12-v2', qa_model='deepset/roberta-large-squad2', classifier_model='deepset/roberta-large-squad2'):
        self.corpus = self.load_corpus(corpus_path)
        self.cleaned_passages = self.preprocess_corpus()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        logging.info(f"Using device: {self.device}")

        # Initialize embedding model and build FAISS index
        self.model = SentenceTransformer(model_name).to(self.device)
        self.index = self.build_faiss_index()

        # Initialize BM25 for lexical matching
        self.bm25 = self.build_bm25_index()

        # Initialize classifier for question type prediction
        self.tokenizer = AutoTokenizer.from_pretrained(classifier_model)
        self.classifier = AutoModelForSequenceClassification.from_pretrained(classifier_model).to(self.device)

        # QA Model
        self.qa_model = pipeline('question-answering', model=qa_model, device=0 if self.device == 'cuda' else -1)

    def load_corpus(self, path):
        logging.info(f"Loading corpus from {path}")
        with open(path, "r") as f:
            corpus = json.load(f)
        logging.info(f"Loaded {len(corpus)} documents")
        return corpus

    def preprocess_corpus(self):
        cleaned_passages = []
        for article in self.corpus:
            body = article.get('body', '')
            clean_body = re.sub(r'<.*?>', '', body)  # Clean HTML tags
            clean_body = re.sub(r'\s+', ' ', clean_body).strip()  # Clean extra spaces

            # Simply take the full cleaned text as a passage without chunking or sentence splitting
            cleaned_passages.append(self.create_passage(article, clean_body))

        logging.info(f"Created {len(cleaned_passages)} passages")
        return cleaned_passages

    def create_passage(self, article, chunk):
        """Creates a passage dictionary from an article and chunk of text."""
        return {
            "title": article['title'],
            "author": article.get('author', 'Unknown'),
            "published_at": article['published_at'],
            "category": article['category'],
            "url": article['url'],
            "source": article['source'],
            "passage": chunk.strip()
        }

    def build_faiss_index(self):
        logging.info("Building FAISS index...")
        embeddings = self.model.encode([p['passage'] for p in self.cleaned_passages], convert_to_tensor=True, device=self.device)
        embeddings = np.array(embeddings.cpu()).astype('float32')
        logging.info(f"Shape of embeddings: {embeddings.shape}")

        index = faiss.IndexFlatL2(embeddings.shape[1])  # Initialize FAISS index

        if self.device == 'cuda':
            try:
                res = faiss.StandardGpuResources()
                gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
                gpu_index.add(embeddings)
                logging.info("Successfully created GPU index")
                return gpu_index
            except RuntimeError as e:
                logging.error(f"GPU index creation failed: {e}")
                logging.info("Falling back to CPU index")

        index.add(embeddings)  # Add embeddings to CPU index
        logging.info("Successfully created CPU index")
        return index

    def build_bm25_index(self):
        logging.info("Building BM25 index...")
        tokenized_corpus = [p['passage'].split() for p in self.cleaned_passages]
        bm25 = BM25Okapi(tokenized_corpus)
        logging.info("Successfully built BM25 index")
        return bm25

    def predict_question_type(self, query):
        inputs = self.tokenizer(query, return_tensors='pt').to(self.device)
        outputs = self.classifier(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()

        labels = {0: 'inference_query', 1: 'comparison_query', 2: 'null_query', 3: 'temporal_query', 4: 'fact_query'}
        return labels.get(prediction, 'unknown_query')

    def retrieve_passages(self, query, k=100, threshold=0.7):
        try:
            # FAISS retrieval
            query_embedding = self.model.encode([query], convert_to_tensor=True, device=self.device)
            D, I = self.index.search(np.array(query_embedding.cpu()), k)

            # BM25 retrieval
            tokenized_query = query.split()
            bm25_scores = self.bm25.get_scores(tokenized_query)

            # Combine FAISS and BM25 results
            hybrid_scores = self.combine_faiss_bm25_scores(D[0], bm25_scores, I)

            # Filter passages based on hybrid score
            passages = [self.cleaned_passages[i] for i, score in zip(I[0], hybrid_scores) if score > threshold]

            logging.info(f"Retrieved {len(passages)} passages using hybrid search for query.")
            return passages
        except Exception as e:
            logging.error(f"Error in retrieving passages: {e}")
            return []

    def combine_faiss_bm25_scores(self, faiss_scores, bm25_scores, passage_indices):
        # Normalize and combine FAISS and BM25 scores
        bm25_scores = np.array(bm25_scores)[passage_indices]
        faiss_scores = np.array(faiss_scores)

        # Convert FAISS distances into similarities by inverting the scale
        faiss_similarities = 1 / (faiss_scores + 1e-6)  # Avoid division by zero

        # Normalize scores (scale between 0 and 1)
        bm25_scores = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores) + 1e-6)
        faiss_similarities = (faiss_similarities - np.min(faiss_similarities)) / (np.max(faiss_similarities) - np.min(faiss_similarities) + 1e-6)

        # Weighted combination (you can adjust weights)
        combined_scores = 0.7 * faiss_similarities + 0.3 * bm25_scores
        combined_scores = np.squeeze(combined_scores)  # Ensure it's a single-dimensional array

        return combined_scores

    def filter_passages(self, query, passages):
        try:
            query_embedding = self.model.encode(query, convert_to_tensor=True)
            passage_embeddings = self.model.encode([p['passage'] for p in passages], convert_to_tensor=True)

            similarities = util.pytorch_cos_sim(query_embedding, passage_embeddings)
            top_k = min(10, len(passages))
            top_indices = similarities.topk(k=top_k)[1].tolist()[0]

            selected_passages = []
            used_titles = set()
            for i in top_indices:
                if passages[i]['title'] not in used_titles:
                    selected_passages.append(passages[i])
                    used_titles.add(passages[i]['title'])

            return selected_passages
        except Exception as e:
            logging.error(f"Error in filtering passages: {e}")
            return []

    def generate_answer(self, query, passages):
        try:
            context = " ".join([p['passage'] for p in passages[:5]])
            answer = self.qa_model(question=query, context=context)
            logging.info(f"Generated answer: {answer['answer']}")
            return answer['answer']
        except Exception as e:
            logging.error(f"Error in generating answer: {e}")
            return "Insufficient information."

    def post_process_answer(self, answer, confidence=0.1):
        answer = re.sub(r'^.*\?', '', answer).strip()
        answer = answer.capitalize()

        if len(answer) > 100:
            truncated = re.match(r'^(.*?[.!?])\s', answer)
            if truncated:
                answer = truncated.group(1)

        if confidence < 0.1:
            logging.warning(f"Answer confidence too low: {confidence}")
            return "I'm unsure about this answer."

        return answer

    def process_query(self, query):
        question_type = self.predict_question_type(query)
        retrieved_passages = self.retrieve_passages(query, k=100, threshold=0.7)
        if not retrieved_passages:
            return {"query": query, "answer": "No relevant information found", "question_type": question_type, "evidence_list": []}

        filtered_passages = self.filter_passages(query, retrieved_passages)
        raw_answer = self.generate_answer(query, filtered_passages)

        evidence_count = min(len(filtered_passages), 4)
        evidence_list = [
            {
                "title": p['title'],
                "author": p['author'],
                "url": p['url'],
                "source": p['source'],
                "category": p['category'],
                "published_at": p['published_at'],
                "fact": self.extract_fact(p['passage'], query)
            } for p in filtered_passages[:evidence_count]
        ]
        final_answer = self.post_process_answer(raw_answer)

        return {
            "query": query,
            "answer": final_answer,
            "question_type": question_type,
            "evidence_list": evidence_list
        }

    def extract_fact(self, passage, query):
        # Extracting most relevant sentence from passage
        sentences = sent_tokenize(passage)
        query_keywords = set(query.lower().split())

        best_sentence = max(sentences, key=lambda s: len(set(s.lower().split()) & query_keywords), default="")
        return best_sentence if best_sentence else (sentences[0] if sentences else "")

  from tqdm.autonotebook import tqdm, trange


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
hogragger = Hogragger(corpus_path='/content/corpus.json')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at deepset/roberta-large-squad2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
with open('/content/train.json', 'r') as f:
    train_data = json.load(f)

In [6]:
for idx, item in enumerate(train_data[:10]):  # Limiting to first 5 queries
    query = item['query']
    print(f"Processing query {idx+1}: {query}")

    # Step 3: Run the query through the pipeline
    result = hogragger.process_query(query)

    # Step 4: Print concise result
    print(f"Predicted Answer: {result['answer']}")
    print(f"Question Type: {result['question_type']}")

    # Print first 2 evidence sources (to limit length)
    for evidence in result['evidence_list'][:]:
        print(f"Evidence: {evidence['title']} - {evidence['fact']}")

    print("\n" + "="*50 + "\n")

Processing query 1: Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?
Predicted Answer: Sam bankman-fried
Question Type: comparison_query
Evidence: Is Sam Bankman-Fried a bad ‘man’ or a good ‘boy’? Lawyers swap opening statements before first witnesses take the stand - Here’s what happened on the second day of the trial, which featured pointed allegations, a friend from MIT, and an audience replete with big names, including Bankman-Fried’s professorial parents and Damian Williams, U.S. attorney for the Southern District of New York.
Evidence: The FTX trial is bigger than Sam Bankman-Fried - “This trial is going to be an excruciating moment for the industry because no one knows what kind of evidence might come out.” Bankman-Fried, the founder of FTX and Alameda Research, is facing seven counts of crimi

In [7]:
import json

# Load train.json for comparison
with open('train.json', 'r') as train_file:
    train_data = json.load(train_file)

# Initialize a list to store the new predictions in the same format as `train.json`
output_data = []

# Process each query from train_data
for idx, item in enumerate(train_data[:10]):  # Limiting to first 10 queries
    query = item['query']
    print(f"Processing query {idx+1}: {query}")

    # Step 3: Run the query through the pipeline
    result = hogragger.process_query(query)

    # Step 4: Store result in train.json format
    result_dict = {
        "query": query,
        "answer": result['answer'],
        "question_type": result['question_type'],
        "evidence_list": [
            {
                "title": ev['title'],
                "fact": ev['fact'],
                "source": ev['source'],
                "url": ev['url'],
                "published_at": ev['published_at'],
                "category": ev['category']
            }
            for ev in result['evidence_list'][:]  # Limiting to the first 2 evidence
        ]
    }

    # Append to output data (new predictions)
    output_data.append(result_dict)

# Save new predictions as output.json
with open('output.json', 'w') as output_file:
    json.dump(output_data, output_file, indent=4)

# Compare the predicted results with `train.json`
for idx, (train_item, pred_item) in enumerate(zip(train_data[:10], output_data)):
    print(f"\nComparing query {idx+1}:")
    print(f"Original Train Answer: {train_item.get('answer', 'N/A')}")
    print(f"Predicted Answer: {pred_item['answer']}")

    print(f"Original Question Type: {train_item.get('question_type', 'N/A')}")
    print(f"Predicted Question Type: {pred_item['question_type']}")

    # Compare evidence (if present in train.json)
    train_evidence = train_item.get('evidence_list', [])
    pred_evidence = pred_item['evidence_list']

    for ev_idx, (train_ev, pred_ev) in enumerate(zip(train_evidence[:], pred_evidence)):  # Compare first 2 evidence
        print(f"Evidence {ev_idx+1} (Train): {train_ev.get('title', 'N/A')} - {train_ev.get('fact', 'N/A')}")
        print(f"Evidence {ev_idx+1} (Predicted): {pred_ev['title']} - {pred_ev['fact']}")

    print("\n" + "="*50 + "\n")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processing query 1: Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?
Processing query 2: Which individual is implicated in both inflating the value of a Manhattan apartment to a figure not yet achieved in New York City's real estate history, according to 'Fortune', and is also accused of adjusting this apartment's valuation to compensate for a loss in another asset's worth, as reported by 'The Age'?
Processing query 3: Who is the figure associated with generative AI technology whose departure from OpenAI was considered shocking according to Fortune, and is also the subject of a prevailing theory suggesting a lack of full truthfulness with the board as reported by TechCrunch?
Processing query 4: Do the TechCrunch article on software companies and the Hacker News article on The Epoch Times both report 

In [None]:
import requests

API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
headers = {"Authorization": "Bearer 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"}

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

def find_most_similar_sentence(prompt, sentences):
    # Prepare the API request payload
    payload = {
        "inputs": {
            "source_sentence": prompt,
            "sentences": sentences
        }
    }

    # Get the API response with sentence similarity scores
    output = query(payload)

    # The API returns similarity scores; we find the index of the highest score
    if isinstance(output, list) and output:
        max_score_idx = output.index(max(output))
        most_similar_sentence = sentences[max_score_idx]

        # Return the most similar sentence and its probability score
        return most_similar_sentence, output[max_score_idx]
    else:
        return None, None  # Handle cases where the API response is empty or invalid

# Example usage
prompt = "That is a happy person"
sentences = [
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]

# Get the most similar sentence with the probability score
most_similar_sentence, score = find_most_similar_sentence(prompt, sentences)
if most_similar_sentence:
    print(f"Most similar sentence: '{most_similar_sentence}' with score: {score}")
else:
    print("No valid response from the model.")


No valid response from the model.
