In [None]:
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance
from langchain import LLMChain, PromptTemplate
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import uuid
import uvicorn
import nest_asyncio

# Apply nest_asyncio for running in Jupyter (patches asyncio so an event loop
# can be started while the notebook's own loop is already running).
nest_asyncio.apply()

# Initialize FastAPI app
app = FastAPI()

# Initialize Qdrant client (connects to localhost on port 6333)
client = QdrantClient("localhost", port=6333)

# Load SentenceTransformer model; it is used for both indexing and query embedding
MODEL = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Load data and create collection
df = pd.read_csv('output_embeddings.csv')
text_column_name = 'text'  # name of the CSV column holding the text to index

# Recreate collection in Qdrant with 768-dimensional vectors and cosine distance.
# NOTE(review): recreate_collection presumably drops an existing "similar_text"
# collection, so re-running this cell starts from an empty index — confirm.
# NOTE(review): the model name ends in "dot", which suggests it was tuned for
# dot-product scoring — confirm Distance.COSINE is the intended metric.
client.recreate_collection(
    collection_name="similar_text",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

# Insert data into Qdrant.
#
# Rows whose text is missing (NaN) are skipped because they cannot be embedded.
# Texts are encoded and upserted in batches: one model call and one blocking
# request per batch instead of one per row, while keeping memory bounded.
INSERT_BATCH_SIZE = 64
texts_to_index = df[text_column_name].dropna().tolist()

for batch_start in range(0, len(texts_to_index), INSERT_BATCH_SIZE):
    batch_texts = texts_to_index[batch_start:batch_start + INSERT_BATCH_SIZE]
    # encode() on a list returns one vector per input text, in the same order.
    batch_vectors = MODEL.encode(batch_texts).tolist()
    client.upsert(
        collection_name="similar_text",
        wait=True,
        points=[
            PointStruct(
                id=str(uuid.uuid4()),  # random UUID per point
                vector=vector,
                # The vector is also kept in the payload to preserve the
                # stored schema ("text" and "text_embeddings").
                payload={"text": text, "text_embeddings": vector},
            )
            for text, vector in zip(batch_texts, batch_vectors)
        ],
    )

# Define the input schema for similarity search
class Query(BaseModel):
    """Request body for the /search endpoint."""
    question: str  # free-text question to embed and search with

@app.post("/search")
def search_similar_text(query: Query):
    """Return the three stored texts closest to the question.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        A list of dicts, one per hit, with the keys ``text`` (taken from the
        hit's payload) and ``similarity_score`` (the hit's score).
    """
    # Embed the question with the same model used at indexing time.
    query_vector = MODEL.encode(query.question).tolist()

    hits = client.search(
        collection_name="similar_text",
        query_vector=query_vector,
        limit=3,
    )

    matches = []
    for hit in hits:
        matches.append(
            {"text": hit.payload["text"], "similarity_score": hit.score}
        )
    return matches

In [None]:
from fastapi.responses import JSONResponse

# Initialize GPT-Neo model and tokenizer.
# pad_token_id is set to the tokenizer's EOS token id when the model is loaded.
model_name = "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

# Text-generation pipeline around the model and tokenizer.
# NOTE(review): do_sample is not set here; temperature usually only takes
# effect when sampling is enabled — confirm whether do_sample=True was intended.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,          # Restrict response length
    temperature=0.2,             # Lower randomness for concise answers
    repetition_penalty=1.5,      # Increase penalty to reduce repetition
)
# Wrap the pipeline so LangChain can use it as an LLM.
huggingface_llm = HuggingFacePipeline(pipeline=generator)

# Prompt with two placeholders: the retrieved context and the user's question.
prompt = PromptTemplate(
    template=(
        "Context: {context} "
        "Question: {question} Answer:"
    ),
    input_variables=["context", "question"]
)

# Chain that fills the prompt template and passes it to the wrapped pipeline.
llm_chain = LLMChain(prompt=prompt, llm=huggingface_llm)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with the LLM chain, using the closest stored text as context.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (the retrieved
        text, or an empty string when the search returned nothing) and
        ``answer`` (the chain output with whitespace normalized).
    """
    question_embeddings = MODEL.encode(query.question).tolist()
    # Get context from Qdrant to focus the answer; only the top hit is used.
    search_result = client.search(
        collection_name="similar_text",
        query_vector=question_embeddings,
        limit=1,
    )
    # Joining an empty result list yields "", so no separate empty check is needed.
    context = " ".join(item.payload["text"] for item in search_result)

    answer = llm_chain.run(context=context, question=query.question)

    # Collapse newlines and whitespace runs into single spaces. Deleting "\n"
    # outright would glue the last word of a line to the first word of the next.
    answer = " ".join(answer.split())

    # Return JSON response
    response = {
        "question": query.question,
        "context": context,
        "answer": answer,
    }
    return JSONResponse(content=response, headers={"Content-Type": "application/json"})

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi.responses import JSONResponse
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

# Initialize the GPT-Neo model and tokenizer; the EOS token id is used as pad token id.
model_name = "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

# Text-generation pipeline: at most 150 new tokens, with a repetition penalty.
# NOTE(review): do_sample is not set here; temperature usually only takes
# effect when sampling is enabled — confirm whether do_sample=True was intended.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    temperature=0.5,
    repetition_penalty=1.5,
)
# Wrap the pipeline so LangChain can use it as an LLM.
huggingface_llm = HuggingFacePipeline(pipeline=generator)

# Set up ChatPromptTemplate.
# The template is assembled from adjacent string literals, so each fragment
# must end with a space; otherwise two sentences are fused in the rendered
# prompt ("...tasks.Based on...").
system_message = SystemMessagePromptTemplate.from_template(
        "You are an assistant for question-answering tasks. "
        "Based on the context provided, generate a concise answer in three sentences maximum. "
        "When the generated answer is finished, end it without repeating anything. "
        "Context: {context} "
        "Question: {question} Answer:"
)
chat_prompt = ChatPromptTemplate.from_messages([system_message])

# Chain that fills the chat prompt and passes it to the wrapped pipeline.
llm_chain = LLMChain(prompt=chat_prompt, llm=huggingface_llm)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

# NOTE(review): this cell registers the route on the existing `app` object; if
# a handler for POST /llm_query was already registered on it, confirm which
# handler the router actually dispatches to.
@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with the chat-prompt chain, using the two closest texts as context.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (the retrieved
        texts joined by spaces, or an empty string when the search returned
        nothing) and ``answer`` (the chain output with whitespace normalized).
    """
    question_embeddings = MODEL.encode(query.question).tolist()
    # Retrieve context from Qdrant
    search_result = client.search(
        collection_name="similar_text",
        query_vector=question_embeddings,
        limit=2,
    )
    # Joining an empty result list yields "", so no separate empty check is needed.
    context = " ".join(item.payload["text"] for item in search_result)

    # Generate answer with ChatPromptTemplate
    answer = llm_chain.run(context=context, question=query.question)

    # Collapse newlines and whitespace runs into single spaces. Deleting "\n"
    # outright would glue the last word of a line to the first word of the next.
    answer = " ".join(answer.split())

    # Return JSON response
    response = {
        "question": query.question,
        "context": context,
        "answer": answer,
    }
    return JSONResponse(content=response, headers={"Content-Type": "application/json"})

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import uvicorn

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Initialize DistilBERT QA model and its tokenizer
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Set up the QA pipeline (question-answering task over a supplied context)
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer
)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with the extractive QA pipeline over retrieved context.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (the retrieved
        texts joined by spaces) and ``answer``. ``answer`` is an empty string
        when there is no context or the question is blank.
    """
    # Embed the question and fetch the two closest texts from Qdrant.
    question_embeddings = MODEL.encode(query.question).tolist()
    search_result = client.search(
        collection_name="similar_text",
        query_vector=question_embeddings,
        limit=2,
    )
    # Combine context from search results; an empty result list yields "".
    context = " ".join(item.payload["text"] for item in search_result)

    # The question-answering pipeline rejects empty question/context strings,
    # so answer with an empty string instead of letting the request fail.
    if context.strip() and query.question.strip():
        answer_text = qa_pipeline(question=query.question, context=context)["answer"]
    else:
        answer_text = ""

    # Prepare the response with the answer text
    response = {
        "question": query.question,
        "context": context,
        "answer": answer_text,
    }
    return JSONResponse(content=response, headers={"Content-Type": "application/json"})

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import uvicorn

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Initialize RoBERTa QA model and its tokenizer
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Set up the QA pipeline; max_answer_len=100 is passed as a pipeline default
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    max_answer_len=100
)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with the RoBERTa QA pipeline over retrieved context.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (the retrieved
        texts joined by spaces) and ``answer``. ``answer`` is an empty string
        when there is no context or the question is blank.
    """
    # Embed the question and fetch the two closest texts from Qdrant.
    question_embeddings = MODEL.encode(query.question).tolist()
    search_result = client.search(
        collection_name="similar_text",
        query_vector=question_embeddings,
        limit=2,
    )
    # Combine context from search results; an empty result list yields "".
    context = " ".join(item.payload["text"] for item in search_result)

    # The question-answering pipeline rejects empty question/context strings,
    # so answer with an empty string instead of letting the request fail.
    if context.strip() and query.question.strip():
        answer_text = qa_pipeline(question=query.question, context=context)["answer"]
    else:
        answer_text = ""

    # Prepare the response with the answer text
    response = {
        "question": query.question,
        "context": context,
        "answer": answer_text,
    }
    return JSONResponse(content=response, headers={"Content-Type": "application/json"})

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import uvicorn

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Initialize GPT-2 model; the EOS token id is used as pad token id.
model_name = "gpt2-medium"  # GPT-2 checkpoint to load (the "try gpt2-medium" suggestion is already applied)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

# Set up text generation pipeline with tuned parameters for conciseness.
# NOTE(review): do_sample is not set here; temperature usually only takes
# effect when sampling is enabled — confirm whether do_sample=True was intended.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,            # Keep answer length concise
    temperature=0.3,               # Control randomness
    repetition_penalty=1.3         # Penalize repetition
)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with GPT-2, prompted with context retrieved from Qdrant.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (the retrieved
        texts joined by spaces, possibly empty) and ``answer`` (the first
        generated answer, without the prompt or an "Answer:" label).
    """
    # Embed the question and fetch the two closest texts from Qdrant.
    question_embeddings = MODEL.encode(query.question).tolist()
    search_result = client.search(
        collection_name="similar_text",
        query_vector=question_embeddings,
        limit=2,
    )
    # Combine context from search results; an empty result list yields "".
    context = " ".join(item.payload["text"] for item in search_result)

    # Create a prompt combining context and question
    prompt = f"Context: {context}\nQuestion: {query.question}\nAnswer:"

    # Generate answer using GPT-2
    generated = generator(prompt)[0]["generated_text"]

    # By default the text-generation pipeline returns the prompt followed by
    # the continuation. Strip the prompt by position rather than by searching
    # for "Answer:", which could also occur inside the context or question.
    completion = generated[len(prompt):] if generated.startswith(prompt) else generated

    # If the model goes on to emit another "Answer:" turn, keep only the text
    # before it so a single answer is returned in every case.
    answer = completion.split("Answer:", 1)[0].strip()

    # Prepare the response with the answer text
    response = {
        "question": query.question,
        "context": context,
        "answer": answer,
    }
    return JSONResponse(content=response, headers={"Content-Type": "application/json"})

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import uvicorn

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Initialize T5 model and tokenizer
model_name = "t5-base"  # T5 checkpoint to load; larger variants can be swapped in here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set up a text generation pipeline with T5's seq2seq structure for QA
generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100,               # Keeps answers concise
    repetition_penalty=1.2,      # Helps reduce repetitive answers
)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with the T5 pipeline, using retrieved texts as context.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (retrieved texts
        joined by spaces, or "" when nothing was found) and ``answer`` (the
        pipeline's ``generated_text``).
    """
    # Embed the question and look up the two nearest stored texts.
    query_vector = MODEL.encode(query.question).tolist()
    hits = client.search(
        collection_name="similar_text",
        query_vector=query_vector,
        limit=2,
    )

    # Build the context string from the hit payloads.
    if hits:
        context = " ".join([hit.payload["text"] for hit in hits])
    else:
        context = ""

    # T5-style input: "question: ... context: ..."
    t5_input = f"question: {query.question} context: {context}"
    generations = generator(t5_input)
    answer = generations[0]["generated_text"]

    return JSONResponse(
        content={
            "question": query.question,
            "context": context,
            "answer": answer,
        },
        headers={"Content-Type": "application/json"},
    )

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.responses import JSONResponse
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import uvicorn

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Initialize MPT-7B-Chat model and tokenizer
# NOTE(review): this checkpoint may need trust_remote_code=True depending on
# the installed transformers version — confirm before relying on this cell.
model_name = "mosaicml/mpt-7b-chat"  # NOTE: too heavy to use for this case and requirements
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set up a text generation pipeline for question answering (sampling enabled)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,         # Keep answers concise
    do_sample=True, 
    repetition_penalty=1.2,     # Reduces repetitive answers
)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with MPT-7B-Chat, prompted with context retrieved from Qdrant.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (the retrieved
        texts joined by spaces, possibly empty) and ``answer`` (the generated
        continuation with the prompt removed).
    """
    # Generate embeddings and search for context from Qdrant
    question_embeddings = MODEL.encode(query.question).tolist()
    search_result = client.search(
        collection_name="similar_text",
        query_vector=question_embeddings,
        limit=2,
    )
    # Combine context from search results; an empty result list yields "".
    context = " ".join(item.payload["text"] for item in search_result)

    # Formulate the input for MPT-7B-Chat
    prompt = f"Based on the context provided, answer concisely: {context} Question: {query.question} Answer:"

    # Generate answer
    generated = generator(prompt)[0]["generated_text"]

    # By default the text-generation pipeline returns the prompt followed by
    # the continuation; drop the echoed prompt so the answer does not repeat
    # the context and question already present in the response.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    answer = generated.strip()

    response = {
        "question": query.question,
        "context": context,
        "answer": answer,
    }
    return JSONResponse(content=response, headers={"Content-Type": "application/json"})

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
import uvicorn

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Initialize the generative model - DistilGPT-2
# NOTE(review): an earlier note mentioned switching to "distilbert/distilgpt2",
# but the id actually loaded is "distilgpt2" — confirm which is intended.
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# NOTE(review): do_sample is not set here; temperature usually only takes
# effect when sampling is enabled — confirm whether do_sample=True was intended.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    temperature=0.7,
    repetition_penalty=1.5,
)

# Initialize the embedding model for Qdrant (with 768-dimensional embeddings)
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Connect to Qdrant (localhost, port 6333)
qdrant_client = QdrantClient("localhost", port=6333)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with DistilGPT-2, prompted with context retrieved from Qdrant.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (the retrieved
        texts joined by spaces, possibly empty) and ``answer`` (the generated
        text with the prompt removed).
    """
    # Step 1: Embed the question and perform similarity search in Qdrant.
    # Use this cell's own `qdrant_client`; the bare name `client` is not
    # defined in this cell and only resolved through an earlier cell's global.
    question_embeddings = embedding_model.encode(query.question).tolist()
    search_result = qdrant_client.search(
        collection_name="similar_text",
        query_vector=question_embeddings,
        limit=2,  # Adjust to get the top 2 relevant results
    )

    # Step 2: Prepare the context from search results ("" when there are none)
    context = " ".join(result.payload["text"] for result in search_result)

    # Step 3: Generate the answer using the generative model
    input_text = f"Context: {context} Question: {query.question} Answer:"
    generated_text = generator(input_text, num_return_sequences=1)

    # Step 4: Extract the answer by removing the echoed prompt
    answer = generated_text[0]["generated_text"].replace(input_text, "").strip()

    # Return JSON response
    response = {
        "question": query.question,
        "context": context,
        "answer": answer,
    }
    return JSONResponse(content=response, headers={"Content-Type": "application/json"})

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Initialize FLAN-T5 model and tokenizer (used directly, without a pipeline)
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Set up the sentence-transformers model for creating question embeddings
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Initialize Qdrant client (localhost, port 6333)
qdrant_client = QdrantClient("localhost", port=6333)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with FLAN-T5, prompted with context retrieved from Qdrant.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (retrieved texts
        joined by spaces, or "" when nothing was found) and ``answer`` (the
        decoded first generated sequence).
    """
    # Embed the question and look up the two nearest stored texts.
    query_vector = embedding_model.encode(query.question).tolist()
    hits = qdrant_client.search(
        collection_name="similar_text",
        query_vector=query_vector,
        limit=2,
    )

    # Build the context string from the hit payloads.
    if hits:
        context = " ".join([hit.payload["text"] for hit in hits])
    else:
        context = ""

    # Tokenize the prompt and generate with beam search (5 beams, max length 150).
    prompt_text = f"Context: {context} Question: {query.question} Answer:"
    encoded = tokenizer(prompt_text, return_tensors="pt")
    generated_ids = model.generate(
        encoded["input_ids"],
        max_length=150,
        num_beams=5,
        early_stopping=True,
    )

    # Decode the first generated sequence, dropping special tokens.
    answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return JSONResponse(
        content={
            "question": query.question,
            "context": context,
            "answer": answer,
        },
        headers={"Content-Type": "application/json"},
    )

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
if __name__ == "__main__":
    # This cell's import list does not include uvicorn; import it here so the
    # cell does not depend on an earlier cell having imported it.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

In [None]:
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
import uvicorn

# Fresh FastAPI instance for this cell; routes registered on a previously
# created `app` object are not part of this one.
app = FastAPI()

# Load the Meta-Llama model and tokenizer
# NOTE(review): this repository is presumably gated on the Hugging Face Hub and
# may require an authenticated session to download — confirm.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Text-generation pipeline: at most 150 new tokens, mild repetition penalty.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    temperature=0.7,
    repetition_penalty=1.1,
)

# Load the embedding model for Qdrant
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Connect to Qdrant (localhost, port 6333)
qdrant_client = QdrantClient("localhost", port=6333)

class LLMQuery(BaseModel):
    """Request body for the /llm_query endpoint."""
    question: str  # question to answer from the retrieved context

@app.post("/llm_query")
def llm_query(query: LLMQuery):
    """Answer a question with the Llama pipeline, prompted with retrieved context.

    Args:
        query: Request body carrying the user's ``question``.

    Returns:
        JSONResponse with the keys ``question``, ``context`` (retrieved texts
        joined by spaces, or a fixed placeholder sentence when nothing was
        found) and ``answer`` (the generated text with the prompt removed).
    """
    # Embed the question and look up the two nearest stored texts.
    query_vector = embedding_model.encode(query.question).tolist()
    hits = qdrant_client.search(
        collection_name="similar_text",
        query_vector=query_vector,
        limit=2,
    )

    # Join the hit texts, or fall back to a placeholder when there are no hits.
    if hits:
        context = " ".join([hit.payload["text"] for hit in hits])
    else:
        context = "No relevant context found."

    # Generate a single sequence from the combined prompt.
    prompt_text = f"Context: {context} Question: {query.question} Answer:"
    generations = generator(prompt_text, num_return_sequences=1)

    # Remove the echoed prompt from the generated text and trim whitespace.
    full_text = generations[0]["generated_text"]
    answer = full_text.replace(prompt_text, "").strip()

    return JSONResponse(
        content={
            "question": query.question,
            "context": context,
            "answer": answer,
        },
        headers={"Content-Type": "application/json"},
    )

# Start the API server on all interfaces (0.0.0.0), port 8000, when this code
# runs as the main module.
# NOTE(review): in a notebook kernel this call presumably blocks the cell until
# the server is stopped — confirm.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)