In [23]:
# Install necessary libraries
!pip install openai pandas

# Import necessary libraries
from openai import OpenAI
import pandas as pd

# Initialize OpenAI client
# The client is pointed at an HTTP endpoint on this machine (port 1234)
# instead of the default hosted API.
# NOTE(review): the "lm-studio" key looks like a placeholder for a local
# LM Studio server — confirm that server is running before executing later cells.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# Function to get embeddings
def get_embedding(text, model="nomic-ai/nomic-embed-text-v1.5-GGUF"):
    """Return the embedding vector for ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Identifier of the embedding model passed to the endpoint.

    Returns:
        The ``embedding`` field of the first item in the response.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Load the Q&A data
with open('Q&A_format.md', 'r', encoding='utf-8') as file:
    data = file.read()

# Split the document into entries; each entry starts with a "Question: " marker
questions_answers = data.split("Question: ")

# Process the split data into a structured format
qa_pairs = []
for qa in questions_answers[1:]:  # Skip whatever precedes the first marker
    # partition() splits on the FIRST "Answer: " only, so an answer whose own
    # text contains "Answer: " is kept whole instead of being truncated.
    # When the marker is missing, the answer is the empty string.
    question, _, answer = qa.partition("Answer: ")
    qa_pairs.append({"question": question.strip(), "answer": answer.strip()})

# Convert to DataFrame; naming the columns keeps the frame well-formed
# (and the 'question' lookup below valid) even when no Q&A pairs were parsed
df = pd.DataFrame(qa_pairs, columns=["question", "answer"])

# Get embeddings for questions (get_embedding is called once per question)
df['question_embedding'] = df['question'].apply(get_embedding)

# Save the embeddings and QA pairs for future use
# NOTE(review): CSV cells are text, so the embedding vectors come back as
# strings when this file is read again — parse them before computing similarities.
df.to_csv('qa_embeddings.csv', index=False)

import numpy as np
from scipy.spatial.distance import cosine

# Function to find the most similar question
def find_most_similar_question(query, df, model="nomic-ai/nomic-embed-text-v1.5-GGUF"):
    """Return the row of ``df`` whose question embedding is closest to ``query``.

    Closeness is cosine similarity (``1 - cosine distance``) between the
    query's embedding and each value in ``df['question_embedding']``.

    Side effect: a ``similarity`` column is written to ``df`` and is
    overwritten on every call.

    Args:
        query: Text to look up.
        df: DataFrame with a ``question_embedding`` column of numeric vectors.
        model: Embedding model identifier forwarded to ``get_embedding``.

    Returns:
        The best-matching row as a Series, including its ``similarity`` score.

    Raises:
        ValueError: If ``df`` has no rows to search.
    """
    # Fail early with a clear message instead of an opaque idxmax() error,
    # and before spending a request on embedding the query.
    if df.empty:
        raise ValueError("Cannot search an empty DataFrame of questions")

    query_embedding = get_embedding(query, model)
    df['similarity'] = df['question_embedding'].apply(
        lambda x: 1 - cosine(query_embedding, x)
    )
    # idxmax() returns an index *label*, so select with .loc; .iloc would treat
    # it as a position and pick the wrong row on any non-default index.
    most_similar_idx = df['similarity'].idxmax()
    return df.loc[most_similar_idx]

# Function to get answer based on query
def get_answer(query, df):
    """Look up the stored answer whose question best matches ``query``.

    Args:
        query: Text of the user's question.
        df: DataFrame of Q&A pairs, passed through to
            ``find_most_similar_question``.

    Returns:
        The ``answer`` field of the best-matching row.
    """
    return find_most_similar_question(query, df)['answer']

# Example usage
# The lookup always returns the highest-scoring stored answer; no minimum
# similarity is enforced, so a query outside the Q&A data still gets a reply.
query = "What is the meaning of life?"
answer = get_answer(query, df)
print(f"Query: {query}\nAnswer: {answer}")

Query: What is the meaning of life?
Answer: The first steps consists in defining the objective function you want to optimize. This is the most important step: you must define an experimentation which is reproducible and which will produce results that can be measured to approximate the function you want to optimize.


In [30]:
# Example usage: a query on a topic the Q&A data appears to cover
# (the printed answer below describes classification tasks)
query = "What is classification?"
answer = get_answer(query, df)
print(f"Query: {query}\nAnswer: {answer}")

Query: What is classification?
Answer: A classification task involves assigning input data to one of several predefined categories or classes. The goal is to predict the category to which new data points belong, based on the training data. Examples include identifying email as spam or not spam, classifying images of animals, or recognizing spoken words.


In [31]:
# Example usage: a one-word query apparently unrelated to the stored questions;
# the nearest stored answer is still returned because no similarity cutoff is applied
query = "Canard?"
answer = get_answer(query, df)
print(f"Query: {query}\nAnswer: {answer}")

Query: Canard?
Answer: It's a method for demonstrating graphically the locality, spread and skewness groups of numerical data through their quartiles. In addition to the box, there can be lines (which are called whiskers) extending from the box indicating variability outside the upper and lower quartiles, thus, the plot is also called the box-and-whisker plot.
