In [33]:
# Install necessary libraries
!pip install openai pandas

# Import necessary libraries
from openai import OpenAI
import pandas as pd

# Initialize OpenAI client
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# Function to get embeddings
def get_embedding(text, model="nomic-ai/nomic-embed-text-v1.5-GGUF"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

# Load the Q&A data
with open('Q&A_format.md', 'r', encoding='utf-8') as file:
    data = file.read()

# Manually split the document based on headers
questions_answers = data.split("Question: ")

# Process the split data into a structured format
qa_pairs = []
for qa in questions_answers[1:]:  # Skipping the first empty split
    parts = qa.split("Answer: ")
    question = parts[0].strip()
    answer = parts[1].strip() if len(parts) > 1 else ""
    qa_pairs.append({"question": question, "answer": answer})

# Convert to DataFrame
df = pd.DataFrame(qa_pairs)

# Get embeddings for questions
df['question_embedding'] = df['question'].apply(lambda x: get_embedding(x))

# Save the embeddings and QA pairs for future use
df.to_csv('qa_embeddings.csv', index=False)

import numpy as np
from scipy.spatial.distance import cosine

# Function to find the most similar question
def find_most_similar_question(query, df, model="nomic-ai/nomic-embed-text-v1.5-GGUF"):
    query_embedding = get_embedding(query, model)
    df['similarity'] = df['question_embedding'].apply(lambda x: 1 - cosine(query_embedding, x))
    most_similar_idx = df['similarity'].idxmax()
    return df.iloc[most_similar_idx]

# Function to detect if the query is a coding request
def is_coding_request(query):
    coding_keywords = ['code', 'script', 'program', 'function', 'class']
    return any(keyword in query.lower() for keyword in coding_keywords)

# Function to generate code using the model
def generate_code(prompt, model="nomic-ai/nomic-embed-text-v1.5-GGUF"):
    response = client.completions.create(
        model=model,
        prompt=prompt,
        max_tokens=200,
        temperature=0.5
    )
    return response.choices[0].text.strip()

# Function to get answer based on query
def get_answer(query, df):
    if is_coding_request(query):
        return generate_code(query)
    else:
        most_similar_qa = find_most_similar_question(query, df)
        return most_similar_qa['answer']

# Example usage
query = "What is the meaning of life?"
answer = get_answer(query, df)
print(f"Query: {query}\nAnswer: {answer}")

Query: What is the meaning of life?
Answer: The first steps consists in defining the objective function you want to optimize. This is the most important step: you must define an experimentation which is reproducible and which will produce results that can be measured to approximate the function you want to optimize.


In [34]:
# Example usage
query = "What is classification?"
answer = get_answer(query, df)
print(f"Query: {query}\nAnswer: {answer}")

Query: What is classification?
Answer: Classification is the process of sorting objects or concepts into categories based on common characteristics, attributes, or features. In other words, it is the act of grouping similar things together and distinguishing them from other things that are not similar.

In the context of data analysis, classification is a type of supervised learning algorithm used to predict the class label (or category) of new, unseen data based on its features. The goal of classification is to create a model that can accurately assign labels or categories to data points.

Types of Classification:

There are several types of classification techniques, including:

1. **Binary classification**: This type of classification involves predicting whether an object belongs to one of two classes (e.g., spam vs. non-spam emails).
2. **Multi-class classification**: This type of classification involves predicting which class an object belongs to among multiple classes (e.g., diff

In [35]:
query = "Canard?"
answer = get_answer(query, df)
print(f"Query: {query}\nAnswer: {answer}")

Query: Canard?
Answer: It's a method for demonstrating graphically the locality, spread and skewness groups of numerical data through their quartiles. In addition to the box, there can be lines (which are called whiskers) extending from the box indicating variability outside the upper and lower quartiles, thus, the plot is also called the box-and-whisker plot.


In [42]:
query = "Code me a simple reservoir"
answer = get_answer(query, df)
print(f"Query: {query}\nAnswer: {answer}")

Query: Code me a simple reservoir
Answer: simulation model in Python with the following features:
    1. Reservoir properties: permeability, porosity, thickness
    2. Well data: well name, x and y coordinates, depth, production rate
    3. Grid dimensions: number of rows, columns, and layers
    4. Time step and total simulation time

Here is a simple implementation:

```Python
class ReservoirSimulation:
    def __init__(self):
        self.reservoir_properties = {}
        self.well_data = []
        self.grid_dimensions = {}
        self.time_step = None
        self.total_simulation_time = None

    def set_reservoir_properties(self, permeability, porosity, thickness):
        self.reservoir_properties['permeability'] = permeability
        self.reservoir_properties['porosity'] = porosity
        self.reservoir_properties['thickness'] = thickness

    def add_well(self, well_name, x, y, depth, production
