In [3]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [4]:
import os

# Load data from .md files in the "files" folder
def load_md_files(input_dir):
    docs = []
    for root, _, files in os.walk(input_dir):  # Walk through directory
        for file in files:
            if file.endswith(".md"):  # Only load .md files
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    docs.append({
                        "filename": file,
                        "content": f.read()
                    })
    return docs

# Specify the input directory
input_dir = './files'
docs = load_md_files(input_dir)

# Verify loaded files
print(f"Loaded {len(docs)} Markdown files.")


Loaded 1 Markdown files.


In [4]:
# testing
print(docs[:3])

[{'filename': 'Readme.md', 'content': '# Project Workflow Documentation\n\nThis documentation provides a detailed overview of the code and instructions on how to use it. The code is a Flask application that integrates with MongoDB and OpenAI\'s GPT model to manage a project workflow.\n\n![API calls](image.png)\n*Figure 1: API calls*\n## Table of Contents\n\n1. [Introduction](#introduction)\n2. [Prerequisites](#prerequisites)\n3. [Installation](#installation)\n4. [Configuration](#configuration)\n5. [Usage](#usage)\n6. [Endpoints](#endpoints)\n7. [Code Explanation](#code-explanation)\n8. [Contributing](#contributing)\n9. [License](#license)\n\n## Introduction\n\nThis Flask application is designed to manage a project workflow by interacting with a MongoDB database and OpenAI\'s GPT model. It provides endpoints to initialize a project, ask questions, generate user stories, and generate development plans.\n\n## Prerequisites\n\nBefore you begin, ensure you have met the following requirement

In [8]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document, Settings
from openai import OpenAI
import numpy as np
from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# Initialize OpenAI client
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")  # Automatically uses the environment variable
)

# Define a reusable embedding function using the new OpenAI client
class OpenAIEmbedModel:
    def __init__(self, client, model_name="text-embedding-ada-002"):
        self.client = client
        self.model_name = model_name

    def embed(self, texts):
        if isinstance(texts, str):
            texts = [texts]  # Wrap single string in a list
        try:
            response = self.client.embeddings.create(
                model=self.model_name,
                input=texts
            )
            # Extract embeddings for each text
            embeddings = [data['embedding'] for data in response['data']]
            return np.array(embeddings)  # Convert to numpy array
        except Exception as e:
            print(f"Error generating embeddings: {e}")
            return None

# Initialize the embedding model with the new client
embed_model = OpenAIEmbedModel(client=client)

# Load documents from Markdown files
docs = SimpleDirectoryReader("./files", required_exts=[".md"]).load_data()

# Create a list to store document embeddings
doc_embeddings = []

# Loop through each Markdown document
for doc in docs:
    doc_text = doc.text  # Use the .text attribute to access the content

    # Generate embeddings for the document text
    embedding = embed_model.embed(doc_text)

    if embedding is not None:
        # Store the embedding for this document
        doc_embeddings.append({
            "filename": doc.filename,  # Access filename attribute
            "embedding": embedding
        })

# Confirm embeddings were generated
print(f"Generated embeddings for {len(doc_embeddings)} documents.")



Error generating embeddings: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Generated embeddings for 0 documents.


In [None]:
# Now `doc_embeddings` contains the embeddings for each document
# print(doc_embeddings)

In [7]:
# Now, use the generated embeddings to create the vector store index
from llama_index.core import VectorStoreIndex

# Create the VectorStoreIndex with the document embeddings
index = VectorStoreIndex.from_embeddings(doc_embeddings)

# The index is now ready for querying
print("Index created and ready for querying.")

AttributeError: type object 'VectorStoreIndex' has no attribute 'from_embeddings'

In [None]:
#Query Engine
from llama_index.llms.openai import OpenAI as LLM_OpenAI
from llama_index.core import Settings

# Setting up the OpenAI LLM (using OpenAI's API)
llm = LLM_OpenAI(model="gpt-4o-mini", request_timeout=300.0)

# Specify the LLM to be used in the settings
Settings.llm = llm

# Setup a query engine on the index previously created (assumes `index` is already defined)
query_engine = index.as_query_engine(streaming=True, similarity_top_k=4)


In [None]:
from llama_index.llms.openai import OpenAI as LLM_OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Set up OpenAI as the LLM (using OpenAI's API)
llm = LLM_OpenAI(model="gpt-4o-mini", request_timeout=300.0)

# Set up the OpenAI embedding model
embed_model = OpenAIEmbedding(model_name="text-embedding-ada-002")

# Load documents
documents = SimpleDirectoryReader("./files").load_data()

# Create the index with the OpenAI embedding model
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

# Create the query engine with OpenAI as the LLM
query_engine = index.as_query_engine(llm=llm)

# Query the engine
response = query_engine.query("Give me the detailed aim of the project as seen from the markdown.")

# Print the response
print(response)
