In [5]:
!pip install tiktoken GitPython networkx dnspython together python-docx tqdm sentence-transformers qdrant-client boto3



In [6]:
!pip install GitPython networkx pymongo dnspython together python-docx tqdm



In [9]:
import os
import networkx as nx
from pymongo import MongoClient
from bson.objectid import ObjectId
from together import Together
from docx import Document
from IPython.display import FileLink, display
import zipfile
import shutil
import textwrap
from tqdm import tqdm
import json
import time
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
import tiktoken
import boto3

In [10]:
# AWS S3 configuration: the bucket name used by this notebook and a client
# pinned to the eu-west-1 region. Edit both values to match your own account.
bucket_name = 'database-chat-bot'
s3_client = boto3.client(
    service_name='s3',
    region_name='eu-west-1',
)

In [11]:
# Qdrant setup
import getpass  # local to this cell: only used to prompt for missing secrets

# Secrets are taken from the environment (or prompted for interactively)
# instead of being committed in the notebook.
# NOTE: the API keys that used to be hard-coded in this cell have been exposed
# to anyone with access to the notebook and should be rotated.
qdrant_client = QdrantClient(
    url="2326c616-40cd-47a3-8fc6-8d4af01f967f.europe-west3-0.gcp.cloud.qdrant.io:6333",
    api_key=os.environ.get("QDRANT_API_KEY") or getpass.getpass("Qdrant API key: "),
)
collection_name = "code-embeddings"  # Updated collection name

# Create collection if it doesn't exist
try:
    qdrant_client.get_collection(collection_name)
except Exception:
    # `Exception` (not a bare `except:`) so Ctrl-C / SystemExit are not mistaken
    # for "collection missing". Any other lookup failure falls through to the
    # create call, which will surface the underlying error if it is not simply
    # a missing collection.
    qdrant_client.create_collection(
        collection_name=collection_name,
        # The vector size must match the embedding model's output dimension.
        vectors_config=VectorParams(size=SentenceTransformer('all-MiniLM-L6-v2').get_sentence_embedding_dimension(), distance=Distance.COSINE),
    )

# Together API setup
# The key is kept in the environment because a later cell reads
# TOGETHER_API_KEY from there as well.
if not os.environ.get("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass.getpass("Together API key: ")
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])

print("Together API initialized successfully.")

Together API initialized successfully.


In [12]:
# JSON encoder that knows how to serialise ObjectId values
class CustomJSONEncoder(json.JSONEncoder):
    """`json.JSONEncoder` subclass that renders `ObjectId` instances as strings."""

    def default(self, obj):
        """Return `str(obj)` for an ObjectId; defer everything else to the base encoder."""
        if not isinstance(obj, ObjectId):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)
        return str(obj)

In [94]:
def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Count how many tokens `string` encodes to under the named tiktoken encoding."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

def _truncate_to_tokens(text, max_tokens, encoding_name="cl100k_base"):
    """Return `text` cut down to at most `max_tokens` tokens of the given tiktoken encoding."""
    if max_tokens <= 0 or not text:
        return ""
    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="replace")
    elif not isinstance(text, str):
        text = str(text)
    encoding = tiktoken.get_encoding(encoding_name)
    # disallowed_special=() makes literal special-token markers that happen to
    # appear in the input encode as ordinary text instead of raising.
    token_ids = encoding.encode(text, disallowed_special=())
    if len(token_ids) <= max_tokens:
        return text
    return encoding.decode(token_ids[:max_tokens])


def get_documentation(content, tree_structure, context="", max_tokens=4000):
    """Ask the Together chat model to write documentation for one piece of code.

    The code, the JSON-serialised file tree and the free-form context are each
    truncated to a token budget (measured with the cl100k_base encoding) so the
    whole prompt stays within `max_input_tokens`, then sent to the model with
    up to three attempts.

    Args:
        content: Source text to document. `None` short-circuits with a fixed
            message.
        tree_structure: JSON-serialisable description of the file tree
            (ObjectId values are handled by `CustomJSONEncoder`).
        context: Extra text to include in the prompt; `None` is treated as "".
        max_tokens: Currently unused; kept so existing callers keep working.
            The response budget sent to the API is fixed at 1000 tokens.

    Returns:
        The generated documentation text, or a fixed failure message if every
        attempt raised.
    """
    if content is None:
        return "No content available to document."

    prompt_template = """
    Act as a senior software engineer. Generate the full documentation of the Code provided to you below. The documentation must be exhaustive, factual, usage-oriented, and easy to understand by non-technical readers. Use plain text formatting with clear section titles and indentation. Do not make up any responses.

    Template:

    1. Introduction
       - File Name
       - File Subject
       - File Relative Path
       - File Version Information (if applicable)

    2. Functional Overview
       - Purpose and usage description of the code
       - Role of the code in the system
       - Key Features and Workflow
       - Example Use Cases (if applicable)

    3. Technical Details
       - Language, Framework, and External Dependencies
       - Key Components and Marker Interfaces
       - Entity Classes and Key Methods
       - Data Sources
       - Performance Considerations

    4. Architecture
       - Design Pattern and Overall Architecture
       - Data Flow
       - Integration Points
       - Security Considerations
       - Scalability and Performance
       - Exception mechanisms, Error Handling, and Logging

    Code:
    {code}

    File Tree Structure:
    {tree_structure}

    Context:
    {context}
    """

    max_input_tokens = 7000  # Leave some room for the response
    prompt_tokens = num_tokens_from_string(prompt_template)
    available_tokens = max(0, max_input_tokens - prompt_tokens)

    # Allocate tokens for each section: tree and context get up to 20% each
    # (capped at 1000), the code gets whatever is left.
    tree_tokens = min(1000, int(available_tokens * 0.2))
    context_tokens = min(1000, int(available_tokens * 0.2))
    content_tokens = available_tokens - tree_tokens - context_tokens

    # Budgets are in tokens, so truncate by tokens rather than by characters.
    truncated_content = _truncate_to_tokens(content, content_tokens)
    truncated_tree = _truncate_to_tokens(
        json.dumps(tree_structure, cls=CustomJSONEncoder), tree_tokens
    )
    truncated_context = _truncate_to_tokens(context or "", context_tokens)

    prompt = prompt_template.format(code=truncated_content, tree_structure=truncated_tree, context=truncated_context)

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            stream = together_client.chat.completions.create(
                model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000,
                stream=True
            )
            pieces = []
            for chunk in stream:
                # Skip chunks that carry no choices so one such chunk does not
                # throw away an otherwise complete response.
                if not chunk.choices:
                    continue
                pieces.append(chunk.choices[0].delta.content or "")
            return "".join(pieces)
        except Exception as e:
            attempts_left = max_attempts - attempt
            if attempts_left == 0:
                print(f"Error: {str(e)}. No attempts left.")
                break
            print(f"Error: {str(e)}. Retrying... ({attempts_left} attempts left)")
            time.sleep(5)  # Wait for 5 seconds before retrying

    return "Failed to generate documentation after multiple attempts."

In [95]:
def process_zip_file(zip_file_key):
    """Download a zip archive, extract it, and generate documentation for its files.

    The archive is fetched with `download_file_from_s3`, unpacked into a fresh
    `/content/extracted_files` directory, turned into a file tree, and passed to
    `process_files_bottom_up`. The resulting text is written to
    `/content/final_documentation.md` and `/content/final_documentation.docx`,
    a download link for the .docx is displayed, and the text is printed.

    Args:
        zip_file_key: Identifier of the archive, passed straight to
            `download_file_from_s3` (presumably the S3 object key - confirm
            against that helper).

    Returns:
        None.
    """
    local_zip_path = '/content/extracted_files.zip'
    temp_dir = '/content/extracted_files'

    # Download the zip file from S3
    download_file_from_s3(zip_file_key, local_zip_path)

    # Start from an empty extraction directory so files left over from a
    # previous run cannot end up in the tree.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)

    # Extract the zip file
    with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    print(f"Files extracted to {temp_dir}")

    # Create file tree and store file content; the extraction directory is the root
    file_tree = create_file_tree_and_store(temp_dir)
    tree_data = graph_to_dict(file_tree, temp_dir)

    # The tree structure is only printed here; it is not persisted to S3.
    print("File Tree Structure:")
    print(json.dumps(tree_data, indent=2, cls=CustomJSONEncoder))

    # Process files to create documentation
    # (assumes process_files_bottom_up returns a str - it is written as text below)
    docs = process_files_bottom_up(file_tree, tree_data)

    # Save documentation to a file. The encoding is explicit because generated
    # text may contain non-ASCII characters and the platform default may not be UTF-8.
    with open("/content/final_documentation.md", "w", encoding="utf-8") as f:
        f.write(docs)
    print("Final documentation created and saved to /content/final_documentation.md")

    # Save documentation to a .docx file
    docx_file_path = "/content/final_documentation.docx"
    save_to_docx(docs, docx_file_path)
    display(FileLink(docx_file_path))

    print(docs)

In [96]:
import os
import json
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from together import Together  # Assuming this is the correct import

# Initialize models and clients
import getpass  # local to this cell: only used to prompt for a missing secret

# Embedding model used to vectorise chat queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# The API key is taken from the environment (or prompted for interactively)
# instead of being committed in the notebook.
# NOTE: the key that used to be hard-coded here has been exposed and should be rotated.
qdrant_client = QdrantClient(
    url="2326c616-40cd-47a3-8fc6-8d4af01f967f.europe-west3-0.gcp.cloud.qdrant.io:6333",
    api_key=os.environ.get("QDRANT_API_KEY") or getpass.getpass("Qdrant API key: "),
)
# NOTE(review): the earlier Qdrant setup cell uses "code-embeddings" (hyphen),
# while this cell rebinds the global to "code_embeddings" (underscore).
# Confirm which collection actually holds the stored embeddings.
collection_name = "code_embeddings"

together_client = Together(api_key=os.getenv("TOGETHER_API_KEY"))

def get_relevant_content(query, limit=5):
    """Embed `query` and return the payloads of the closest `limit` Qdrant hits.

    The query is encoded with the module-level `model`, and the search runs
    against the module-level `collection_name` via `qdrant_client`.
    """
    embedding = model.encode(query).tolist()
    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=embedding,
        limit=limit,
    )
    payloads = []
    for hit in hits:
        payloads.append(hit.payload)
    return payloads

def generate_response(prompt):
    """Send `prompt` to the Together chat model and return the streamed reply as one string.

    Any exception raised while creating or consuming the stream is printed,
    and a fixed apology string is returned instead.
    """
    try:
        chunks = together_client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000,
            stream=True,
        )
        # A delta's content may be None; treat that as an empty piece.
        pieces = [chunk.choices[0].delta.content or "" for chunk in chunks]
        return "".join(pieces)
    except Exception as e:
        print(f"Error generating response: {str(e)}")
        return "Sorry, I couldn't generate a response."

def chatbot(query):
    """Answer `query` using content retrieved from Qdrant as context.

    The 'content' field of every retrieved payload is joined with spaces,
    placed in a prompt ahead of the user's query, and sent to
    `generate_response`, whose result is returned.
    """
    # Gather the text of each retrieved payload (empty string when the field is absent)
    snippets = []
    for payload in get_relevant_content(query):
        snippets.append(payload.get('content', ''))
    content_summary = " ".join(snippets)

    # Context first, then the question, then a cue for the model to answer
    prompt = f"Context: {content_summary}\n\nUser Query: {query}\n\nResponse:"

    return generate_response(prompt)

# Example usage: one interactive question/answer round trip with the chatbot.
# NOTE(review): inside a notebook kernel `__name__` is "__main__", so this guard
# does not stop the cell from running; `input()` will block a "Run All" until
# the user types a query.
if __name__ == "__main__":
    user_query = input("You: ")  # blocks until the user submits a line
    response = chatbot(user_query)  # retrieval + generation for that query
    print(f"Bot: {response}")

You:  What is CNN?


Bot: CNN stands for Cable News Network. It is an American news-based pay television channel founded in 1980 by Ted Turner and his company, Turner Broadcasting System (TBS). CNN is one of the first 24-hour cable news channels and has become a leading source of news and information for millions of people around the world.

CNN provides a wide range of news coverage, including breaking news, in-depth analysis, and live coverage of events from around the globe. The channel has a large team of journalists, anchors, and correspondents who work together to bring viewers the latest news and information.

Some of the key features of CNN include:

1. Breaking news coverage: CNN is known for its fast and accurate coverage of breaking news events, such as natural disasters, elections, and major news stories.
2. In-depth analysis: CNN provides in-depth analysis and commentary on news events, helping viewers to understand the context and implications of the news.
3. Live coverage: CNN provides live 