In [1]:
!pip install boto3 together python-docx tqdm sentence-transformers networkx

Collecting together
  Downloading together-1.2.12-py3-none-any.whl.metadata (11 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Collecting eval-type-backport<0.3.0,>=0.1.3 (from together)
  Downloading eval_type_backport-0.2.0-py3-none-any.whl.metadata (2.2 kB)
Collecting numpy>=1.23.5 (from together)
  Downloading numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic<3.0.0,>=2.6.3 (from together)
  Downloading pydantic-2.9.1-py3-none-any.whl.metadata (146 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.0/147.0 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Collecting typer<0.13,>=0.9 (from together)
  Downloading typer-0.12.5-py3-none-any.

In [2]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [3]:
import os
import boto3
import networkx as nx
from together import Together
from docx import Document
import zipfile
import shutil
import textwrap
from tqdm import tqdm
import json
import time
from sentence_transformers import SentenceTransformer
import uuid
import tiktoken

# AWS S3 setup
s3 = boto3.client('s3')

# Together API setup
os.environ["TOGETHER_API_KEY"] = "8bd4319b6303de31a2363eb3f20dc8276144ed78ee22248d4740ed0ed44b8084"
together_client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

# Sentence Transformer setup
model = SentenceTransformer('all-MiniLM-L6-v2')

def create_file_tree(root_dir):
    G = nx.DiGraph()
    for dirpath, dirnames, filenames in os.walk(root_dir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        G.add_node(dirpath, type='directory')
        parent = os.path.dirname(dirpath)
        if parent != dirpath:  # skip root
            G.add_edge(parent, dirpath)
        for filename in filenames:
            if not filename.startswith('.'):
                file_path = os.path.join(dirpath, filename)
                G.add_node(file_path, type='file')
                G.add_edge(dirpath, file_path)
    return G

def graph_to_dict(graph, root):
    node_data = graph.nodes[root]
    if node_data['type'] == 'file':
        return {"path": root, "type": "file"}
    else:
        return {
            "path": root,
            "type": "directory",
            "children": [graph_to_dict(graph, child) for child in graph.successors(root)]
        }

def get_leaves(G):
    return [node for node, degree in G.out_degree() if degree == 0]

def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def get_documentation(content, tree_structure, context="", max_tokens=4000):
    if not content:
        return "No content available to document."
    
    prompt_template = """
    Act as a senior software engineer. Generate the full documentation of the Code provided to you below. Write the best documentation following the Template below. The documentation must have the following qualities: exhaustive, factual, user-oriented and easy to understand by non-technical readers.

    Template:
    1. File Name and Subject
    2. Project Functional Overview
       - Purpose
       - Key Features
       - Workflow
    3. Technical Details
       - Language, Framework and External Dependencies
       - Key Components and Marker interfaces
       - Entity Classes and Key Methods
       - Data Sources
       - Performance Considerations
    4. Architecture
       - Design Pattern and Overall Architecture
       - Data Flow
       - Integration Points
       - Security Considerations
       - Scalability and Performance
       - Exception mechanisms, Error Handling and Logging

    Code: {code}
    File Tree Structure: {tree_structure}
    Context: {context}
    """

    max_input_tokens = 7000  # Leave some room for the response
    prompt_tokens = num_tokens_from_string(prompt_template)
    available_tokens = max_input_tokens - prompt_tokens

    # Allocate tokens for each section
    tree_tokens = min(1000, int(available_tokens * 0.2))
    context_tokens = min(1000, int(available_tokens * 0.2))
    content_tokens = available_tokens - tree_tokens - context_tokens

    truncated_content = content[:content_tokens]
    truncated_tree = json.dumps(tree_structure)[:tree_tokens]
    truncated_context = context[:context_tokens]

    prompt = prompt_template.format(code=truncated_content, tree_structure=truncated_tree, context=truncated_context)

    retries = 3
    while retries > 0:
        try:
            stream = together_client.chat.completions.create(
                model="meta-llama/Llama-3-8b-chat-hf",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=1000,
                stream=True
            )
            response = ""
            for chunk in stream:
                response += chunk.choices[0].delta.content or ""
            return response
        except Exception as e:
            print(f"Error: {str(e)}. Retrying... ({retries} attempts left)")
            retries -= 1
            time.sleep(5)  # Wait for 5 seconds before retrying

    return "Failed to generate documentation after multiple attempts."

def process_files_bottom_up(G, tree_structure):
    leaves = get_leaves(G)
    processed = set()
    context = ""
    master_doc = ""
    file_count = 1

    with tqdm(total=len(G.nodes()), desc="Processing files") as pbar:
        while leaves:
            new_leaves = []
            for leaf in leaves:
                if leaf in processed:
                    continue
                try:
                    node_data = G.nodes[leaf]
                    node_type = node_data.get('type', 'unknown')
                    if node_type == 'file':
                        with open(leaf, 'r', errors='ignore') as file:
                            content = file.read()
                        doc = get_documentation(content, tree_structure, context)
                        file_header = f"\n\n{'#' * 20}\n# File {file_count}: {os.path.basename(leaf)}\n{'#' * 20}\n\n"
                        context += f"{file_header}{doc}\n"
                        file_count += 1
                    elif node_type == 'directory':
                        context += f"\n\nDirectory: {leaf}\n"
                    else:
                        context += f"\n\nUnknown node type: {leaf}\n"
                    processed.add(leaf)
                except Exception as e:
                    print(f"Error processing {node_type} {leaf}: {str(e)}")
                finally:
                    parent = next(G.predecessors(leaf), None)
                    G.remove_node(leaf)
                    if parent and parent not in processed and parent not in new_leaves:
                        new_leaves.append(parent)
                pbar.update(1)
            leaves = new_leaves

    chunk_size = 4000
    context_chunks = textwrap.wrap(context, chunk_size)
    master_doc_chunks = []
    for chunk in context_chunks:
        doc_chunk = get_documentation("", tree_structure, chunk)
        master_doc_chunks.append(doc_chunk)
    master_doc = "\n\n".join(master_doc_chunks)
    return master_doc

def save_to_docx(text, file_path):
    doc = Document()
    doc.add_paragraph(text)
    doc.save(file_path)

def process_zip_file(zip_file_path, output_bucket, output_key):
    temp_dir = '/tmp/extracted_files'
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)
    print(f"Files extracted to {temp_dir}")

    file_tree = create_file_tree(temp_dir)
    root = temp_dir
    tree_data = graph_to_dict(file_tree, root)
    print("File Tree Structure:")
    print(json.dumps(tree_data, indent=2))

    docs = process_files_bottom_up(file_tree, tree_data)

    # Save documentation to a temporary file
    temp_file_path = '/tmp/final_documentation.docx'
    save_to_docx(docs, temp_file_path)

    # Upload the file to S3
    s3.upload_file(temp_file_path, output_bucket, output_key)
    print(f"Final documentation uploaded to s3://{output_bucket}/{output_key}")

    # Clean up temporary files
    os.remove(temp_file_path)
    shutil.rmtree(temp_dir)

    return docs

def lambda_handler(event, context):
    input_bucket = event['input_bucket']
    input_key = event['input_key']
    output_bucket = event['output_bucket']
    output_key = event['output_key']

    # Download the zip file from S3
    zip_file_path = '/tmp/input.zip'
    s3.download_file(input_bucket, input_key, zip_file_path)

    # Process the zip file
    docs = process_zip_file(zip_file_path, output_bucket, output_key)

    return {
        'statusCode': 200,
        'body': json.dumps('Documentation generated successfully')
    }

# Example usage
if __name__ == "__main__":
    event = {
        'input_bucket': 'database-chat-bot',
        'input_key': 'vigor-dev-goandev.zip',
        'output_bucket': 'database-chat-bot-output',
        'output_key': 'final_documentation.docx'
    }
    
    result = lambda_handler(event, None)
    print(result)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Files extracted to /tmp/extracted_files
File Tree Structure:
{
  "path": "/tmp/extracted_files",
  "type": "directory",
  "children": [
    {
      "path": "/tmp/extracted_files/vigor-dev-goandev",
      "type": "directory",
      "children": [
        {
          "path": "/tmp/extracted_files/vigor-dev-goandev/docker-compose.yml",
          "type": "file"
        },
        {
          "path": "/tmp/extracted_files/vigor-dev-goandev/Dockerfile",
          "type": "file"
        },
        {
          "path": "/tmp/extracted_files/vigor-dev-goandev/Makefile",
          "type": "file"
        },
        {
          "path": "/tmp/extracted_files/vigor-dev-goandev/README.md",
          "type": "file"
        },
        {
          "path": "/tmp/extracted_files/vigor-dev-goandev/Vagrantfile",
          "type": "file"
        },
        {
          "path": "/tmp/extracted_files/vigor-dev-goandev/app",
          "type": "directory",
          "children": [
            {
              "path":

Processing files: 100%|██████████| 1414/1414 [35:05<00:00,  1.49s/it]


Final documentation uploaded to s3://database-chat-bot-output/final_documentation.docx
{'statusCode': 200, 'body': '"Documentation generated successfully"'}
