In [None]:
import time
# Wall-clock timestamp (seconds since the epoch) taken when this cell runs.
# NOTE(review): presumably subtracted from a later time.time() to report total runtime — confirm.
start = time.time()

In [3]:
# RSD
# Configuration for the RSD core. The SweRV cell below assigns the very same
# names, so whichever configuration cell ran last decides which core is processed.
name_core = "RSD"

# Root directory walked by explore_and_populate() to fill the SQLite databases.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_sql_source = '/home/wezh/cores/PM-LLM/Journal/Doc-rsd/SQL-source-cpm'
# Directory names pruned from the walk (compared against the bare directory name).
exclude_dirs = {'obj_dir', 'snapshots', '.git', 'RV32I', 'RV32IM','testbench'}
# Individual file paths intended for exclusion; empty means nothing is excluded.
exclude_files = [] 
# SQLite database files opened for documentation files and source files respectively.
sql_doc = 'file-rsd-doc.db'
sql_src = 'file-rsd-src.db'

# NOTE(review): the names below are not used in the visible part of the notebook;
# presumably ChromaDB locations / column labels / output CSV for later cells — confirm.
chromaDB_doc = "./chromaDB-rsd-doc"
chromaDB_src = "./chromaDB-rsd-src"
column_doc = 'rsd-doc'
column_src = 'rsd-src'
csv_final = 'RTL-parameters-final-rsd.csv'

In [None]:
# SweRV
# Configuration for the SweRV core. It assigns the same names as the RSD cell
# above, so running this cell after that one switches the target core to SweRV.
name_core = "SweRV"

# Root directory walked by explore_and_populate() to fill the SQLite databases.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_sql_source = '/home/wezh/cores/PM-LLM/Journal/Doc-swerv/SQL-source-cpm'
# Directory names pruned from the walk (compared against the bare directory name).
exclude_dirs = {'obj_dir', 'snapshots', '.git', 'RV32I', 'RV32IM','testbench'}
# Individual file paths intended for exclusion; empty means nothing is excluded.
exclude_files = [] 
# SQLite database files opened for documentation files and source files respectively.
sql_doc = 'file-swerv-doc.db'
sql_src = 'file-swerv-src.db'

# NOTE(review): the names below are not used in the visible part of the notebook;
# presumably ChromaDB locations / column labels / output CSV for later cells — confirm.
chromaDB_doc = "./chromaDB-swerv-doc"
chromaDB_src = "./chromaDB-swerv-src"
column_doc = 'swerv-doc'
column_src = 'swerv-src'
csv_final = 'RTL-parameters-final-swerv.csv'

### --------- Create a SQL database containing src and doc ---------

In [7]:
import os
import sqlite3
import fitz  # PyMuPDF for PDFs
    
# Step 1: Database setup
# Two SQLite files are used: one for documentation, one for source code.
conn_docs = sqlite3.connect(sql_doc)
conn_src = sqlite3.connect(sql_src)
cursor_docs = conn_docs.cursor()
cursor_src = conn_src.cursor()

# Both databases share the same single-table schema, so the DDL is written once.
_CREATE_FILES_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS files (
        id INTEGER PRIMARY KEY,
        filename TEXT,
        filepath TEXT,
        filetype TEXT,
        content TEXT
    )
'''

# Create the table in the documentation database...
cursor_docs.execute(_CREATE_FILES_TABLE_SQL)
conn_docs.commit()

# ...and in the source database.
cursor_src.execute(_CREATE_FILES_TABLE_SQL)
conn_src.commit()

def clear_database():
    """Delete every row of the `files` table in both databases and commit.

    The documentation database is emptied first, then the source database.
    Operates on the module-level connections and cursors.
    """
    for db_conn, db_cursor in ((conn_docs, cursor_docs), (conn_src, cursor_src)):
        db_cursor.execute("DELETE FROM files")
        db_conn.commit()

# File Processing
def is_binary(file_path):
    """
    Heuristically decide whether a file is binary.

    Only the first 1024 bytes are inspected. The file counts as binary when
    that prefix contains any byte outside the "text" set: BEL, BS, TAB, LF,
    FF, CR, ESC and 0x20-0xFF except DEL (0x7f). An empty file is not binary.
    A file that cannot be opened or read is reported on stdout and treated
    as binary.
    """
    text_bytes = frozenset({7, 8, 9, 10, 12, 13, 27}) | (frozenset(range(0x20, 0x100)) - {0x7f})
    try:
        with open(file_path, 'rb') as handle:
            head = handle.read(1024)
    except Exception as e:
        print(f"Error checking if file is binary: {str(e)}")
        return True  # unreadable files are skipped by callers, same as binaries
    # Any byte outside the text set marks the file as binary.
    return any(byte not in text_bytes for byte in head)


# Step 2: Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to `fitz.open`.

    Returns:
        A single string made of each page's `get_text()` output; an empty
        string when the document has no pages.

    Raises:
        Whatever `fitz.open` or `page.get_text()` raises. The document is
        closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        # join() avoids the quadratic cost of repeated string concatenation.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the document handle, even if a page fails to extract.
        doc.close()

# Step 3: Function to read the content of a text file (source code or documentation)
def read_text_from_file(file_path):
    """Return the whole content of a text file, decoded as UTF-8.

    Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
    raising `UnicodeDecodeError`. A strict decode would make the caller drop
    an entire file because of, for example, one Latin-1 or Shift-JIS
    character in a comment.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()


# Step 4: Function to explore files and populate database
def _classify_file(filename):
    """Map a bare file name to `(filetype, is_doc)`, or None if it is not ingested.

    `is_doc` is True for files that belong in the documentation database and
    False for files that belong in the source database.
    """
    if filename.endswith('.md'):
        return 'Markdown', True
    if filename.endswith('.txt'):
        return 'text', True
    if filename.endswith(('.c', '.cpp')):
        return 'C/C++', False
    if filename.endswith('.h'):
        return 'Header', False
    # Perl modules/scripts, Makefiles, and extension-less files (scripts etc.).
    if filename.endswith(('.pm', '.pl', 'Makefile')) or '.' not in filename:
        return 'Perl/Makefile/Script', False
    if filename.endswith('.sv'):
        return 'SystemVerilog', False
    if filename.endswith('.v'):
        return 'Verilog', False
    return None


def explore_and_populate(root_folder):
    """Walk `root_folder` and insert every supported text file into the databases.

    Markdown and `.txt` files go to the documentation database; C/C++, headers,
    Perl/Makefile/script files and (System)Verilog go to the source database.
    Each row stores the file name, full path, file type label and content.

    Directories named in the module-level `exclude_dirs` are not descended
    into, and paths listed in the module-level `exclude_files` are skipped.
    Binary files (per `is_binary`) are skipped. A file that fails to be read
    or inserted is reported on stdout and does not stop the walk.

    Differences from the earlier implementation:
      * `.pl` files are now ingested (they were classified but never let
        through the extension filter).
      * `.sv`/`.v` files whose name merely contains "Makefile" are labelled
        by their extension instead of as 'Perl/Makefile/Script'.
      * `exclude_files` is honoured.
      * The cheap name filter runs before the binary sniff, and each database
        is committed once at the end instead of once per row.

    Args:
        root_folder: Directory to walk recursively.
    """
    insert_sql = '''
                        INSERT INTO files (filename, filepath, filetype, content)
                        VALUES (?, ?, ?, ?)
                    '''
    try:
        for root, dirs, files in os.walk(root_folder, topdown=True):
            # Prune in place so os.walk never descends into excluded directories.
            dirs[:] = [d for d in dirs if d not in exclude_dirs]
            for file in files:
                file_path = os.path.join(root, file)

                if file_path in exclude_files:
                    continue

                classification = _classify_file(file)
                if classification is None:
                    continue

                # Only open the file once we know its name is of interest.
                if is_binary(file_path):
                    continue

                filetype, is_doc = classification
                cursor = cursor_docs if is_doc else cursor_src
                try:
                    content = read_text_from_file(file_path)
                    cursor.execute(insert_sql, (file, file_path, filetype, content))
                except Exception as e:
                    # Best effort: report and carry on with the remaining files.
                    print(f"Failed to process {file_path}: {str(e)}")
    finally:
        # Persist whatever was inserted, even if the walk was interrupted.
        conn_docs.commit()
        conn_src.commit()

# Additional Functions for Querying and Showing Database Contents
def print_file_paths():
    """Print the stored `filepath` of every row, one per line.

    Rows from the source database are listed first, followed by rows from
    the documentation database.
    """
    src_rows = cursor_src.execute("SELECT filepath FROM files").fetchall()
    doc_rows = cursor_docs.execute("SELECT filepath FROM files").fetchall()

    # Each fetched row is a 1-tuple holding the path.
    for (stored_path,) in src_rows + doc_rows:
        print(stored_path)

def show_database_contents():
    """Dump the documentation database's `files` table to stdout.

    The complete list of rows is printed first, then each row again on its
    own line. Only the documentation database is shown.
    """
    doc_records = cursor_docs.execute("SELECT * FROM files").fetchall()
    print(doc_records)
    for record in doc_records:
        print(record)

# Step 5: Start the exploration and population process
# The work is wrapped in try/finally so both connections are released even
# when clearing or populating raises (otherwise the .db files stay open for
# the lifetime of the kernel).
try:
    clear_database()
    explore_and_populate(path_sql_source)
    # print_file_paths()  # Print all file paths
    # show_database_contents()  # Display all contents of the database
finally:
    # Step 6: Close database connection
    conn_docs.close()
    conn_src.close()

### --------- Create a SQL database containing src and doc ---------