In [1]:
import os
import sqlite3
import pandas as pd
import json
import re
from docx import Document
import glob

# Folder that holds the SQLite database files to process. The extracted .txt
# files are written into this same folder by the loop at the end of the cell.
folder_path = r"C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3"

# Ensure a database file carries the .sqlite extension, renaming it on disk if needed
def add_sqlite_extension(file_path):
    """Rename *file_path* to end in '.sqlite' unless it already does.

    Args:
        file_path: Path of the file to check, as a string.

    Returns:
        The path the file has after the call: the new '<file_path>.sqlite'
        path when a rename happened, otherwise *file_path* unchanged.
    """
    # Nothing to do when the extension is already present.
    if file_path.endswith('.sqlite'):
        return file_path

    renamed_path = f"{file_path}.sqlite"
    os.rename(file_path, renamed_path)
    print(f"Renamed '{file_path}' to '{renamed_path}'")
    return renamed_path

# Function to safely parse JSON with error handling
def safe_from_json(json_text):
    """Parse *json_text* as JSON, returning None instead of raising.

    Args:
        json_text: The JSON document as str, bytes or bytearray. Any other
            value (for example None from a NULL database column) is treated
            as unparseable.

    Returns:
        The decoded Python object, or None when the input cannot be parsed.
        The reason is printed in that case.
    """
    try:
        return json.loads(json_text)
    except (ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError and undecodable bytes;
        # TypeError is what json.loads raises for None and other non-text
        # input, which previously escaped this "safe" wrapper.
        print(f"Error parsing JSON: {e}")
        return None

# List the candidate database files in the directory and add the .sqlite
# extension where it is missing. Sub-directories are skipped, and so are the
# .txt files this cell writes into the same folder: without that filter a
# second run would rename every output to '<name>.txt.sqlite' and then try to
# open it as a database.
all_files = [
    path
    for path in glob.glob(os.path.join(folder_path, '*'))
    if os.path.isfile(path) and not path.lower().endswith('.txt')
]
sqlite_files = [add_sqlite_extension(file) for file in all_files]

# Pull the essay text out of one JSON document
def extract_text_from_json(json_data):
    """Return the text of the paragraphs that follow the "Your Essay" marker.

    The JSON is parsed with ``safe_from_json``. Its 'paragraphs' list is
    scanned for the first dict entry whose 'start_tab' equals "Your Essay";
    the 'text' value of every later dict entry that has one is collected.
    The marker entry's own text is not included, and non-dict entries are
    ignored.

    Args:
        json_data: The raw JSON document.

    Returns:
        The collected texts joined with newlines and stripped, or "" when the
        JSON cannot be parsed, has no 'paragraphs' key, or any other error
        occurs (a message is printed in those cases).
    """
    try:
        document = safe_from_json(json_data)
        if document is None:
            return ""

        if 'paragraphs' not in document:
            print("No 'paragraphs' key found in JSON data.")
            return ""

        # One shared iterator: the loop consumes entries up to and including
        # the marker, and the comprehension then sees only what comes after.
        dict_entries = (
            entry for entry in document['paragraphs'] if isinstance(entry, dict)
        )
        for entry in dict_entries:
            if entry.get('start_tab') == "Your Essay":
                break

        essay_parts = [entry['text'] for entry in dict_entries if 'text' in entry]
        return '\n'.join(essay_parts).strip()

    except Exception as e:
        print(f"Error extracting text from JSON: {e}")
        return ""

# Function to extract JSON data from SQLite database
def extract_data_from_db(db_path):
    """Return the essay text stored in the 'final' documents of one database.

    Runs ``SELECT json FROM document WHERE kind = 'final'`` against the SQLite
    file at *db_path*, passes each row's JSON to ``extract_text_from_json``
    and joins the results with newlines.

    Args:
        db_path: Path of the SQLite database file to read.

    Returns:
        The combined text with surrounding whitespace stripped, or "" when
        anything goes wrong (the error is printed, not raised, so one bad
        file does not stop a batch run).
    """
    # Query to select the JSON data from the 'final' entries
    query = "SELECT json FROM document WHERE kind = 'final'"

    try:
        conn = sqlite3.connect(db_path)
        try:
            rows = conn.execute(query).fetchall()
        finally:
            # Close even when the query fails (missing table, not a database,
            # ...). Previously the connection leaked on error, which keeps the
            # file locked on Windows. A plain `with conn:` would not help: it
            # only manages the transaction and does not close the connection.
            conn.close()

        # One newline between entries; join avoids repeated string copies.
        return '\n'.join(extract_text_from_json(row[0]) for row in rows).strip()

    except Exception as e:
        print(f"Error reading from database {db_path}: {e}")
        return ""

# Write extracted text out as a UTF-8 .txt file
def save_text_to_file(text, output_file_path):
    """Write *text* to *output_file_path*, replacing any existing file.

    Args:
        text: The string to write.
        output_file_path: Destination path; the file is opened in 'w' mode
            with UTF-8 encoding.

    Returns:
        None. Success and failure are both reported with a printed message;
        errors are not raised.
    """
    try:
        with open(output_file_path, 'w', encoding='utf-8') as handle:
            handle.write(text)
        print(f"Saved extracted text to {output_file_path}")
    except Exception as err:
        print(f"Error writing to file: {err}")

# Process each SQLite file and output a .txt file with the same name
for file in sqlite_files:
    # Drop only the trailing extension. str.replace('.sqlite', '') would also
    # delete a '.sqlite' that appears in the middle of the name.
    file_name = os.path.splitext(os.path.basename(file))[0]
    output_file_path = os.path.join(folder_path, f"{file_name}.txt")  # Output .txt file path

    all_text = extract_data_from_db(file)  # Extract text from the database
    if all_text:  # No .txt file is written for a database that yields no text
        save_text_to_file(all_text, output_file_path)  # Save text to .txt file


Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-0xp5jiBpdI' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-0xp5jiBpdI.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-1BbY8bTQLj' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-1BbY8bTQLj.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-28ehCEFhy0' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-28ehCEFhy0.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-2c8jjwnV85' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-2c8jjwnV85.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-3IkdMFfjhn' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-3IkdMFfjhn.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-4btQbTSLKG' to 'C:\Users\Wren\Docu

Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-uvPJbrZrzQ.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-V88kCZA8PX.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-vgdq6SUA9R.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-vWPi61W6pM.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-wb5NwbmFfO.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-Wy8adTzqjd.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-xSdBgQ3YhK.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-XzX7pRFZhh.txt
Saved extracted text to C:\Users\Wren\Documents\SW Fall 2024\researchers\Round 3\7300-anon-y7YmclPrp1.txt
Saved extracted text to C:\Users\Wren\Document