In [11]:
import os
import sqlite3
import pandas as pd
import json
import re
from docx import Document
import glob

# Set the path to the SQLite files
folder_path = "C:/Users/Wren/Documents/SW Fall 2024/students"

# Extensions of the files this script itself writes into folder_path (the
# per-participant .docx exports and the combined .csv). They must never be
# treated as databases, otherwise a re-run would rename them to *.sqlite.
GENERATED_EXTENSIONS = ('.docx', '.csv')


# Function to rename files by adding .sqlite extension
def add_sqlite_extension(file_path):
    """Ensure *file_path* ends with '.sqlite', renaming the file on disk if not.

    Args:
        file_path: Path of an existing file.

    Returns:
        The path the file has after the call: unchanged when it already ends
        with '.sqlite', otherwise the original path with '.sqlite' appended.

    Raises:
        OSError: If os.rename fails (e.g. the source file does not exist).
    """
    if not file_path.endswith('.sqlite'):
        new_file_path = file_path + '.sqlite'
        os.rename(file_path, new_file_path)
        print(f"Renamed '{file_path}' to '{new_file_path}'")
        return new_file_path
    return file_path


def list_database_candidates(folder):
    """Return the regular files directly inside *folder* that may be databases.

    Sub-directories and files carrying one of GENERATED_EXTENSIONS are
    skipped, so only database candidates reach add_sqlite_extension.
    """
    return [
        path
        for path in glob.glob(os.path.join(folder, '*'))
        if os.path.isfile(path)
        and not path.lower().endswith(GENERATED_EXTENSIONS)
    ]


# List the candidate files in the directory and add .sqlite extension if needed.
# This runs after the function definitions above so the names exist when called.
all_files = list_database_candidates(folder_path)
sqlite_files = [add_sqlite_extension(file) for file in all_files]

# Function to safely parse JSON with error handling
def safe_from_json(json_text):
    """Parse *json_text* as JSON, returning None instead of raising on bad input.

    Args:
        json_text: The JSON document to decode (anything json.loads accepts).

    Returns:
        The decoded Python object, or None when the input is malformed JSON
        or is not a str/bytes/bytearray at all (for example None coming from
        a NULL database column).
    """
    try:
        return json.loads(json_text)
    # json.loads raises TypeError (not JSONDecodeError) for non-text input
    # such as None, so both are handled to keep this function "safe".
    except (json.JSONDecodeError, TypeError) as e:
        print(f"Error parsing JSON: {e}")
        return None

# Function to extract text from JSON
def extract_text_from_json(json_data):
    """Return the essay text stored in a JSON document, or "" on any problem.

    The document is decoded with safe_from_json and must contain a
    'paragraphs' entry. Dict paragraphs are scanned in order; once one whose
    'start_tab' equals "Your Essay" has been seen, the 'text' of every later
    dict paragraph is collected (the marker paragraph itself is not). The
    collected pieces are joined with newlines and stripped.

    Args:
        json_data: JSON text of the document.

    Returns:
        The joined essay text, or an empty string when the JSON cannot be
        parsed, has no 'paragraphs' key, or any other error occurs.
    """
    try:
        document = safe_from_json(json_data)
        if document is None:
            return ""

        if 'paragraphs' not in document:
            print("No 'paragraphs' key found in JSON data.")
            return ""

        collected = []
        collecting = False  # becomes True once the "Your Essay" marker is seen

        for entry in document['paragraphs']:
            # Non-dict paragraphs carry neither a marker nor text to collect
            if not isinstance(entry, dict):
                continue

            if collecting:
                if 'text' in entry:
                    collected.append(entry['text'])
            elif entry.get('start_tab') == "Your Essay":
                collecting = True

        return '\n'.join(collected).strip()

    except Exception as e:
        print(f"Error extracting text from JSON: {e}")
        return ""

# Function to extract data from SQLite
def extract_data_from_db(db_file_path):
    """Read participant, token and essay text from one SQLite file.

    The 'props' table supplies the 'metadata' row (JSON whose 'participant'
    field is used, defaulting to 'Unknown') and the 'token' row; the
    'document' table supplies the JSON of the row with kind = 'final', which
    is passed to extract_text_from_json. Only the first row of each query
    result is used.

    Args:
        db_file_path: Path of the SQLite file to read.

    Returns:
        A one-row DataFrame with columns 'participant', 'token' and
        'extracted_text', or None when the file is rejected, a required row
        is missing, the metadata is not valid JSON, or any error occurs.
    """
    print(f"Processing file: {db_file_path}")

    try:
        # NOTE(review): is_valid_sqlite is not defined in this cell; it is
        # presumably provided elsewhere in the notebook - confirm.
        if not is_valid_sqlite(db_file_path):
            print(f"Invalid SQLite file: {db_file_path}")
            return None

        connection = sqlite3.connect(db_file_path)

        try:
            # Participant name comes from the JSON stored under 'metadata'
            metadata_rows = pd.read_sql_query(
                "SELECT value FROM props WHERE name = 'metadata'", connection
            )
            if metadata_rows.empty:
                print(f"No metadata found in {db_file_path}.")
                return None

            metadata = safe_from_json(metadata_rows['value'][0])
            if metadata is None:
                print(f"Error extracting participant from metadata in {db_file_path}.")
                return None
            participant = metadata.get('participant', 'Unknown')

            # Token is stored as a plain value and used as-is
            token_rows = pd.read_sql_query(
                "SELECT value FROM props WHERE name = 'token'", connection
            )
            if token_rows.empty:
                print(f"No token found in {db_file_path}.")
                return None
            token = token_rows['value'][0]
            print(f"Raw token_value: {token}")

            # Essay text lives in the JSON of the 'final' document row
            final_rows = pd.read_sql_query(
                "SELECT json FROM document WHERE kind = 'final'", connection
            )
            if final_rows.empty:
                print(f"No final JSON information found in {db_file_path}.")
                return None
            essay_text = extract_text_from_json(final_rows['json'][0])

            record = {
                'participant': [participant],
                'token': [token],
                'extracted_text': [essay_text],
            }
            return pd.DataFrame(record)

        except Exception as e:
            print(f"Error during extraction from {db_file_path}: {e}")
            return None

        finally:
            # Runs on every exit path above, including the early returns
            connection.close()

    except Exception as e:
        print(f"Failed to process file {db_file_path}: {e}")
        return None

# Combine data from all files into a single DataFrame.
# The per-file frames are collected first and concatenated once, rather than
# re-copying the accumulated DataFrame on every iteration.
frames = []
for file in sqlite_files:
    # Entries of sqlite_files already include folder_path (they come from a
    # glob on that folder), so they are passed through without re-joining.
    data_df = extract_data_from_db(file)
    if data_df is not None:
        frames.append(data_df)

# Remove specified participants
excluded_participants = {'evgeny', 'jmgodber', 'sas14', 'csaxman'}
if frames:
    combined_df = pd.concat(frames, ignore_index=True)
    combined_df = combined_df[~combined_df['participant'].isin(excluded_participants)]
else:
    combined_df = pd.DataFrame(columns=['participant', 'token', 'extracted_text'])

if combined_df.empty:
    # Nothing was extracted (or everything was excluded): skip the export
    # instead of failing on the column lookups and idxmax below.
    print("No participant data extracted; nothing to export.")
    combined_df = pd.DataFrame(columns=['participant', 'token', 'address', 'file'])
else:
    # Keep only the entry with the most text for each participant
    combined_df['text_length'] = combined_df['extracted_text'].apply(len)
    combined_df = combined_df.loc[combined_df.groupby('participant')['text_length'].idxmax()]

    # Drop the 'text_length' column as it's no longer needed
    combined_df = combined_df.drop(columns=['text_length'])

    # Add 'address' column with email addresses
    combined_df['address'] = combined_df['participant'] + "@iastate.edu"

    # Add 'file' column with student text file name. The participant name is
    # sanitized for file name usage here, so the name recorded in the CSV is
    # exactly the name of the .docx written below.
    combined_df['file'] = combined_df['participant'].map(
        lambda name: re.sub(r'[<>:"/\\|?*]', '', name)
    ) + ".docx"

    # Export each extracted_text to a .docx file
    for file_name, essay_text in zip(combined_df['file'], combined_df['extracted_text']):
        # Create a Word document
        doc = Document()
        doc.add_paragraph(essay_text)

        # Name the output file with the participant's (sanitized) name
        output_path = os.path.join(folder_path, file_name)
        doc.save(output_path)
        print(f"Saved {output_path}")

    # Drop the 'extracted_text' column as it's no longer needed
    combined_df = combined_df.drop(columns=['extracted_text'])

    # Save the updated DataFrame with the new 'address' column to CSV
    csv_output_file = os.path.join(folder_path, "combined_output_with_address.csv")
    combined_df.to_csv(csv_output_file, index=False)
    print(f"Combined file written to: {csv_output_file}")


Renamed 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mfzclu-JnCvY9DYcL' to 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mfzclu-JnCvY9DYcL.sqlite'
Renamed 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mfzia8-iGx0jYXgjL' to 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mfzia8-iGx0jYXgjL.sqlite'
Renamed 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0poy-R587WzLcM3' to 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0poy-R587WzLcM3.sqlite'
Renamed 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0q7l-HXbpS5U8zs' to 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0q7l-HXbpS5U8zs.sqlite'
Renamed 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0q84-kZtBQCEoFl' to 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0q84-kZtBQCEoFl.sqlite'
Renamed 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0qlm-to9oAU6KMj' to 'C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mg0qlm-to9oAU6KMj.sqlite'
Renamed 'C:/User

Raw token_value: 7300-m0mgiqm1-gLjE3l32c7
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mgir11-ySXBz5Zxvr.sqlite
Raw token_value: 7300-m0mgir11-ySXBz5Zxvr
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mgirla-uVsCIMy40Y.sqlite
Raw token_value: 7300-m0mgirla-uVsCIMy40Y
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mgissd-OgHq9tDPez.sqlite
Raw token_value: 7300-m0mgissd-OgHq9tDPez
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mgit0e-WlhsqFcyoP.sqlite
Raw token_value: 7300-m0mgit0e-WlhsqFcyoP
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mgit5r-E7O8NQy8qD.sqlite
Raw token_value: 7300-m0mgit5r-E7O8NQy8qD
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mh0jzr-Ja2YNb1uiJ.sqlite
Raw token_value: 7300-m0mh0jzr-Ja2YNb1uiJ
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0mh0k0f-CJIZaV9iYE.sqlite
Raw token_value: 7300-m0mh0k0f-CJIZaV9iYE

Raw token_value: 7300-m0pe7766-gn949hDWfV
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0pe77d3-uJQE669eiP.sqlite
Raw token_value: 7300-m0pe77d3-uJQE669eiP
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0pe77da-8Q5e9HDanI.sqlite
Raw token_value: 7300-m0pe77da-8Q5e9HDanI
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0pe77sz-1B0vZR8zmW.sqlite
Raw token_value: 7300-m0pe77sz-1B0vZR8zmW
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0phemz4-Zz7q9RsSlB.sqlite
Raw token_value: 7300-m0phemz4-Zz7q9RsSlB
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0phen21-XcAzMMOEBp.sqlite
Raw token_value: 7300-m0phen21-XcAzMMOEBp
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0phenmm-E97SH4JQST.sqlite
Raw token_value: 7300-m0phenmm-E97SH4JQST
Processing file: C:/Users/Wren/Documents/SW Fall 2024/students\7300-m0pheofg-bhZyRBeXbU.sqlite
Raw token_value: 7300-m0pheofg-bhZyRBeXbU

Saved C:/Users/Wren/Documents/SW Fall 2024/students\ryleel.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\salbenav.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\saraho.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\sbehn.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\sophiamq.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\srs31.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\step.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\takniss.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\vikh06.docx
Saved C:/Users/Wren/Documents/SW Fall 2024/students\woo79447.docx
Combined file written to: C:/Users/Wren/Documents/SW Fall 2024/students\combined_output_with_address.csv
