In [1]:
import os
import sqlite3
import pandas as pd
import json
import re  # Import the re module for regular expressions

# Directory that holds the raw log files handled by this script
files_directory = r'C:\Users\Wren\Documents\SW Fall 2024\researchers'

# Participant labels whose log files are removed entirely
participant_labels_to_delete = {
    'evgeny',
    'jmgodber',
    'csaxman',
    'sas14',
}

# Accumulators filled in later by the script:
#   files_to_delete - paths selected for removal
#   remaining_files - paths left over after removal
files_to_delete = list()
remaining_files = list()

# Ensure a file on disk carries the .sqlite extension
def add_sqlite_extension(file_path):
    """Rename *file_path* so that it ends in ``.sqlite`` and return the path.

    A path that already ends in ``.sqlite`` is returned unchanged and nothing
    on disk is touched. Otherwise the file is renamed on disk, the rename is
    printed, and the new path is returned.
    """
    suffix = '.sqlite'
    if file_path.endswith(suffix):
        return file_path

    renamed_path = f"{file_path}{suffix}"
    os.rename(file_path, renamed_path)
    print(f"Renamed '{file_path}' to '{renamed_path}'")
    return renamed_path

# Function to safely extract JSON data
def safe_from_json(json_string):
    """Decode *json_string* and return the resulting object, or None on failure.

    Any input that ``json.loads`` cannot decode is reported with a printed
    message and yields ``None`` instead of an exception. That includes
    malformed JSON text as well as values that are not text at all (for
    example ``None`` coming from a NULL database column).
    """
    try:
        return json.loads(json_string)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass; TypeError is what
        # json.loads raises for non-str/bytes input such as None.
        print("Error decoding JSON.")
        return None

# Function to extract text from JSON
def extract_text_from_json(json_data):
    """Collect every string stored under a ``'text'`` key and join with newlines.

    The structure is walked depth-first through nested dicts and lists, in
    iteration order. Only string values are collected; a ``'text'`` key that
    holds a dict, list, number or null is not collected itself (joining it
    would raise ``TypeError``) but is still descended into, so nested
    ``'text'`` strings inside it are found.

    Returns an empty string when nothing is found, including when
    *json_data* is ``None`` or a scalar.
    """
    extracted_text = []

    def extract_text(obj):
        if isinstance(obj, dict):
            for k, v in obj.items():
                if k == 'text' and isinstance(v, str):
                    extracted_text.append(v)
                # Always recurse: the value may itself contain 'text' keys.
                extract_text(v)
        elif isinstance(obj, list):
            for item in obj:
                extract_text(item)

    extract_text(json_data)
    return '\n'.join(extracted_text)

# One record per log file that yielded both metadata and a final document.
# The DataFrame is built once after the loop; concatenating inside the loop
# copies the whole frame on every iteration.
records = []

# Process each file in the directory
for filename in os.listdir(files_directory):
    file_path = os.path.join(files_directory, filename)
    if not os.path.isfile(file_path):
        continue

    # Rename the file by adding .sqlite extension if needed
    new_file_path = add_sqlite_extension(file_path)

    conn = None
    try:
        # Query through the sqlite3 cursor directly so that a missing table or
        # a file that is not a database surfaces as sqlite3.Error and is
        # handled below (pandas.read_sql_query raises its own DatabaseError,
        # which the sqlite3.Error handler would not catch).
        conn = sqlite3.connect(new_file_path)
        c = conn.cursor()

        # Query the 'props' table to get the 'value' for 'metadata'
        c.execute("SELECT value FROM props WHERE name = 'metadata'")
        props_row = c.fetchone()  # Assuming only one entry for metadata
        if props_row is None:
            print("No metadata found in 'props' table.")
            continue

        # Extract the participant from the metadata; a NULL value or JSON
        # that is not an object cannot provide one.
        metadata_value = props_row[0]
        metadata = None if metadata_value is None else safe_from_json(metadata_value)
        if not isinstance(metadata, dict):
            print("Error extracting participant from metadata.")
            continue
        participant = metadata.get('participant', 'Unknown')

        # Query the 'document' table to get the 'final' JSON information
        c.execute("SELECT json FROM document WHERE kind = 'final'")
        document_row = c.fetchone()  # Assuming only one entry for final
        if document_row is None:
            print("No final JSON information found in 'document' table.")
            continue

        # An undecodable or NULL document contributes empty text.
        final_json = document_row[0]
        final_document = None if final_json is None else safe_from_json(final_json)
        extracted_text = extract_text_from_json(final_document)

        records.append({
            'file_path': new_file_path,
            'participant': participant,
            'extracted_text': extracted_text,
        })

    except sqlite3.Error as e:
        print(f"Error accessing SQLite file {new_file_path}: {e}")
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file {new_file_path}")
    finally:
        # Always release the handle: an open connection keeps the file locked
        # on Windows, which would block the later delete/rename steps.
        if conn is not None:
            conn.close()

# DataFrame combining the results; the columns exist even when no file
# produced a record.
combined_df = pd.DataFrame(
    records, columns=['file_path', 'participant', 'extracted_text']
)

# Select files to delete: everything belonging to an excluded participant,
# plus every file that is not the longest-text file of its participant.
# All column access sits behind the emptiness check, because a DataFrame
# built from zero records may have no 'participant' column at all.
if not combined_df.empty:
    excluded_rows = combined_df['participant'].isin(participant_labels_to_delete)
    files_to_delete.extend(combined_df.loc[excluded_rows, 'file_path'].tolist())

    # Retain only the file with the most text for each participant
    combined_df['text_length'] = combined_df['extracted_text'].apply(len)
    combined_df = combined_df.loc[combined_df.groupby('participant')['text_length'].idxmax()]

    # One directory scan is enough: the keep-set does not change while scanning.
    # NOTE(review): this marks EVERY regular file in the directory that is not
    # a kept file, including files that were skipped earlier (no metadata, no
    # final document, not a database). Confirm that is intended.
    file_paths_to_keep = set(combined_df['file_path'])
    for other_file_path in os.listdir(files_directory):
        full_path = os.path.join(files_directory, other_file_path)
        if os.path.isfile(full_path) and full_path not in file_paths_to_keep:
            files_to_delete.append(full_path)

# Remove duplicates from files_to_delete
files_to_delete = list(set(files_to_delete))

# Delete the identified files; a failure on one file does not stop the rest
for file_path in files_to_delete:
    try:
        os.remove(file_path)
        print(f"Deleted file: {file_path}")
    except OSError:  # IOError is an alias of OSError in Python 3
        print(f"Error deleting file: {file_path}")

# Perform additional SQL operations on remaining files
for filename in os.listdir(files_directory):
    file_path = os.path.join(files_directory, filename)
    if not os.path.isfile(file_path):
        continue

    new_file_path = add_sqlite_extension(file_path)

    conn = None
    try:
        conn = sqlite3.connect(new_file_path)
        c = conn.cursor()

        # Empty a table in the log
        c.execute("DELETE FROM props;")

        # Clear tabs: every 'focus' row in act whose json contains
        # is_focused":false is overwritten with exactly {"is_focused":false}
        c.execute("UPDATE act SET json='{\"is_focused\":false}' WHERE k='focus' AND json LIKE '%is_focused\":false%'")

        # Find the lowest value of t across the act, key and eye tables
        c.execute("SELECT MIN(t) FROM (SELECT MIN(t) AS t FROM act UNION SELECT MIN(t) AS t FROM key UNION SELECT MIN(t) AS t FROM eye);")
        row = c.fetchone()
        # An aggregate query always returns a row; the value inside it is NULL
        # (None) when all three tables are empty, so test the value itself.
        lowest_t = row[0] if row is not None else None
        if lowest_t is not None:
            # Subtract the lowest t from all t values in each table
            for table in ('act', 'key', 'eye'):
                c.execute(f"UPDATE {table} SET t = t - ? WHERE t >= ?;", (lowest_t, lowest_t))

        conn.commit()  # Commit all changes for this file together

        # Rebuild the database file so the deleted/overwritten rows do not
        # remain readable in free pages. VACUUM cannot run inside a
        # transaction, hence after the commit.
        conn.execute("VACUUM")

    except sqlite3.Error as e:
        # Uncommitted changes are discarded when the connection is closed below.
        print(f"Error accessing SQLite file {new_file_path}: {e}")
    finally:
        # Always release the handle so the file is not left locked for the
        # rename step that follows.
        if conn is not None:
            conn.close()

# Rename log files after processing: the middle segment of
# '<digits>-<segment>-<rest>.sqlite' is replaced with 'anon' and the .sqlite
# extension is dropped.
anon_pattern = re.compile(r"(\d+-)\w+(-\S+)\.sqlite")

for filename in os.listdir(files_directory):
    file_path = os.path.join(files_directory, filename)
    if not os.path.isfile(file_path):
        continue

    current_name = file_path
    new_name = anon_pattern.sub(r"\1anon\2", filename)
    if new_name == filename:
        # Pattern did not apply; there is nothing to rename or report.
        continue

    new_file_path = os.path.join(files_directory, new_name)
    if os.path.exists(new_file_path):
        # os.rename silently replaces an existing file on POSIX; refuse so two
        # logs that map to the same anonymised name do not overwrite each other.
        print(f"Skipping {current_name} - target '{new_file_path}' already exists")
        continue

    try:
        os.rename(current_name, new_file_path)
        print(f"Renamed '{current_name}' to '{new_file_path}'")
    except OSError as e:
        print(f"Skipping {current_name} - unable to rename file ({e})")

print("All specified operations are complete.")


Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mfzclu-JnCvY9DYcL' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mfzclu-JnCvY9DYcL.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mfzia8-iGx0jYXgjL' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mfzia8-iGx0jYXgjL.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0poy-R587WzLcM3' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0poy-R587WzLcM3.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0q7l-HXbpS5U8zs' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0q7l-HXbpS5U8zs.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0q84-kZtBQCEoFl' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0q84-kZtBQCEoFl.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0qlm-to9oAU6KMj' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mg0qlm

Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7hw-79yknG0SA0' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7hw-79yknG0SA0.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7k5-kVUAZGustI' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7k5-kVUAZGustI.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7wy-lSwiksgleg' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7wy-lSwiksgleg.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7yc-vuBaQsW7r8' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme7yc-vuBaQsW7r8.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme8c8-lSR1nXmm17' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mme8c8-lSR1nXmm17.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mmeadi-W4rvSzyxlp' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mmeadi

Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pbjaw2-woLzF2lu95' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pbjaw2-woLzF2lu95.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe5zah-4hRPWvCYCO' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe5zah-4hRPWvCYCO.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe6po9-ExKZSmQpv1' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe6po9-ExKZSmQpv1.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe76b9-KaygqXjU7T' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe76b9-KaygqXjU7T.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe76rn-Gv3WkYH5yW' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe76rn-Gv3WkYH5yW.sqlite'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe770q-b9FCdU4Avn' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0pe770q

Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mh0jzr-Ja2YNb1uiJ.sqlite' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-anon-Ja2YNb1uiJ'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mh0k0f-CJIZaV9iYE.sqlite' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-anon-CJIZaV9iYE'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mh0k71-dXSvK2udIw.sqlite' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-anon-dXSvK2udIw'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mh0kps-n0mNUD5XSD.sqlite' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-anon-n0mNUD5XSD'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mh0l20-doANetIZjL.sqlite' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-anon-doANetIZjL'
Renamed 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-m0mh0l8q-qsuSFJiok9.sqlite' to 'C:\Users\Wren\Documents\SW Fall 2024\researchers\7300-anon-qsuSFJiok9'
Rena