<a href="https://colab.research.google.com/github/Tm-ui/ImagePromptAnalysis/blob/main/Midjourney_Scrape_A_(HTML_Parsing_and_Data_Extraction).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Documentation:
# This notebook extracts data from a Midjourney chat exported as an HTML file, while anonymizing sensitive information
# such as user IDs, mentions, and URLs so that the data can be analyzed without compromising privacy.



In [None]:
# Import necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
import re
import uuid





In [None]:
# Parse an exported chat log (HTML) and build an anonymized DataFrame with one row
# per 'chatlog__message' element. Columns: user_id, timestamp, message_prompt,
# image_url, mentions.

# Path of the exported chat log (this example assumes that the file is named
# 'Midjourney_example.html' in the same directory).
HTML_PATH = 'Midjourney_example.html'

# Patterns are compiled once here instead of being rebuilt for every message.
MENTION_PATTERN = re.compile(r'@([^\s]+)')   # '@' followed by non-whitespace characters
URL_PATTERN = re.compile(r'https?://\S+')    # http(s) links embedded in the message text
URL_PLACEHOLDER = 'url_redacted'

with open(HTML_PATH, 'r', encoding='utf-8') as f:
    html_string = f.read()

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_string, 'html.parser')

# Parallel lists, one entry per message, that become the DataFrame columns
user_ids = []
timestamps = []
message_prompts = []
image_urls = []
mentions = []

# Mentioned name -> pseudonym ('user_1', 'user_2', ...), stable across messages
users = {}
user_count = 0

# Real author id -> random UUID, so that all messages written by the same author
# share one pseudonymous id and can still be grouped during analysis.
# This mapping contains the real ids: keep it in memory only, never save it.
author_pseudonyms = {}

# Loop through each message in the chat log
for message in soup.find_all('div', class_='chatlog__message'):
    # Pseudonymize the author id (None when the author element or attribute is absent)
    user_elem = message.find('span', class_='chatlog__author')
    if user_elem is not None and 'data-user-id' in user_elem.attrs:
        real_user_id = user_elem['data-user-id']
        if real_user_id not in author_pseudonyms:
            author_pseudonyms[real_user_id] = str(uuid.uuid4())
        user_id = author_pseudonyms[real_user_id]
    else:
        user_id = None
    user_ids.append(user_id)

    # Replace actual timestamps with a fixed placeholder; only presence is recorded
    timestamp_elem = message.find('span', class_='chatlog__timestamp')
    timestamps.append('anonymized_timestamp' if timestamp_elem is not None else None)

    # Extract the message text, then redact URLs and pseudonymize '@' mentions
    message_elem = message.find('span', class_='chatlog__markdown-preserve')
    if message_elem is not None:
        message_prompt = message_elem.text.strip()
        # Redact links first: they can identify users or images, and a URL may
        # itself contain '@', which must not be mistaken for a mention.
        message_prompt = URL_PATTERN.sub(URL_PLACEHOLDER, message_prompt)
        pseudonyms = []
        for mention in MENTION_PATTERN.findall(message_prompt):
            if mention not in users:
                user_count += 1
                users[mention] = f'user_{user_count}'
            pseudonyms.append(users[mention])
        mentions.append(pseudonyms)
        # Remove '@' mentions from the prompt and trim the whitespace they leave
        # behind at either end of the text.
        message_prompt = MENTION_PATTERN.sub('', message_prompt).strip()
    else:
        message_prompt = None
        mentions.append(None)
    message_prompts.append(message_prompt)

    # Record only whether an image URL was present, never the URL itself.
    # NOTE(review): the element queried is the one with class 'chatlog__avatar';
    # confirm this (rather than a message attachment) is the intended source
    # for the 'image_url' column.
    image_elem = message.find('img', class_='chatlog__avatar')
    if image_elem is not None and 'src' in image_elem.attrs:
        image_url = URL_PLACEHOLDER
    else:
        image_url = None
    image_urls.append(image_url)

# Create a DataFrame from the extracted and anonymized data
data = {
    'user_id': user_ids,
    'timestamp': timestamps,
    'message_prompt': message_prompts,
    'image_url': image_urls,
    'mentions': mentions
}
df = pd.DataFrame(data)

# Preview the first rows (rich notebook display) instead of printing the whole frame
df.head()




In [None]:
# Write the anonymized DataFrame to disk as CSV, omitting the row index.
# The output name is intentionally generic so it is not tied to the real data source.
output_csv = 'chatlog_anonymized.csv'
df.to_csv(output_csv, index=False)