In [17]:
import pandas as pd
from pandas import json_normalize
import numpy as np
from langdetect import detect, LangDetectException
from langdetect.detector_factory import DetectorFactory

# Set a fixed seed for reproducibility
DetectorFactory.seed = 108

def detect_language(prompt):
    """Return the language code langdetect assigns to ``prompt``.

    ``None`` is returned when ``prompt`` is not a string, or when langdetect
    raises ``LangDetectException`` for the text.
    """
    # Anything that is not a string is never handed to the detector.
    if not isinstance(prompt, str):
        return None

    try:
        language = detect(str(prompt))
    except LangDetectException:
        return None
    return language


def load_and_normalize_json(file_path):
    """Read ``file_path`` as JSON and flatten the entries of its 'Sources' column.

    Prints the number of flattened rows and returns them as a DataFrame.
    """
    raw_frame = pd.read_json(file_path)
    sources = json_normalize(raw_frame['Sources'])

    row_count = len(sources)
    print(f"Initial data rows: {row_count}")
    return sources


def clean_and_transform_data(df):
    """Drop unused source columns, rename the rest, and normalise 'ChatgptSharing'.

    The frame is modified in place and the same object is returned. Any
    'ChatgptSharing' entry that is not a list is replaced by an empty list.
    """
    renamed_columns = {
        "Title": "issueTitle",
        "URL": "sourceURL",
        "Body": "issueDesc",
        "Number": "numPrompts",
    }

    def _list_or_empty(value):
        if isinstance(value, list):
            return value
        return []

    # errors='ignore' tolerates inputs that lack one of these columns.
    df.drop(columns=['Type', 'Author'], errors='ignore', inplace=True)
    df.rename(columns=renamed_columns, inplace=True)

    df['ChatgptSharing'] = df['ChatgptSharing'].apply(_list_or_empty)
    return df


def normalize_and_explode_chatgpt(df):
    """Explodes the 'ChatgptSharing' column and propagates relevant columns.

    Each element of every 'ChatgptSharing' list becomes one flattened row.
    A parent row whose list is empty still yields one placeholder row (all
    sharing fields NaN), matching what ``Series.explode`` produces.

    Every other column of ``df`` is copied onto the rows that originated from
    it. The copy is positional, so parents with an empty list cannot shift the
    values of the rows that follow them.

    Args:
        df: Frame with a 'ChatgptSharing' column holding lists of dicts.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the flattened
        sharing fields, a 'conversation_id' column and the propagated parent
        columns.
    """
    # Exploding over a fresh 0..n-1 index makes each exploded label equal to
    # the position of the parent row the element came from.
    exploded = df['ChatgptSharing'].reset_index(drop=True).explode()
    parent_positions = np.asarray(exploded.index, dtype=np.intp)

    # explode() leaves a NaN placeholder for an empty list; turn every non-dict
    # entry into {} so it flattens to an all-NaN row. Passing a plain list
    # (not a Series) gives the result a simple positional index.
    records = [item if isinstance(item, dict) else {} for item in exploded]
    chatgpt_sharing = json_normalize(records)
    print(f"After exploding 'ChatgptSharing': {len(chatgpt_sharing)} rows")

    if 'Title' in chatgpt_sharing.columns and 'Mention.MentionedURL' in chatgpt_sharing.columns:
        chatgpt_sharing['conversation_id'] = (
            chatgpt_sharing['Title'].astype(str)
            + '_'
            + chatgpt_sharing['Mention.MentionedURL'].astype(str)
        )
    else:
        chatgpt_sharing['conversation_id'] = "UNKNOWN"

    print(f"Unique conversation IDs: {chatgpt_sharing['conversation_id'].nunique()}")

    # Propagate columns from the parent DataFrame, row-for-row by position.
    columns_to_propagate = df.columns.difference(['ChatgptSharing'])
    for col in columns_to_propagate:
        chatgpt_sharing[col] = df[col].to_numpy()[parent_positions]

    return chatgpt_sharing


def clean_chatgpt_sharing(chatgpt_sharing):
    """Drop unused sharing columns, rename the kept ones, and normalise 'Conversations'.

    The frame is modified in place and the same object is returned. Any
    'Conversations' entry that is not a list is replaced by an empty list.
    """
    unused_columns = [
        'Status', 'DateOfConversation', 'DateOfAccess', 'NumberOfPrompts',
        'TokensOfPrompts', 'TokensOfAnswers', 'Model', 'HTMLContent', 'URL',
        'Mention.MentionedURL', 'Mention.MentionedAuthor',
    ]
    renamed_columns = {
        'Title': 'conversationTitle',
        'Mention.MentionedProperty': 'mentionProperty',
        'Mention.MentionedText': 'mentionText',
    }

    def _list_or_empty(value):
        if isinstance(value, list):
            return value
        return []

    # errors='ignore' tolerates inputs that lack some of these columns.
    chatgpt_sharing.drop(columns=unused_columns, errors='ignore', inplace=True)
    chatgpt_sharing.rename(columns=renamed_columns, inplace=True)

    chatgpt_sharing['Conversations'] = chatgpt_sharing['Conversations'].apply(_list_or_empty)
    return chatgpt_sharing


def normalize_and_explode_conversations(chatgpt_sharing):
    """Explodes the 'Conversations' column and propagates relevant columns.

    Each element of every 'Conversations' list becomes one flattened row.
    A parent row whose list is empty still yields one placeholder row (all
    conversation fields NaN), matching what ``Series.explode`` produces.

    Every other column of ``chatgpt_sharing`` (including 'conversation_id'
    when present) is copied onto the rows that originated from it. The copy is
    positional, so parents with an empty list cannot shift the values of the
    rows that follow them.

    Args:
        chatgpt_sharing: Frame with a 'Conversations' column holding lists of
            dicts.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the flattened
        conversation fields plus the propagated parent columns.
    """
    # Exploding over a fresh 0..n-1 index makes each exploded label equal to
    # the position of the parent row the element came from.
    exploded = chatgpt_sharing['Conversations'].reset_index(drop=True).explode()
    parent_positions = np.asarray(exploded.index, dtype=np.intp)

    # explode() leaves a NaN placeholder for an empty list; turn every non-dict
    # entry into {} so it flattens to an all-NaN row. Passing a plain list
    # (not a Series) gives the result a simple positional index.
    records = [turn if isinstance(turn, dict) else {} for turn in exploded]
    conversations = json_normalize(records)
    print(f"After exploding 'Conversations': {len(conversations)} rows")

    # 'conversation_id' is one of the parent columns, so this loop carries it
    # over as well; no separate assignment is needed.
    columns_to_propagate = chatgpt_sharing.columns.difference(['Conversations'])
    for col in columns_to_propagate:
        conversations[col] = chatgpt_sharing[col].to_numpy()[parent_positions]

    return conversations


def filter_english_conversations(conversations):
    """Filters conversations to keep only those in English.

    A 'Detected_Language' column is added to ``conversations`` in place by
    running ``detect_language`` on every 'Prompt'. Rows are then grouped by
    'conversation_id', and a conversation is kept when the most frequent
    detected language among its prompts is 'en'.

    Args:
        conversations: Frame with a 'Prompt' column and, normally, a
            'conversation_id' column.

    Returns:
        An independent copy of the rows that belong to English conversations.
        When 'conversation_id' is missing, an error line is printed and the
        unfiltered input frame is returned.
    """
    conversations['Detected_Language'] = conversations['Prompt'].apply(detect_language)
    print(f"Language detection applied. Null languages: {conversations['Detected_Language'].isnull().sum()}")

    if 'conversation_id' not in conversations.columns:
        print("Error: 'conversation_id' is missing from conversations dataset.")
        return conversations

    def _most_common_language(languages):
        # mode() ignores missing values, so a group with no detected language
        # yields an empty result and is mapped to None.
        modes = languages.mode()
        return modes.iloc[0] if not modes.empty else None

    mode_languages = conversations.groupby('conversation_id')['Detected_Language'].agg(_most_common_language)
    english_conversations = mode_languages[mode_languages == 'en'].index.tolist()

    # .copy() detaches the result from the input frame so that adding or
    # editing columns downstream does not raise SettingWithCopyWarning.
    conversations = conversations[conversations['conversation_id'].isin(english_conversations)].copy()
    print(f"English conversations: {len(conversations)} rows, {conversations['conversation_id'].nunique()} unique IDs")

    return conversations


def process_json(file_path):
    """Run every pipeline stage on ``file_path`` and return the English conversations.

    Stages, in order: load and flatten the sources, clean them, explode the
    ChatGPT sharings, clean those, explode the individual conversations, and
    finally keep only the English ones.
    """
    sources = clean_and_transform_data(load_and_normalize_json(file_path))
    sharings = clean_chatgpt_sharing(normalize_and_explode_chatgpt(sources))
    all_turns = normalize_and_explode_conversations(sharings)
    return filter_english_conversations(all_turns)


# Example usage: run the whole pipeline on one snapshot file.
# NOTE(review): the path is relative, so it depends on the directory the
# kernel was started in — confirm before re-running elsewhere.
file_path = '../../data/snapshot_20230831/20230831_061759_issue_sharings.json'
# process_json prints a progress line per stage and returns the English-only rows.
processed_conversations = process_json(file_path)

# Now you can use `processed_conversations` for further analysis or save it as a CSV
# processed_conversations.to_csv('cleaned_data.csv', index=False)


Initial data rows: 353
After exploding 'ChatgptSharing': 417 rows
Unique conversation IDs: 406
After exploding 'Conversations': 1780 rows
Language detection applied. Null languages: 39
English conversations: 1455 rows, 313 unique IDs


In [9]:
# Ad-hoc inspection of intermediate pipeline results.
# NOTE(review): in the pipeline cell, `chatgpt_sharing`, `conversations` and
# `mode_languages` are only local variables inside functions; nothing visible
# assigns them at notebook scope. This cell presumably relies on kernel state
# from an earlier run — confirm it survives Restart & Run All.
print(chatgpt_sharing['conversation_id'].value_counts())  
print(conversations['Prompt'].isnull().sum())  
print(conversations['Detected_Language'].value_counts())  
print(mode_languages)  


conversation_id
AHC Treatment Compounds_https://github.com/NCATSTranslator/Feedback/issues/198#issuecomment-1570587853                   6
AHC Treatment Compounds List_https://github.com/NCATSTranslator/Feedback/issues/198#issuecomment-1570587853              3
Minority Language Data Site_https://github.com/sillsdev/languageforge-lexbox/issues/60#issuecomment-1665792771           2
Minority Language Data Suggestions_https://github.com/sillsdev/languageforge-lexbox/issues/60#issuecomment-1669728877    2
Minority Language Data Site_https://github.com/sillsdev/languageforge-lexbox/issues/60#issuecomment-1669728877           2
                                                                                                                        ..
Map NumPy to ctypes_https://github.com/lcompilers/lpython/issues/2276#issuecomment-1677948059                            1
Map NumPy to ctypes_https://github.com/lcompilers/lpython/issues/2276#issuecomment-1677912426                            1


In [16]:
# Count how many conversations fall under each dominant detected language.
# NOTE(review): `mode_languages` is only a local variable inside
# filter_english_conversations in the pipeline cell; nothing visible assigns it
# at notebook scope, so this cell presumably relies on leftover kernel state —
# confirm it survives Restart & Run All.
#print(chatgpt_sharing['Mention.MentionedURL'].isnull().sum())
print(mode_languages.value_counts())


Detected_Language
en       315
ja        26
ko        13
ca         3
de         3
it         2
zh-cn      2
so         1
et         1
nl         1
vi         1
cs         1
Name: count, dtype: int64
