In [12]:
import numpy as np
import pandas as pd
from langdetect import DetectorFactory, LangDetectException, detect
from pandas import json_normalize

# Define file path
file_path = '../../data/snapshot_20230831/20230831_061759_issue_sharings.json'


def _as_list(value):
    """Return ``value`` unchanged when it is a list, otherwise a new empty list."""
    return value if isinstance(value, list) else []


def _explode_nested_records(parent, list_column):
    """Flatten a column holding lists of records into one row per record.

    Parameters
    ----------
    parent : pandas.DataFrame
        Frame whose ``list_column`` contains a list in every row.
    list_column : str
        Name of the column to explode and normalize.

    Returns
    -------
    pandas.DataFrame
        One row per list element (a parent with an empty list keeps a single
        all-NaN placeholder row), with a fresh RangeIndex. The normalized
        record fields come first, followed by every other parent column
        (in sorted name order) carrying the value of the row's own parent.
    """
    parent = parent.reset_index(drop=True)
    exploded = parent[list_column].explode()
    # explode() yields one NaN entry for an empty list; map anything that is
    # not a record to {} so json_normalize produces an all-NaN row for it.
    records = [item if isinstance(item, dict) else {} for item in exploded]
    children = json_normalize(records)
    # After reset_index the exploded index is each row's parent position, so
    # the lookup stays aligned even when some parents had empty lists.
    parent_rows = parent.iloc[exploded.index.to_numpy()]
    for col in parent.columns.difference([list_column]):
        children[col] = parent_rows[col].to_numpy()
    return children


# Load the snapshot and flatten each entry of its 'Sources' column into columns
snapshot = pd.read_json(file_path)
df = json_normalize(snapshot['Sources'].tolist())
print(f"Initial data rows: {len(df)}")

# Drop irrelevant columns (ignored when absent)
columns_to_drop = ['Type', 'Author']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Rename columns for clarity
# NOTE(review): 'Number' is renamed to 'numPrompts'; confirm it really is a
# prompt count and not the issue number.
df = df.rename(columns={
    "Title": "issueTitle",
    "URL": "sourceURL",
    "Body": "issueDesc",
    "Number": "numPrompts"
})

# Replace NaN values in 'ChatgptSharing' with empty lists
df['ChatgptSharing'] = df['ChatgptSharing'].apply(_as_list)

# One row per shared ChatGPT link, carrying the columns of its parent issue
columns_to_propagate = df.columns.difference(['ChatgptSharing'])
chatgpt_sharing = _explode_nested_records(df, 'ChatgptSharing')
print(f"After exploding 'ChatgptSharing': {len(chatgpt_sharing)} rows")

# Add conversation_id (NaN whenever the title or the mention URL is missing)
chatgpt_sharing['conversation_id'] = chatgpt_sharing['Title'] + '_' + chatgpt_sharing['Mention.MentionedURL']
print(f"Unique conversation IDs: {chatgpt_sharing['conversation_id'].nunique()}")

# Drop irrelevant columns (ignored when absent)
columns_to_drop = [
    'Status', 'DateOfConversation', 'DateOfAccess', 'NumberOfPrompts', 'TokensOfPrompts',
    'TokensOfAnswers', 'Model', 'HTMLContent', 'URL', 'Mention.MentionedURL',
    'Mention.MentionedAuthor'
]
chatgpt_sharing = chatgpt_sharing.drop(columns=columns_to_drop, errors='ignore')

# Rename columns for clarity
chatgpt_sharing = chatgpt_sharing.rename(columns={
    'Title': 'conversationTitle',
    'Mention.MentionedProperty': 'mentionProperty',
    'Mention.MentionedText': 'mentionText'
})

# Replace NaN values in 'Conversations' with empty lists
chatgpt_sharing['Conversations'] = chatgpt_sharing['Conversations'].apply(_as_list)

# One row per prompt/answer turn, carrying the columns of its parent sharing
conversations = _explode_nested_records(chatgpt_sharing, 'Conversations')
print(f"After exploding 'Conversations': {len(conversations)} rows")

# Drop rows where both 'Prompt' and 'Answer' are null; copy so that later
# column assignments act on an independent frame rather than a slice.
has_content = ~(conversations['Prompt'].isnull() & conversations['Answer'].isnull())
conversations = conversations[has_content].copy()
print(f"Filtered conversations: {len(conversations)} rows")


# Detect language in 'Prompt' column
def detect_language(prompt):
    """Return the language code detected for ``prompt``, or None.

    Parameters
    ----------
    prompt : object
        Text to classify. Anything that is not a non-blank string (for
        example NaN or None coming from a missing 'Prompt' cell) is treated
        as undetectable instead of being handed to ``detect``.

    Returns
    -------
    str or None
        The value returned by ``langdetect.detect``, or None when the input
        is not usable text or ``detect`` raises ``LangDetectException``.
    """
    # Guard first: detect() only handles text, and a non-string value would
    # raise an error that the LangDetectException handler does not catch.
    if not isinstance(prompt, str) or not prompt.strip():
        return None
    try:
        return detect(prompt)
    except LangDetectException:
        return None

# Apply language detection
# langdetect is randomised internally; pinning the seed keeps repeated runs
# (and comparisons against other scripts) consistent.
DetectorFactory.seed = 0
# assign() builds a new frame, so this never writes into a filtered slice
conversations = conversations.assign(
    Detected_Language=conversations['Prompt'].apply(detect_language)
)
print(f"Language detection applied. Null languages: {conversations['Detected_Language'].isnull().sum()}")


def _dominant_language(languages):
    """Return the most frequent detected language, or None if none was detected.

    On a tie, ``Series.mode`` returns the candidates sorted and the first one
    is used.
    """
    modes = languages.mode()
    return modes.iloc[0] if not modes.empty else None


# Filter for English conversations: keep every row of a conversation whose
# most frequent detected language is 'en'
mode_languages = conversations.groupby('conversation_id')['Detected_Language'].agg(_dominant_language)
english_conversations = mode_languages[mode_languages == 'en'].index.tolist()
conversations = conversations[conversations['conversation_id'].isin(english_conversations)].copy()
print(f"English conversations: {len(conversations)} rows, {conversations['conversation_id'].nunique()} unique IDs")



Initial data rows: 353
After exploding 'ChatgptSharing': 417 rows
Unique conversation IDs: 373
After exploding 'Conversations': 1780 rows
Filtered conversations: 1747 rows
Language detection applied. Null languages: 6
English conversations: 1438 rows, 314 unique IDs


In [13]:
# Diagnostics meant to be compared against the same values in the other script
id_frequencies = chatgpt_sharing['conversation_id'].value_counts()  # rows per conversation id
missing_prompts = conversations['Prompt'].isnull().sum()  # prompts still null after filtering
language_frequencies = conversations['Detected_Language'].value_counts()  # rows per detected language
print(id_frequencies, missing_prompts, language_frequencies, mode_languages, sep='\n')


conversation_id
AHC Treatment Compounds_https://github.com/NCATSTranslator/Feedback/issues/198#issuecomment-1570587853                   6
AHC Treatment Compounds List_https://github.com/NCATSTranslator/Feedback/issues/198#issuecomment-1570587853              3
Minority Language Data Suggestions_https://github.com/sillsdev/languageforge-lexbox/issues/60#issuecomment-1665792771    2
Minority Language Data Site_https://github.com/sillsdev/languageforge-lexbox/issues/60#issuecomment-1665792771           2
Minority Language Data Suggestions_https://github.com/sillsdev/languageforge-lexbox/issues/60#issuecomment-1669728877    2
                                                                                                                        ..
Bean Retrieval: Class Details_https://github.com/jabrena/spring-boot-user-beans/issues/36                                1
Spring Class Package_https://github.com/jabrena/spring-boot-user-beans/issues/36#issuecomment-1595903216                 1


In [None]:
# Number of conversations per dominant (most frequent) detected language
language_totals = mode_languages.value_counts()
print(language_totals)


Detected_Language
en       314
ja        26
ko        13
de         4
it         2
ca         2
nl         2
zh-cn      2
so         1
et         1
vi         1
cs         1
Name: count, dtype: int64
