## Question

7) How accurately can we predict the length of a conversation with ChatGPT based on the initial prompt and context provided?

### Data Extraction

In [1]:
import pandas as pd
from pandas import json_normalize
import numpy as np
from langdetect import detect, LangDetectException

# Define file path
file_path = '../data/snapshot_20230831/20230831_061759_issue_sharings.json'

# Load JSON file into a DataFrame
df = pd.read_json(file_path)

# Normalize 'Sources' JSON data
df = json_normalize(df['Sources'])
print(f"Initial data: {df.info()}")  # Check data structure

# Verify 'Body' observations in a specific range
for i, row in enumerate(df['Body'][70:76]):
    print(f"Row {i}: {row}")

# Drop irrelevant columns
columns_to_drop = ['Type', 'Author']
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

# Rename columns for clarity
df.rename(columns={
    "Title": "issueTitle",
    "URL": "sourceURL",
    "Body": "issueDesc",
    "Number": "numPrompts"
}, inplace=True)
print(f"Columns renamed: {df.info()}")

# Replace NaN values in 'ChatgptSharing' with empty lists
df['ChatgptSharing'] = df['ChatgptSharing'].apply(lambda x: x if isinstance(x, list) else [])

# Explode 'ChatgptSharing' column and normalize JSON
chatgpt_sharing = json_normalize(df['ChatgptSharing'].explode())

# Propagate columns from the parent DataFrame to the exploded DataFrame
columns_to_propagate = df.columns.difference(['ChatgptSharing'])
for col in columns_to_propagate:
    chatgpt_sharing[col] = df[col].repeat(df['ChatgptSharing'].apply(len)).reset_index(drop=True)

# Add conversation_id
chatgpt_sharing['conversation_id'] = chatgpt_sharing['Title'] + '_' + chatgpt_sharing['Mention.MentionedURL']

# Validate data propagation
print(chatgpt_sharing.info())

# Drop irrelevant columns
columns_to_drop = [
    'Status', 'DateOfConversation', 'DateOfAccess', 'NumberOfPrompts', 'TokensOfPrompts', 
    'TokensOfAnswers', 'Model', 'HTMLContent', 'URL', 'Mention.MentionedURL', 
    'Mention.MentionedAuthor'
]
chatgpt_sharing.drop(columns=columns_to_drop, errors='ignore', inplace=True)

# Rename columns for clarity
chatgpt_sharing.rename(columns={
    'Title': 'conversationTitle',
    'Mention.MentionedProperty': 'mentionProperty',
    'Mention.MentionedText': 'mentionText'
}, inplace=True)

# Replace NaN values in 'Conversations' with empty lists
chatgpt_sharing['Conversations'] = chatgpt_sharing['Conversations'].apply(lambda x: x if isinstance(x, list) else [])

# Explode 'Conversations' column and normalize JSON
conversations = json_normalize(chatgpt_sharing['Conversations'].explode())

# Propagate columns from 'chatgpt_sharing' to 'conversations'
for col in chatgpt_sharing.columns.difference(['Conversations']):
    conversations[col] = chatgpt_sharing[col].repeat(chatgpt_sharing['Conversations'].apply(len)).reset_index(drop=True)

# Drop rows with null 'Prompt' and 'Answer'
conversations = conversations[~(conversations['Prompt'].isnull() & conversations['Answer'].isnull())]
print(f"Filtered conversations: {conversations.info()}")

# Detect language in 'Prompt' column
def detect_language(prompt):
    try:
        return detect(prompt)
    except LangDetectException:
        return None

# Apply language detection
conversations['Detected_Language'] = conversations['Prompt'].apply(detect_language)

# Filter for English conversations
mode_languages = conversations.groupby('conversation_id')['Detected_Language'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)
english_conversations = mode_languages[mode_languages == 'en'].index.tolist()
conversations = conversations[conversations['conversation_id'].isin(english_conversations)]



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            353 non-null    object
 1   URL             353 non-null    object
 2   Author          353 non-null    object
 3   RepoName        353 non-null    object
 4   RepoLanguage    320 non-null    object
 5   Number          353 non-null    int64 
 6   Title           353 non-null    object
 7   Body            353 non-null    object
 8   CreatedAt       353 non-null    object
 9   ClosedAt        163 non-null    object
 10  UpdatedAt       353 non-null    object
 11  State           353 non-null    object
 12  ChatgptSharing  353 non-null    object
dtypes: int64(1), object(12)
memory usage: 36.0+ KB
Initial data: None
Row 0: ### Describe the bug

https://www.geoclub.de/forum/t/gc-little-helper-ii-ab-v0-11.81650/post-1360958 und folgende

### To Reproduce

_No response_

### Expec

In [2]:
len(english_conversations)

314

In [3]:
import pandas as pd
from pandas import json_normalize
from langdetect import detect, LangDetectException

# Función para detectar idioma con manejo de excepciones
def detect_language(prompt):
    try:
        return detect(prompt)
    except LangDetectException:
        return None

# Carga y normalización inicial del JSON
def load_and_normalize_json(file_path):
    df = pd.read_json(file_path)
    df = json_normalize(df['Sources'])
    print(f"Dataframe inicial cargado: {df.info()}")
    return df

# Limpieza y renombrado de columnas
def clean_and_rename_columns(df):
    columns_to_drop = ['Type', 'Author']
    df.drop(columns=columns_to_drop, errors='ignore', inplace=True)
    df.rename(columns={
        "Title": "issueTitle",
        "URL": "sourceURL",
        "Body": "issueDesc",
        "Number": "numPrompts"
    }, inplace=True)
    print(f"Columnas después del renombrado: {df.info()}")
    return df

# Normalización y explosión de datos en 'ChatgptSharing'
def normalize_and_explode_chatgpt_sharing(df):
    df['ChatgptSharing'] = df['ChatgptSharing'].apply(lambda x: x if isinstance(x, list) else [])
    chatgpt_sharing = json_normalize(df['ChatgptSharing'].explode())
    
    # Propagar columnas
    columns_to_propagate = df.columns.difference(['ChatgptSharing'])
    for col in columns_to_propagate:
        chatgpt_sharing[col] = df[col].repeat(df['ChatgptSharing'].apply(len)).reset_index(drop=True)
    
    chatgpt_sharing['conversation_id'] = chatgpt_sharing['Title'] + '_' + chatgpt_sharing['Mention.MentionedURL']
    print(f"ChatgptSharing normalizado: {chatgpt_sharing.info()}")
    return chatgpt_sharing

# Limpieza adicional y renombrado
def clean_chatgpt_sharing(chatgpt_sharing):
    columns_to_drop = [
        'Status', 'DateOfConversation', 'DateOfAccess', 'NumberOfPrompts', 'TokensOfPrompts',
        'TokensOfAnswers', 'Model', 'HTMLContent', 'URL', 'Mention.MentionedURL', 
        'Mention.MentionedAuthor'
    ]
    chatgpt_sharing.drop(columns=columns_to_drop, errors='ignore', inplace=True)
    chatgpt_sharing.rename(columns={
        'Title': 'conversationTitle',
        'Mention.MentionedProperty': 'mentionProperty',
        'Mention.MentionedText': 'mentionText'
    }, inplace=True)
    print(f"ChatgptSharing después de limpieza: {chatgpt_sharing.info()}")
    return chatgpt_sharing

# Normalización y explosión de conversaciones
def normalize_and_explode_conversations(chatgpt_sharing):
    chatgpt_sharing['Conversations'] = chatgpt_sharing['Conversations'].apply(lambda x: x if isinstance(x, list) else [])
    conversations = json_normalize(chatgpt_sharing['Conversations'].explode())
    
    # Propagar columnas
    columns_to_propagate = chatgpt_sharing.columns.difference(['Conversations'])
    for col in columns_to_propagate:
        conversations[col] = chatgpt_sharing[col].repeat(chatgpt_sharing['Conversations'].apply(len)).reset_index(drop=True)
    
    # Eliminar filas con 'Prompt' y 'Answer' nulos
    conversations = conversations[~(conversations['Prompt'].isnull() & conversations['Answer'].isnull())]
    print(f"Conversaciones después de limpieza: {conversations.info()}")
    return conversations

# Filtrar conversaciones en inglés
def filter_english_conversations(conversations):
    conversations['Detected_Language'] = conversations['Prompt'].apply(detect_language)
    mode_languages = conversations.groupby('conversation_id')['Detected_Language'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)
    english_conversations = mode_languages[mode_languages == 'en'].index.tolist()
    conversations = conversations[conversations['conversation_id'].isin(english_conversations)]
    print(f"Conversaciones en inglés: {conversations.info()}")
    return conversations

# Función principal
def process_data(file_path):
    df = load_and_normalize_json(file_path)
    df = clean_and_rename_columns(df)
    chatgpt_sharing = normalize_and_explode_chatgpt_sharing(df)
    chatgpt_sharing = clean_chatgpt_sharing(chatgpt_sharing)
    conversations = normalize_and_explode_conversations(chatgpt_sharing)
    conversations = filter_english_conversations(conversations)
    return conversations

# Ejecutar procesamiento
file_path = '../data/snapshot_20230831/20230831_061759_issue_sharings.json'
processed_conversations_2 = process_data(file_path)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            353 non-null    object
 1   URL             353 non-null    object
 2   Author          353 non-null    object
 3   RepoName        353 non-null    object
 4   RepoLanguage    320 non-null    object
 5   Number          353 non-null    int64 
 6   Title           353 non-null    object
 7   Body            353 non-null    object
 8   CreatedAt       353 non-null    object
 9   ClosedAt        163 non-null    object
 10  UpdatedAt       353 non-null    object
 11  State           353 non-null    object
 12  ChatgptSharing  353 non-null    object
dtypes: int64(1), object(12)
memory usage: 36.0+ KB
Dataframe inicial cargado: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
--

In [4]:
len(processed_conversations_2)
processed_conversations_2.head()

Unnamed: 0,Prompt,Answer,ListOfCode,ClosedAt,CreatedAt,RepoLanguage,RepoName,State,UpdatedAt,conversationTitle,conversation_id,issueDesc,issueTitle,mentionProperty,mentionText,numPrompts,sourceURL,Detected_Language
28,How to run a java class inside of a container ...,To run a Java class inside a container using T...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '...",,2023-07-07T20:30:07Z,Java,jabrena/aqa-tests-experiments,OPEN,2023-07-08T11:56:45Z,Java class in container,Java class in container_https://github.com/jab...,- https://chat.openai.com/share/e169e9a7-40c5-...,Run a test in multiple java distros,body,- https://chat.openai.com/share/e169e9a7-40c5-...,4.0,https://github.com/jabrena/aqa-tests-experimen...,en
29,I need to add a java class dynamically to exec...,To dynamically add and execute a Java class us...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '...",,2023-07-07T20:30:07Z,Java,jabrena/aqa-tests-experiments,OPEN,2023-07-08T11:56:45Z,Java class in container,Java class in container_https://github.com/jab...,- https://chat.openai.com/share/e169e9a7-40c5-...,Run a test in multiple java distros,body,- https://chat.openai.com/share/e169e9a7-40c5-...,4.0,https://github.com/jabrena/aqa-tests-experimen...,en
30,How to add a java class in a generic container...,To add a Java class in a generic container fro...,[],,2023-07-07T20:30:07Z,Java,jabrena/aqa-tests-experiments,OPEN,2023-07-08T11:56:45Z,,_https://github.com/jabrena/aqa-tests-experime...,- https://chat.openai.com/share/e169e9a7-40c5-...,Run a test in multiple java distros,body,- https://chat.openai.com/share/e169e9a7-40c5-...,4.0,https://github.com/jabrena/aqa-tests-experimen...,en
31,Show me code only,Certainly! Here's an example code snippet that...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '...",,2023-07-07T20:30:07Z,Java,jabrena/aqa-tests-experiments,OPEN,2023-07-08T11:56:45Z,,_https://github.com/jabrena/aqa-tests-experime...,- https://chat.openai.com/share/e169e9a7-40c5-...,Run a test in multiple java distros,body,- https://chat.openai.com/share/e169e9a7-40c5-...,4.0,https://github.com/jabrena/aqa-tests-experimen...,en
32,But I say to add a physical java class inside ...,Apologies for the confusion. If you want to ad...,"[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '...",,2023-07-07T20:30:07Z,Java,jabrena/aqa-tests-experiments,OPEN,2023-07-08T11:56:45Z,,_https://github.com/jabrena/aqa-tests-experime...,- https://chat.openai.com/share/e169e9a7-40c5-...,Run a test in multiple java distros,body,- https://chat.openai.com/share/e169e9a7-40c5-...,4.0,https://github.com/jabrena/aqa-tests-experimen...,en


In [5]:
# Después de cargar los datos
print("Contenido inicial de df['ChatgptSharing']:")
print(df['ChatgptSharing'].head())

# Después de reemplazar NaN en 'ChatgptSharing'
df['ChatgptSharing'] = df['ChatgptSharing'].apply(lambda x: x if isinstance(x, list) else [])
print("Después de reemplazar NaN en df['ChatgptSharing']:")
print(df['ChatgptSharing'].head())

# Después de explotar 'ChatgptSharing'
chatgpt_sharing = json_normalize(df['ChatgptSharing'].explode())
print("Después de explotar 'ChatgptSharing':")
print(chatgpt_sharing.head())

# Antes de explotar 'Conversations'
print("Contenido de df['Conversations'] antes de explotar:")
print(chatgpt_sharing['Conversations'].head())

# Después de explotar 'Conversations'
chatgpt_sharing['Conversations'] = chatgpt_sharing['Conversations'].apply(lambda x: x if isinstance(x, list) else [])
conversations = json_normalize(chatgpt_sharing['Conversations'].explode())
print("Después de explotar 'Conversations':")
print(conversations.head())




Contenido inicial de df['ChatgptSharing']:
0    [{'URL': 'https://chat.openai.com/share/795827...
1    [{'URL': 'https://chat.openai.com/share/e169e9...
2    [{'URL': 'https://chat.openai.com/share/340a32...
3    [{'URL': 'https://chat.openai.com/share/e94b0e...
4    [{'URL': 'https://chat.openai.com/share/1c4bc8...
Name: ChatgptSharing, dtype: object
Después de reemplazar NaN en df['ChatgptSharing']:
0    [{'URL': 'https://chat.openai.com/share/795827...
1    [{'URL': 'https://chat.openai.com/share/e169e9...
2    [{'URL': 'https://chat.openai.com/share/340a32...
3    [{'URL': 'https://chat.openai.com/share/e94b0e...
4    [{'URL': 'https://chat.openai.com/share/1c4bc8...
Name: ChatgptSharing, dtype: object
Después de explotar 'ChatgptSharing':
                                                 URL  Status  \
0  https://chat.openai.com/share/7958273c-0081-48...     200   
1  https://chat.openai.com/share/8b0f517f-1aaf-4b...     200   
2  https://chat.openai.com/share/e169e9a7-40c5-46...  

In [6]:
# Estadísticas clave de df['ChatgptSharing']
print("=== ChatgptSharing Summary ===")
print(f"Total rows in ChatgptSharing: {len(df['ChatgptSharing'])}")
print(f"Number of non-null entries: {df['ChatgptSharing'].notnull().sum()}")
print(f"Number of empty lists: {df['ChatgptSharing'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()}")
print(f"Sample of 5 entries:\n{df['ChatgptSharing'].head()}")

# Si ChatgptSharing fue explotado, revisar el DataFrame resultante
if 'chatgpt_sharing' in locals():
    print("\n=== Exploded ChatgptSharing Summary ===")
    print(f"Total rows after explosion: {len(chatgpt_sharing)}")
    print(f"Sample of 5 rows after explosion:\n{chatgpt_sharing.head()}")

# Estadísticas clave de df['Conversations']
print("\n=== Conversations Summary ===")
if 'Conversations' in chatgpt_sharing.columns:
    print(f"Total rows in Conversations: {len(chatgpt_sharing['Conversations'])}")
    print(f"Number of non-null entries: {chatgpt_sharing['Conversations'].notnull().sum()}")
    print(f"Number of empty lists: {chatgpt_sharing['Conversations'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()}")
    print(f"Sample of 5 entries:\n{chatgpt_sharing['Conversations'].head()}")

# Si Conversations fue explotado, revisar el DataFrame resultante
if 'conversations' in locals():
    print("\n=== Exploded Conversations Summary ===")
    print(f"Total rows after explosion: {len(conversations)}")
    print(f"Number of non-null entries in 'Prompt': {conversations['Prompt'].notnull().sum()}")
    print(f"Number of non-null entries in 'Answer': {conversations['Answer'].notnull().sum()}")
    print(f"Sample of 5 rows after explosion:\n{conversations.head()}")


=== ChatgptSharing Summary ===
Total rows in ChatgptSharing: 353
Number of non-null entries: 353
Number of empty lists: 0
Sample of 5 entries:
0    [{'URL': 'https://chat.openai.com/share/795827...
1    [{'URL': 'https://chat.openai.com/share/e169e9...
2    [{'URL': 'https://chat.openai.com/share/340a32...
3    [{'URL': 'https://chat.openai.com/share/e94b0e...
4    [{'URL': 'https://chat.openai.com/share/1c4bc8...
Name: ChatgptSharing, dtype: object

=== Exploded ChatgptSharing Summary ===
Total rows after explosion: 417
Sample of 5 rows after explosion:
                                                 URL  Status  \
0  https://chat.openai.com/share/7958273c-0081-48...     200   
1  https://chat.openai.com/share/8b0f517f-1aaf-4b...     200   
2  https://chat.openai.com/share/e169e9a7-40c5-46...     200   
3  https://chat.openai.com/share/b508ddd3-af83-42...     200   
4  https://chat.openai.com/share/340a3228-4be2-4f...     200   

  DateOfConversation                DateOfAccess      

In [7]:
# Inspeccionar listas vacías y valores nulos en 'ChatgptSharing'
empty_lists = df['ChatgptSharing'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()
null_values = df['ChatgptSharing'].isnull().sum()

print(f"Número de listas vacías en 'ChatgptSharing': {empty_lists}")
print(f"Número de valores nulos en 'ChatgptSharing': {null_values}")

# Después de explotar 'ChatgptSharing'
exploded_nulls = chatgpt_sharing.isnull().sum()
print("Valores nulos después de explotar 'ChatgptSharing':")
print(exploded_nulls)

# Inspeccionar listas vacías en 'Conversations'
empty_lists_conversations = chatgpt_sharing['Conversations'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()
null_values_conversations = chatgpt_sharing['Conversations'].isnull().sum()

print(f"Número de listas vacías en 'Conversations': {empty_lists_conversations}")
print(f"Número de valores nulos en 'Conversations': {null_values_conversations}")


Número de listas vacías en 'ChatgptSharing': 0
Número de valores nulos en 'ChatgptSharing': 0
Valores nulos después de explotar 'ChatgptSharing':
URL                           0
Status                        0
DateOfConversation           33
DateOfAccess                 33
Title                        33
NumberOfPrompts              33
TokensOfPrompts              33
TokensOfAnswers              33
Model                        33
Conversations                 0
HTMLContent                  33
Mention.MentionedURL          0
Mention.MentionedProperty     0
Mention.MentionedAuthor       0
Mention.MentionedText         0
dtype: int64
Número de listas vacías en 'Conversations': 33
Número de valores nulos en 'Conversations': 0


In [8]:
# Antes de la propagación
print("Antes de propagar columnas en 'ChatgptSharing':")
print(chatgpt_sharing.head())
print(f"Número de filas en 'ChatgptSharing': {len(chatgpt_sharing)}")

# Propagar columnas
columns_to_propagate = df.columns.difference(['ChatgptSharing'])
for col in columns_to_propagate:
    chatgpt_sharing[col] = df[col].repeat(df['ChatgptSharing'].apply(len)).reset_index(drop=True)

# Después de la propagación
print("Después de propagar columnas en 'ChatgptSharing':")
print(chatgpt_sharing.head())
print(f"Número de filas en 'ChatgptSharing': {len(chatgpt_sharing)}")

# Verificar propagación en 'Conversations'
print("Antes de propagar columnas en 'Conversations':")
print(conversations.head())
print(f"Número de filas en 'Conversations': {len(conversations)}")

# Propagar columnas en 'Conversations'
columns_to_propagate_conversations = chatgpt_sharing.columns.difference(['Conversations'])
for col in columns_to_propagate_conversations:
    conversations[col] = chatgpt_sharing[col].repeat(chatgpt_sharing['Conversations'].apply(len)).reset_index(drop=True)

print("Después de propagar columnas en 'Conversations':")
print(conversations.head())
print(f"Número de filas en 'Conversations': {len(conversations)}")


Antes de propagar columnas en 'ChatgptSharing':
                                                 URL  Status  \
0  https://chat.openai.com/share/7958273c-0081-48...     200   
1  https://chat.openai.com/share/8b0f517f-1aaf-4b...     200   
2  https://chat.openai.com/share/e169e9a7-40c5-46...     200   
3  https://chat.openai.com/share/b508ddd3-af83-42...     200   
4  https://chat.openai.com/share/340a3228-4be2-4f...     200   

  DateOfConversation                DateOfAccess                    Title  \
0       July 6, 2023  2023-08-31 06:07:21.607995          ハンバーガーメニュー表示の修正   
1       July 6, 2023  2023-08-31 06:07:22.803198           モバイルハンバーガーメニュー   
2       July 7, 2023  2023-08-31 06:07:23.748611  Java class in container   
3       July 8, 2023  2023-08-31 06:07:24.502512                            
4    August 26, 2023  2023-08-31 06:07:25.430598     LOTR Easter Egg Idea   

   NumberOfPrompts  TokensOfPrompts  TokensOfAnswers    Model  \
0             11.0           2637.0    