# Data extraction

In [7]:
import os
import pandas as pd

# Local directory with all the JSON files
folder_path = "../data/snapshot_20230831"

# Parallel lists: mother_data_frames[i] was loaded from mother_file_names[i]
mother_data_frames = []
mother_file_names = []  # Names of the JSON files that loaded successfully

# os.listdir() returns entries in arbitrary order, and a later cell picks a
# frame by position (mother_data_frames[0]). Sorting the names makes the load
# order, and therefore those positions, reproducible across machines and runs.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith(".json"):  # Only JSON files
        continue

    file_path = os.path.join(folder_path, file_name)
    print(f"Loading file: {file_name}")

    # Only the parse itself is guarded; the two lists are appended together in
    # the else-branch so they always stay aligned index-for-index.
    try:
        df = pd.read_json(file_path)
    except ValueError as e:
        # A file that raises ValueError while loading is reported and skipped
        print(f"Error loading {file_name}: {e}")
    else:
        mother_data_frames.append(df)
        mother_file_names.append(file_name)  # Store the corresponding file name

# Print the list of file names
print("Corresponding file names for DataFrames:")
print(mother_file_names)


Loading file: 20230831_060603_pr_sharings.json
Loading file: 20230831_061759_issue_sharings.json
Loading file: 20230831_061926_discussion_sharings.json
Loading file: 20230831_063412_commit_sharings.json
Loading file: 20230831_072722_file_sharings.json
Loading file: 20230831_073827_hn_sharings.json
Corresponding file names for DataFrames:
['20230831_060603_pr_sharings.json', '20230831_061759_issue_sharings.json', '20230831_061926_discussion_sharings.json', '20230831_063412_commit_sharings.json', '20230831_072722_file_sharings.json', '20230831_073827_hn_sharings.json']


In [11]:
from pandas import json_normalize
import ast

# Take the first DataFrame that was loaded into the list
first_loaded_df = mother_data_frames[0]

# Flatten the (possibly nested) records held in its 'Sources' column
sharings_df = json_normalize(first_loaded_df['Sources'])

# Show the first rows of the normalized DataFrame
print(sharings_df.head())

           Type                                                URL  \
0  pull request   https://github.com/daeuniverse/dae-wing/pull/115   
1  pull request  https://github.com/FlorianWoelki/obsidian-symb...   
2  pull request            https://github.com/labdao/plex/pull/469   
3  pull request            https://github.com/labdao/plex/pull/468   
4  pull request  https://github.com/mlc-ai/web-stable-diffusion...   

              Author                                   RepoName  \
0            dae-bot                       daeuniverse/dae-wing   
1  sharshuv-quotient  FlorianWoelki/obsidian-symbols-prettifier   
2          AdamGoyer                                labdao/plex   
3          AdamGoyer                                labdao/plex   
4             jchris                mlc-ai/web-stable-diffusion   

       RepoLanguage  Number  \
0                Go     115   
1        TypeScript       7   
2                Go     469   
3                Go     468   
4  Jupyter Notebook  

In [12]:
# Normalize ChatgptSharing: explode() gives every element of a list-like cell
# its own row, then json_normalize flattens those records into columns.
chatgpt_sharing_items = sharings_df['ChatgptSharing'].explode()
sharing_gpt = json_normalize(chatgpt_sharing_items)

print(sharing_gpt.head())



                                                 URL  Status  \
0  https://chat.openai.com/share/9fb7d601-2a86-40...     200   
1  https://chat.openai.com/share/acece149-6d4b-45...     200   
2  https://chat.openai.com/share/8bd33825-e8c6-44...     200   
3  https://chat.openai.com/share/2c4b0dba-429c-4c...     200   
4  https://chat.openai.com/share/a3f23f8c-935e-4b...     200   

  DateOfConversation                DateOfAccess  \
0    August 25, 2023  2023-08-31 05:56:44.392433   
1    August 23, 2023  2023-08-31 05:56:46.273656   
2       July 5, 2023  2023-08-31 05:56:46.953811   
3       July 5, 2023  2023-08-31 05:56:48.306546   
4    August 30, 2023  2023-08-31 05:56:50.885307   

                                     Title  NumberOfPrompts  TokensOfPrompts  \
0        Translate Technical Documentation              4.0           1653.0   
1              Avoid Prototype Chain Issue              2.0            223.0   
2                       Open Babel on PLEX              6.0   

In [13]:
# Normalize Conversations: explode() gives every element of a list-like cell
# its own row, then json_normalize flattens those records into columns.
conversation_items = sharing_gpt['Conversations'].explode()
sharing_conv = json_normalize(conversation_items)

print(sharing_conv.head())


                                              Prompt  \
0  Hey, I am working on writing a technical docum...   
1  I noticed that I've already have translated co...   
2  # Working Principle of dae\n\n[**简体中文**](../zh...   
3                             Is the gramma refined?   
4  I have this code:\n\nconst characterMap: Chara...   

                                              Answer  \
0  Of course, I'd be happy to help you translate ...   
1  Absolutely, I'd be glad to help you refine the...   
2  Working Principle of dae简体中文 | Englishdae oper...   
3  Yes, the grammar has been refined in the provi...   
4  This issue arises because object property look...   

                                          ListOfCode  
0                                                 []  
1                                                 []  
2                                                 []  
3                                                 []  
4  [{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..