# Data Cleaning

In [76]:
import pandas as pd
import helper
import os

In [77]:
# Set up file paths and target tables.

snapshots = ["snapshot_20230727", "snapshot_20230831"]
dir_path = "~/DevGPT/"

# Options are "hn", "pr", "issue", "discussion", "commit", "file"
target = "file"
URL_rename = "URL_" + target

if target == "hn":
    pk = [URL_rename]
else:
    pk = [URL_rename, "RepoName"]

In [78]:
# Read all json file paths.
file_paths = []
for snapshot in snapshots:
    file_path = os.path.join(dir_path, snapshot)
    file_paths += helper.read_filepaths(file_path, target)

In [79]:
# Load Json files.
df = helper.load_dataframes(file_paths)

# Rename URL to URL_[target].
df.rename(columns={"URL": URL_rename}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1717 entries, 0 to 969
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            1717 non-null   object
 1   URL_file        1717 non-null   object
 2   ObjectSha       1717 non-null   object
 3   FileName        1717 non-null   object
 4   FilePath        1717 non-null   object
 5   Author          1717 non-null   object
 6   Content         1717 non-null   object
 7   RepoName        1717 non-null   object
 8   RepoLanguage    1387 non-null   object
 9   CommitSha       1717 non-null   object
 10  CommitMessage   1717 non-null   object
 11  AuthorAt        1717 non-null   object
 12  CommitAt        1717 non-null   object
 13  ChatgptSharing  1717 non-null   object
 14  source_date     1717 non-null   object
dtypes: object(15)
memory usage: 214.6+ KB


## ChatGPTSharing Table

In [80]:
records = []

pk += ["ChatgptSharing"]

for idx, row in df[pk].iterrows():
    for item in row["ChatgptSharing"]:
        obs = {}
        obs[URL_rename] = row[URL_rename]

        if target != "hn":
            obs["RepoName"] = row["RepoName"]
            
        obs.update(item)
        records.append(obs)

df_chatgpt_sharing = pd.DataFrame(records)

# Rename URL to URL_chatgptsharing.
df_chatgpt_sharing.rename(columns={"URL": "URL_chatgptsharing"}, inplace=True)
df_chatgpt_sharing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2436 entries, 0 to 2435
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   URL_file            2436 non-null   object 
 1   RepoName            2436 non-null   object 
 2   URL_chatgptsharing  2436 non-null   object 
 3   Mention             2436 non-null   object 
 4   Status              2436 non-null   int64  
 5   DateOfConversation  2263 non-null   object 
 6   DateOfAccess        2263 non-null   object 
 7   Title               2263 non-null   object 
 8   NumberOfPrompts     2263 non-null   float64
 9   TokensOfPrompts     2263 non-null   float64
 10  TokensOfAnswers     2263 non-null   float64
 11  Model               2263 non-null   object 
 12  Conversations       2263 non-null   object 
 13  HTMLContent         2263 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 266.6+ KB


## Mention table

`MentionedURL` is identical with `URL_[target]`

In [81]:
records = []

pk.pop() 
pk += ["URL_chatgptsharing", "Mention"]

for idx, row in df_chatgpt_sharing[pk].iterrows():

    if not isinstance(row["Mention"], dict):
        continue

    obs = {}
    obs[URL_rename] = row[URL_rename]

    if target != "hn":
        obs["RepoName"] = row["RepoName"]
        
    obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
    obs.update(row["Mention"])
    records.append(obs)

df_mention = pd.DataFrame(records)
df_mention.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2436 entries, 0 to 2435
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   URL_file            2436 non-null   object
 1   RepoName            2436 non-null   object
 2   URL_chatgptsharing  2436 non-null   object
 3   MentionedURL        2436 non-null   object
 4   MentionedProperty   2436 non-null   object
 5   MentionedAuthor     2436 non-null   object
 6   MentionedText       2436 non-null   object
dtypes: object(7)
memory usage: 133.3+ KB


## Conversation Table

In [82]:
records = []

pk += ["Conversations"]
for idx, row in df_chatgpt_sharing[pk].iterrows():

    if not isinstance(row["Conversations"], list):
        continue

    for item in row["Conversations"]:
        obs = {}
        obs[URL_rename] = row[URL_rename]

        if target != "hn":
            obs["RepoName"] = row["RepoName"]
            
        obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
        obs.update(item)
        records.append(obs)

df_conversation = pd.DataFrame(records)
df_conversation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19267 entries, 0 to 19266
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   URL_file            19267 non-null  object
 1   RepoName            19267 non-null  object
 2   URL_chatgptsharing  19267 non-null  object
 3   Prompt              19267 non-null  object
 4   Answer              19267 non-null  object
 5   ListOfCode          19267 non-null  object
dtypes: object(6)
memory usage: 903.3+ KB


## Clean Up
4 tables: `df_[target]`, `df_chatgpt_sharing`, `df_mention`, `df_conversation`

In [83]:
# Remove redundant columns
df = df.drop(columns="ChatgptSharing")
df_chatgpt_sharing = df_chatgpt_sharing.drop(columns=["Mention", "Conversations"])

## Save to file

In [84]:
# df dataframe contains the initial dataset
df.to_csv(
    os.path.join(dir_path, "cleaned", target + ".csv")
    )

# df_chatgpt_sharing dataframe contains chatgptsharing content in target table.
# It can combine with other tables using URL_'target' and RepoName.
df_chatgpt_sharing.to_csv(
    os.path.join(dir_path, "cleaned", target + "_chatgpt_sharing.csv")
    )

# df_mention dataframe contains mention content in target table.
# It can combine with other tables using URL_'target' and RepoName.
df_mention.to_csv(
    os.path.join(dir_path, "cleaned", target + "_mention.csv")
    )

# df_conversation dataframe contains conversation content in target table.
# It can combine with other tables using URL_'target' and RepoName.
df_conversation.to_csv(
    os.path.join(dir_path, "cleaned", target + "_conversation.csv")
    )

## Combine to a big dataframe

`df_total` is the dataframe that contains all the information of the target table.
Combining `chatgptsharing`, `mention`, and `conversation`

In [85]:
# Merge ChatGptSharing table to target table.
if target != "hn":
    merge_on = [URL_rename, "RepoName"]
else:
    merge_on = [URL_rename]

df_total = pd.merge(
    df, df_chatgpt_sharing,
    left_on=merge_on, 
    right_on=merge_on, 
    how="left"
)

In [86]:
# Merge Mention table to target table.
df_total = pd.merge(
    df_total, df_mention,
    left_on=merge_on, 
    right_on=merge_on, 
    how="left"
)

In [87]:
# Merge Conversation table to target table.
df_total = pd.merge(
    df_total, df_conversation,
    left_on=merge_on, 
    right_on=merge_on, 
    how="left"
)

In [88]:
df_total.columns

Index(['Type', 'URL_file', 'ObjectSha', 'FileName', 'FilePath', 'Author',
       'Content', 'RepoName', 'RepoLanguage', 'CommitSha', 'CommitMessage',
       'AuthorAt', 'CommitAt', 'source_date', 'URL_chatgptsharing_x', 'Status',
       'DateOfConversation', 'DateOfAccess', 'Title', 'NumberOfPrompts',
       'TokensOfPrompts', 'TokensOfAnswers', 'Model', 'HTMLContent',
       'URL_chatgptsharing_y', 'MentionedURL', 'MentionedProperty',
       'MentionedAuthor', 'MentionedText', 'URL_chatgptsharing', 'Prompt',
       'Answer', 'ListOfCode'],
      dtype='object')

In [89]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935541 entries, 0 to 935540
Data columns (total 33 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Type                  935541 non-null  object 
 1   URL_file              935541 non-null  object 
 2   ObjectSha             935541 non-null  object 
 3   FileName              935541 non-null  object 
 4   FilePath              935541 non-null  object 
 5   Author                935541 non-null  object 
 6   Content               935541 non-null  object 
 7   RepoName              935541 non-null  object 
 8   RepoLanguage          810495 non-null  object 
 9   CommitSha             935541 non-null  object 
 10  CommitMessage         935541 non-null  object 
 11  AuthorAt              935541 non-null  object 
 12  CommitAt              935541 non-null  object 
 13  source_date           935541 non-null  object 
 14  URL_chatgptsharing_x  935541 non-null  object 
 15  

In [90]:
# Save to file.

# df_total.to_csv(os.path.join(dir_path, "cleaned", target + "_total.csv"))