# Data Cleaning

In [1]:
import pandas as pd
import helper
import os

In [2]:
# Set up file paths and target tables.

# Snapshot folder names; each one is joined onto dir_path when the JSON
# file paths are collected.
snapshots = [
    "snapshot_20230727",
    "snapshot_20230831",
]
dir_path = "~/DevGPT/"

# Name of the table being cleaned. It is handed to helper.read_filepaths and
# is also the prefix of every CSV written by this notebook.
target = "pr"

# New name for the generic "URL" column. Derived from `target` so that the
# two settings cannot drift apart when another table is processed.
URL_rename = "URL_" + target

In [3]:
# Collect the JSON file paths of the target table from every snapshot folder.
file_paths = []
for snapshot_name in snapshots:
    snapshot_dir = os.path.join(dir_path, snapshot_name)
    file_paths.extend(helper.read_filepaths(snapshot_dir, target))

In [4]:
# Load the JSON files into a single dataframe and give the generic "URL"
# column its table-specific name (the value of URL_rename).
df = helper.load_dataframes(file_paths).rename(columns={"URL": URL_rename})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 340 entries, 0 to 192
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Type               340 non-null    object
 1   URL_pr             340 non-null    object
 2   Author             340 non-null    object
 3   RepoName           340 non-null    object
 4   RepoLanguage       334 non-null    object
 5   Number             340 non-null    int64 
 6   Title              340 non-null    object
 7   Body               340 non-null    object
 8   CreatedAt          340 non-null    object
 9   ClosedAt           300 non-null    object
 10  MergedAt           254 non-null    object
 11  UpdatedAt          340 non-null    object
 12  State              340 non-null    object
 13  Additions          340 non-null    int64 
 14  Deletions          340 non-null    int64 
 15  ChangedFiles       340 non-null    int64 
 16  CommitsTotalCount  340 non-null    int64 
 17  Co

## ChatGPTSharing Table

In [5]:
# Unpack the nested "ChatgptSharing" lists: one output row per shared link,
# tagged with the URL and repository of the row it came from.
records = []

for pr_url, repo_name, sharings in zip(
    df[URL_rename], df["RepoName"], df["ChatgptSharing"]
):
    for sharing in sharings:
        record = {URL_rename: pr_url, "RepoName": repo_name}
        record.update(sharing)
        records.append(record)

# Rename URL to URL_chatgptsharing.
df_chatgpt_sharing = pd.DataFrame(records).rename(
    columns={"URL": "URL_chatgptsharing"}
)
df_chatgpt_sharing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   URL_pr              394 non-null    object 
 1   RepoName            394 non-null    object 
 2   URL_chatgptsharing  394 non-null    object 
 3   Mention             394 non-null    object 
 4   Status              394 non-null    int64  
 5   DateOfConversation  368 non-null    object 
 6   DateOfAccess        368 non-null    object 
 7   Title               368 non-null    object 
 8   NumberOfPrompts     368 non-null    float64
 9   TokensOfPrompts     368 non-null    float64
 10  TokensOfAnswers     368 non-null    float64
 11  Model               368 non-null    object 
 12  Conversations       368 non-null    object 
 13  HTMLContent         368 non-null    object 
dtypes: float64(3), int64(1), object(10)
memory usage: 43.2+ KB


## Mention table

`MentionedURL` is identical to `URL_[target]`.

In [7]:
# Flatten the "Mention" dict of every shared link into its own columns.
# Rows whose "Mention" value is not a dict are skipped.
mention_columns = [URL_rename, "RepoName", "URL_chatgptsharing", "Mention"]

records = [
    {
        URL_rename: pr_url,
        "RepoName": repo_name,
        "URL_chatgptsharing": sharing_url,
        **mention,
    }
    for pr_url, repo_name, sharing_url, mention in df_chatgpt_sharing[
        mention_columns
    ].itertuples(index=False, name=None)
    if isinstance(mention, dict)
]

df_mention = pd.DataFrame(records)
df_mention.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   URL_pr              394 non-null    object
 1   RepoName            394 non-null    object
 2   URL_chatgptsharing  394 non-null    object
 3   MentionedURL        394 non-null    object
 4   MentionedProperty   394 non-null    object
 5   MentionedAuthor     394 non-null    object
 6   MentionedText       394 non-null    object
 7   MentionedPath       128 non-null    object
dtypes: object(8)
memory usage: 24.8+ KB


## Conversation Table

In [8]:
# Unpack the "Conversations" lists: one output row per list element, tagged
# with the keys of the shared link it belongs to. Rows whose "Conversations"
# value is not a list are skipped.
conversation_columns = [
    URL_rename, "RepoName", "URL_chatgptsharing", "Conversations"
]

records = []
for pr_url, repo_name, sharing_url, turns in df_chatgpt_sharing[
    conversation_columns
].itertuples(index=False, name=None):
    if not isinstance(turns, list):
        continue

    key_fields = {
        URL_rename: pr_url,
        "RepoName": repo_name,
        "URL_chatgptsharing": sharing_url,
    }
    for turn in turns:
        record = dict(key_fields)
        record.update(turn)
        records.append(record)

df_conversation = pd.DataFrame(records)
df_conversation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1667 entries, 0 to 1666
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   URL_pr              1667 non-null   object
 1   RepoName            1667 non-null   object
 2   URL_chatgptsharing  1667 non-null   object
 3   Prompt              1667 non-null   object
 4   Answer              1667 non-null   object
 5   ListOfCode          1667 non-null   object
dtypes: object(6)
memory usage: 78.3+ KB


## Clean Up
Four tables remain: `df` (the `[target]` table), `df_chatgpt_sharing`, `df_mention`, and `df_conversation`.

In [9]:
# Remove redundant columns: the nested values were unpacked into their own
# tables in the cells above, so the parent frames no longer need them.
df = df.drop("ChatgptSharing", axis="columns")
df_chatgpt_sharing = df_chatgpt_sharing.drop(
    ["Mention", "Conversations"], axis="columns"
)

## Save to file

In [10]:
# Make sure the output folder exists before writing: to_csv does not create
# missing parent directories. pandas expands "~" itself when it opens the
# target file, but os.makedirs does not, hence the explicit expanduser.
cleaned_dir = os.path.join(dir_path, "cleaned")
os.makedirs(os.path.expanduser(cleaned_dir), exist_ok=True)

# df dataframe contains the initial dataset
df.to_csv(
    os.path.join(cleaned_dir, target + ".csv")
    )

# df_chatgpt_sharing dataframe contains chatgptsharing content in target table.
# It can combine with other tables using URL_'target' and RepoName.
df_chatgpt_sharing.to_csv(
    os.path.join(cleaned_dir, target + "_chatgpt_sharing.csv")
    )

# df_mention dataframe contains mention content in target table.
# It can combine with other tables using URL_'target' and RepoName; add
# URL_chatgptsharing to the key to line rows up per shared link.
df_mention.to_csv(
    os.path.join(cleaned_dir, target + "_mention.csv")
    )

# df_conversation dataframe contains conversation content in target table.
# It can combine with other tables using URL_'target' and RepoName; add
# URL_chatgptsharing to the key to line rows up per shared link.
df_conversation.to_csv(
    os.path.join(cleaned_dir, target + "_conversation.csv")
    )

## Combine to a big dataframe

`df_total` is the dataframe that contains all the information of the target table,
combined with the `chatgptsharing`, `mention`, and `conversation` tables.

In [11]:
# Merge ChatGptSharing table to target table.
df_total = pd.merge(
    df, df_chatgpt_sharing,
    left_on=[URL_rename, "RepoName"], 
    right_on=[URL_rename, "RepoName"], 
    how="left"
)

In [12]:
# Merge Mention table to target table.
df_total = pd.merge(
    df_total, df_mention,
    left_on=[URL_rename, "RepoName"], 
    right_on=[URL_rename, "RepoName"], 
    how="left"
)

In [13]:
# Merge Conversation table to target table.
df_total = pd.merge(
    df_total, df_conversation,
    left_on=[URL_rename, "RepoName"], 
    right_on=[URL_rename, "RepoName"], 
    how="left"
)

In [14]:
# Show the column labels of the merged frame. Labels ending in _x / _y are the
# default suffixes pandas adds when both sides of a merge carry a non-key
# column with the same name.
df_total.columns

Index(['Type', 'URL_pr', 'Author', 'RepoName', 'RepoLanguage', 'Number',
       'Title_x', 'Body', 'CreatedAt', 'ClosedAt', 'MergedAt', 'UpdatedAt',
       'State', 'Additions', 'Deletions', 'ChangedFiles', 'CommitsTotalCount',
       'CommitShas', 'CommitSha', 'source_date', 'URL_chatgptsharing_x',
       'Status', 'DateOfConversation', 'DateOfAccess', 'Title_y',
       'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers', 'Model',
       'HTMLContent', 'URL_chatgptsharing_y', 'MentionedURL',
       'MentionedProperty', 'MentionedAuthor', 'MentionedText',
       'MentionedPath', 'URL_chatgptsharing', 'Prompt', 'Answer',
       'ListOfCode'],
      dtype='object')

In [15]:
# Print row count, per-column non-null counts and dtypes of the merged frame.
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28348 entries, 0 to 28347
Data columns (total 40 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Type                  28348 non-null  object 
 1   URL_pr                28348 non-null  object 
 2   Author                28348 non-null  object 
 3   RepoName              28348 non-null  object 
 4   RepoLanguage          28252 non-null  object 
 5   Number                28348 non-null  int64  
 6   Title_x               28348 non-null  object 
 7   Body                  28348 non-null  object 
 8   CreatedAt             28348 non-null  object 
 9   ClosedAt              27642 non-null  object 
 10  MergedAt              23310 non-null  object 
 11  UpdatedAt             28348 non-null  object 
 12  State                 28348 non-null  object 
 13  Additions             28348 non-null  int64  
 14  Deletions             28348 non-null  int64  
 15  ChangedFiles       

In [16]:
# Write the fully merged table next to the other cleaned CSV files.

total_csv_path = os.path.join(dir_path, "cleaned", target + "_total.csv")
df_total.to_csv(total_csv_path)