# Data Cleaning

In [1]:
import pandas as pd
import helper
import os

In [16]:
# Set up file paths and target tables.

# Snapshot folder names (joined onto `dir_path`) that will be read.
snapshots = ["snapshot_20230727"]

# Root folder of the DevGPT data. Set the DEVGPT_DATA_DIR environment variable
# to run the notebook on another machine; when it is unset the original local
# path is used, so existing runs behave exactly as before.
dir_path = os.environ.get(
    "DEVGPT_DATA_DIR",
    "/Users/teng/UBCO/mds_labs/block4/542/data/DevGPT/",
)

# Options are "hn", "pr", "issue", "discussion", "commit", "file"
TARGET_OPTIONS = ("hn", "pr", "issue", "discussion", "commit", "file")
target = "discussion"
if target not in TARGET_OPTIONS:
    # Fail here with a clear message instead of later, when no file or column
    # matches a misspelled target.
    raise ValueError(
        "Unknown target " + repr(target) + "; expected one of " + repr(TARGET_OPTIONS)
    )

# The generic "URL" column is renamed to this so every table carries a
# target-specific key name.
URL_rename = "URL_" + target

# Key columns identifying one row of the target table. "hn" is keyed by URL
# only (presumably hn records have no RepoName -- confirm against the data).
pk = [URL_rename] if target == "hn" else [URL_rename, "RepoName"]

In [17]:
# Gather the JSON file paths of every snapshot for the selected target.
file_paths = [
    json_path
    for snapshot_name in snapshots
    for json_path in helper.read_filepaths(
        os.path.join(dir_path, snapshot_name), target
    )
]

In [18]:
# Load the JSON files into one dataframe and, in the same step,
# rename URL to URL_[target].
df = helper.load_dataframes(file_paths).rename(columns={"URL": URL_rename})

# Print the column overview of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            32 non-null     object
 1   URL_discussion  32 non-null     object
 2   Author          32 non-null     object
 3   RepoName        32 non-null     object
 4   RepoLanguage    27 non-null     object
 5   Number          32 non-null     int64 
 6   Title           32 non-null     object
 7   Body            32 non-null     object
 8   CreatedAt       32 non-null     object
 9   ClosedAt        4 non-null      object
 10  UpdatedAt       32 non-null     object
 11  Closed          32 non-null     bool  
 12  UpvoteCount     32 non-null     int64 
 13  ChatgptSharing  32 non-null     object
 14  source_date     32 non-null     object
dtypes: bool(1), int64(2), object(12)
memory usage: 3.7+ KB


## ChatGPTSharing Table

In [19]:
# Build the ChatgptSharing table: one row per entry of each target row's
# "ChatgptSharing" list, tagged with the key columns of the row it came from.

# Rebuild the column list from scratch instead of appending to whatever `pk`
# currently holds; appending made a second run of this cell select
# "ChatgptSharing" twice. The resulting `pk` is the same as on a first run.
pk = [URL_rename] if target == "hn" else [URL_rename, "RepoName"]
pk = pk + ["ChatgptSharing"]

records = []

for idx, row in df[pk].iterrows():
    sharings = row["ChatgptSharing"]

    # Skip rows whose value is not a list (e.g. missing), matching the guards
    # used when the Mention and Conversation tables are built.
    if not isinstance(sharings, list):
        continue

    for item in sharings:
        obs = {URL_rename: row[URL_rename]}

        if target != "hn":
            obs["RepoName"] = row["RepoName"]

        # Flatten the sharing entry's own fields into the record.
        obs.update(item)
        records.append(obs)

# Rename URL to URL_chatgptsharing.
df_chatgpt_sharing = pd.DataFrame(records).rename(
    columns={"URL": "URL_chatgptsharing"}
)
df_chatgpt_sharing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   URL_discussion      38 non-null     object 
 1   RepoName            38 non-null     object 
 2   URL_chatgptsharing  38 non-null     object 
 3   Mention             38 non-null     object 
 4   Status              38 non-null     int64  
 5   DateOfConversation  33 non-null     object 
 6   DateOfAccess        33 non-null     object 
 7   Title               33 non-null     object 
 8   NumberOfPrompts     33 non-null     float64
 9   TokensOfPrompts     33 non-null     float64
 10  TokensOfAnswers     33 non-null     float64
 11  Model               33 non-null     object 
 12  Conversations       33 non-null     object 
 13  HTMLContent         33 non-null     object 
dtypes: float64(3), int64(1), object(10)
memory usage: 4.3+ KB


## Mention table

`MentionedURL` is identical to `URL_[target]`.

In [20]:
# Build the Mention table: one row per sharing row whose "Mention" value is a
# dict, keyed by the target keys plus the sharing URL.

# Rebuild the column list from scratch. The previous `pk.pop()` followed by
# `pk += [...]` was only correct on the first in-order run; re-running the
# cell removed "Mention" and then appended both columns a second time.
pk = [URL_rename] if target == "hn" else [URL_rename, "RepoName"]
pk = pk + ["URL_chatgptsharing", "Mention"]

records = []

for idx, row in df_chatgpt_sharing[pk].iterrows():
    mention = row["Mention"]

    # Rows without a dict-valued Mention contribute nothing.
    if not isinstance(mention, dict):
        continue

    obs = {URL_rename: row[URL_rename]}

    if target != "hn":
        obs["RepoName"] = row["RepoName"]

    obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
    # Flatten the mention's own fields into the record.
    obs.update(mention)
    records.append(obs)

df_mention = pd.DataFrame(records)
df_mention.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   URL_discussion        38 non-null     object 
 1   RepoName              38 non-null     object 
 2   URL_chatgptsharing    38 non-null     object 
 3   MentionedURL          38 non-null     object 
 4   MentionedProperty     38 non-null     object 
 5   MentionedAuthor       38 non-null     object 
 6   MentionedText         38 non-null     object 
 7   MentionedIsAnswer     19 non-null     object 
 8   MentionedUpvoteCount  19 non-null     float64
dtypes: float64(1), object(8)
memory usage: 2.8+ KB


## Conversation Table

In [21]:
# Build the Conversation table: one row per entry of each sharing row's
# "Conversations" list, keyed by the target keys plus the sharing URL.

# Rebuild the column list from scratch so that re-running this cell does not
# keep appending "Conversations" to `pk`. The resulting list is the same one a
# first in-order run produced.
pk = [URL_rename] if target == "hn" else [URL_rename, "RepoName"]
pk = pk + ["URL_chatgptsharing", "Mention", "Conversations"]

records = []

for idx, row in df_chatgpt_sharing[pk].iterrows():
    conversations = row["Conversations"]

    # Sharing rows without a list of conversations contribute nothing.
    if not isinstance(conversations, list):
        continue

    for item in conversations:
        obs = {URL_rename: row[URL_rename]}

        if target != "hn":
            obs["RepoName"] = row["RepoName"]

        obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
        # Flatten the conversation entry's own fields into the record.
        obs.update(item)
        records.append(obs)

df_conversation = pd.DataFrame(records)
df_conversation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   URL_discussion      174 non-null    object
 1   RepoName            174 non-null    object
 2   URL_chatgptsharing  174 non-null    object
 3   Prompt              174 non-null    object
 4   Answer              174 non-null    object
 5   ListOfCode          174 non-null    object
dtypes: object(6)
memory usage: 8.3+ KB


## Clean Up
4 tables: `df_[target]`, `df_chatgpt_sharing`, `df_mention`, `df_conversation`

In [22]:
# Remove redundant columns
# The nested columns have been expanded into their own tables above.
# errors="ignore" keeps this cell re-runnable: `df` and `df_chatgpt_sharing`
# are overwritten here, so a second run would otherwise raise KeyError for
# columns that are already gone.
df = df.drop(columns="ChatgptSharing", errors="ignore")
df_chatgpt_sharing = df_chatgpt_sharing.drop(
    columns=["Mention", "Conversations"], errors="ignore"
)

## Save to file

In [23]:
# Write the four cleaned tables as CSV files under <dir_path>/cleaned.
cleaned_dir = os.path.join(dir_path, "cleaned")

# `to_csv` does not create missing folders, so make sure the output folder
# exists before the first write.
os.makedirs(cleaned_dir, exist_ok=True)

# df dataframe contains the initial dataset
df.to_csv(os.path.join(cleaned_dir, target + ".csv"))

# df_chatgpt_sharing dataframe contains chatgptsharing content in target table.
# It can combine with other tables using URL_'target' and RepoName.
df_chatgpt_sharing.to_csv(
    os.path.join(cleaned_dir, target + "_chatgpt_sharing.csv")
)

# df_mention dataframe contains mention content in target table.
# It can combine with other tables using URL_'target' and RepoName.
df_mention.to_csv(os.path.join(cleaned_dir, target + "_mention.csv"))

# df_conversation dataframe contains conversation content in target table.
# It can combine with other tables using URL_'target' and RepoName.
df_conversation.to_csv(
    os.path.join(cleaned_dir, target + "_conversation.csv")
)

## Combine to a big dataframe

`df_total` is the dataframe that contains all the information of the target table.
It combines the target table with the `chatgptsharing`, `mention`, and `conversation` tables.

In [24]:
# Merge ChatGptSharing table to target table.
# Join keys: the target URL alone for "hn", the target URL plus RepoName for
# every other target.
merge_on = [URL_rename] if target == "hn" else [URL_rename, "RepoName"]

# Left join, so every row of the target table is kept.
df_total = df.merge(df_chatgpt_sharing, how="left", on=merge_on)

In [25]:
# Merge Mention table to target table.
df_total = pd.merge(
    df_total, df_mention,
    left_on=merge_on, 
    right_on=merge_on, 
    how="left"
)

In [26]:
# Merge Conversation table to target table.
df_total = pd.merge(
    df_total, df_conversation,
    left_on=merge_on, 
    right_on=merge_on, 
    how="left"
)

In [27]:
# List the column names of the merged table. Names ending in `_x` / `_y` are
# the default suffixes pandas adds when both merged frames contain a non-key
# column with the same name.
df_total.columns

Index(['Type', 'URL_discussion', 'Author', 'RepoName', 'RepoLanguage',
       'Number', 'Title_x', 'Body', 'CreatedAt', 'ClosedAt', 'UpdatedAt',
       'Closed', 'UpvoteCount', 'source_date', 'URL_chatgptsharing_x',
       'Status', 'DateOfConversation', 'DateOfAccess', 'Title_y',
       'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers', 'Model',
       'HTMLContent', 'URL_chatgptsharing_y', 'MentionedURL',
       'MentionedProperty', 'MentionedAuthor', 'MentionedText',
       'MentionedIsAnswer', 'MentionedUpvoteCount', 'URL_chatgptsharing',
       'Prompt', 'Answer', 'ListOfCode'],
      dtype='object')

In [28]:
# Print the row count, dtypes and non-null counts of the merged table.
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Type                  317 non-null    object 
 1   URL_discussion        317 non-null    object 
 2   Author                317 non-null    object 
 3   RepoName              317 non-null    object 
 4   RepoLanguage          292 non-null    object 
 5   Number                317 non-null    int64  
 6   Title_x               317 non-null    object 
 7   Body                  317 non-null    object 
 8   CreatedAt             317 non-null    object 
 9   ClosedAt              4 non-null      object 
 10  UpdatedAt             317 non-null    object 
 11  Closed                317 non-null    bool   
 12  UpvoteCount           317 non-null    int64  
 13  source_date           317 non-null    object 
 14  URL_chatgptsharing_x  317 non-null    object 
 15  Status                3

In [29]:
# Save to file.

# `to_csv` does not create missing folders, so make sure the output folder
# exists before writing the merged table.
cleaned_dir = os.path.join(dir_path, "cleaned")
os.makedirs(cleaned_dir, exist_ok=True)

df_total.to_csv(os.path.join(cleaned_dir, target + "_total.csv"))