# Pull Request Table - Data Cleaning

In [48]:
import json
import pandas as pd
import helper
import os

In [58]:
# Read all json file paths.
snapshots = [
    "snapshot_20230727", "snapshot_20230803", "snapshot_20230810", 
    "snapshot_20230817", "snapshot_20230824", "snapshot_20230831"
    ]
dir_path = "/Users/teng/UBCO/mds_labs/block4/542/data/DevGPT/"

file_paths = []
for snapshot in snapshots:
    file_path = os.path.join(dir_path, snapshot)
    file_paths += helper.read_filepaths(file_path, "pr")

In [63]:
# Load Json files.
df = helper.load_dataframes(file_paths)

# Rename URL to URL_pr.
df.rename(columns={"URL": "URL_pr"}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Type               147 non-null    object
 1   URL_pr             147 non-null    object
 2   Author             147 non-null    object
 3   RepoName           147 non-null    object
 4   RepoLanguage       144 non-null    object
 5   Number             147 non-null    int64 
 6   Title              147 non-null    object
 7   Body               147 non-null    object
 8   CreatedAt          147 non-null    object
 9   ClosedAt           130 non-null    object
 10  MergedAt           110 non-null    object
 11  UpdatedAt          147 non-null    object
 12  State              147 non-null    object
 13  Additions          147 non-null    int64 
 14  Deletions          147 non-null    int64 
 15  ChangedFiles       147 non-null    int64 
 16  CommitsTotalCount  147 non-null    int64 
 1

## ChatGPTSharing Table

In [None]:
# chatgpt_sharing_columns = [
#     "URL", "Mention", "Status", "DateOfConversation", "DateOfAccess", 
#     "NumberOfPrompts", "TokensOfPrompts", "TokensOfAnswers", "Model", 
#     "Conversations", "HTMLContent"
#     ]

# Create ChatGPTSharing table.
# chatgpt_sharing_columns += ["URL_Pr", "RepoName"]
records = []

for idx, row in df[["URL_pr", "RepoName", "ChatgptSharing"]].iterrows():
    for item in row["ChatgptSharing"]:
        obs = {}
        obs["URL_pr"] = row["URL_pr"]
        obs["RepoName"] = row["RepoName"]
        obs.update(item)
        records.append(obs)

df_chatgpt_sharing = pd.DataFrame(records)

# Rename URL to URL_chatgptsharing.
df_chatgpt_sharing.rename(columns={"URL": "URL_chatgptsharing"}, inplace=True)
df_chatgpt_sharing.info()


Unnamed: 0,URL_pr,RepoName,URL_chatgptsharing,Mention,Status,DateOfConversation,DateOfAccess,Title,NumberOfPrompts,TokensOfPrompts,TokensOfAnswers,Model,Conversations,HTMLContent
0,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,{'MentionedURL': 'https://github.com/labdao/pl...,200,"July 5, 2023",2023-07-27 13:08:47.672568,Open Babel on PLEX,6.0,2895.0,2311.0,Default,"[{'Prompt': 'Good evening Chatgpt, I'd like yo...","<!DOCTYPE html>\n<html><head><meta charset=""ut..."
1,https://github.com/labdao/plex/pull/468,labdao/plex,https://chat.openai.com/share/2c4b0dba-429c-4c...,{'MentionedURL': 'https://github.com/labdao/pl...,200,"July 5, 2023",2023-07-27 13:08:49.309610,Gnina Readme for PLEX,6.0,5757.0,1634.0,Default,"[{'Prompt': 'Good evening Chatgpt, I'd like yo...","<!DOCTYPE html>\n<html><head><meta charset=""ut..."
2,https://github.com/ActivityWatch/aw-webui/pull...,ActivityWatch/aw-webui,https://chat.openai.com/share/0c7588ee-b13b-41...,{'MentionedURL': 'https://github.com/ActivityW...,200,"June 22, 2023",2023-07-27 13:08:50.672270,ScreenTime Tracker Algorithm,4.0,463.0,2044.0,Default,[{'Prompt': 'I'm building an app that tracks s...,"<!DOCTYPE html>\n<html><head><meta charset=""ut..."
3,https://github.com/open-learning-exchange/mypl...,open-learning-exchange/myplanet,https://chat.openai.com/share/be516fdf-e0e6-46...,{'MentionedURL': 'https://github.com/open-lear...,200,"July 17, 2023",2023-07-27 13:08:51.530572,New chat,1.0,753.0,278.0,Default,"[{'Prompt': 'convert to french <string name=""s...","<!DOCTYPE html>\n<html><head><meta charset=""ut..."
4,https://github.com/open-learning-exchange/mypl...,open-learning-exchange/myplanet,https://chat.openai.com/share/f194daa6-1c52-49...,{'MentionedURL': 'https://github.com/open-lear...,200,"July 17, 2023",2023-07-27 13:08:53.783497,String to French,9.0,20014.0,3745.0,Default,"[{'Prompt': 'convert string to french', 'Answe...","<!DOCTYPE html>\n<html><head><meta charset=""ut..."


In [56]:
df_chatgpt_sharing["Mention"][0]

{'MentionedURL': 'https://github.com/labdao/plex/pull/469',
 'MentionedProperty': 'body',
 'MentionedAuthor': 'AdamGoyer',
 'MentionedText': 'The Chatgpt Thread used to create this pull request:\r\nhttps://chat.openai.com/share/8bd33825-e8c6-447f-b86e-5827453f3448'}

## Mention table

`MentionedURL` is identical with `URL_pr`

In [70]:
# mention_columns = [
#     "URL_pr", "RepoName", "URL_chatgptsharing",
#     "MentionedURL", "MentionedProperty", "MentionedAuthor", "MentionedText", 
#     "MentionedPath", "MentionedIsAnswer", "MentionedUpvoteCount"
#     ]

records = []

for idx, row in df_chatgpt_sharing[
    ["URL_pr", "RepoName", "URL_chatgptsharing", "Mention"]
    ].iterrows():
    obs = {}
    obs["URL_pr"] = row["URL_pr"]
    obs["RepoName"] = row["RepoName"]
    obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
    obs.update(row["Mention"])
    records.append(obs)

df_mention = pd.DataFrame(records)
df_mention.head()


Unnamed: 0,URL_pr,RepoName,URL_chatgptsharing,MentionedURL,MentionedProperty,MentionedAuthor,MentionedText,MentionedPath
0,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,https://github.com/labdao/plex/pull/469,body,AdamGoyer,The Chatgpt Thread used to create this pull re...,
1,https://github.com/labdao/plex/pull/468,labdao/plex,https://chat.openai.com/share/2c4b0dba-429c-4c...,https://github.com/labdao/plex/pull/468,body,AdamGoyer,Link to the ChatGPT conversation used to creat...,
2,https://github.com/ActivityWatch/aw-webui/pull...,ActivityWatch/aw-webui,https://chat.openai.com/share/0c7588ee-b13b-41...,https://github.com/ActivityWatch/aw-webui/pull...,body,ErikBjare,Came up with this while thinking about the bug...,
3,https://github.com/open-learning-exchange/mypl...,open-learning-exchange/myplanet,https://chat.openai.com/share/be516fdf-e0e6-46...,https://github.com/open-learning-exchange/mypl...,body,Okuro3499,this pull request contains french translations...,
4,https://github.com/open-learning-exchange/mypl...,open-learning-exchange/myplanet,https://chat.openai.com/share/f194daa6-1c52-49...,https://github.com/open-learning-exchange/mypl...,body,Okuro3499,this pull request contains french translations...,


## Conversation Table

In [71]:
conversation_columns = ["Prompt", "Answer", "ListOfCode"]

# Create Mention table.
conversation_columns += ["URL_pr", "RepoName", "URL_chatgptsharing"]
records = []

for idx, row in df_chatgpt_sharing[
    ["URL_pr", "RepoName", "URL_chatgptsharing", "Conversations"]
    ].iterrows():
    for item in row["Conversations"]:
        obs = {}
        obs["URL_pr"] = row["URL_pr"]
        obs["RepoName"] = row["RepoName"]
        obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
        obs.update(item)
        records.append(obs)
    break

df_conversation = pd.DataFrame(records)
df_conversation.head()

Unnamed: 0,URL_pr,RepoName,URL_chatgptsharing,Prompt,Answer,ListOfCode
0,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Good evening Chatgpt,\nI'd like your help to w...",Thanks for sharing the README file for Open Ba...,[]
1,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Here is the PLEX readme, I think this will ans...",Thanks for sharing the README file for the PLE...,[]
2,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,Essencially PLEX loads an openbabel docker con...,"Based on the given information, it looks like ...","[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."
3,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Excellent work chat, I think we should also ex...",These JSON files are indeed configuration file...,[]
4,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Noted,\nTake a look at this docker file, and t...",This Dockerfile is used to create a Docker ima...,[]


## Clean Up
4 tables: `df_pr`, `df_chatgpt_sharing`, `df_mention`, `df_conversation`

In [72]:
# Remove redundant columns
df = df.drop(columns="ChatgptSharing")
df_chatgpt_sharing = df_chatgpt_sharing.drop(columns=["Mention", "Conversations"])

## Save to file

In [73]:
df.to_csv(os.path.join(dir_path, "cleaned", "pr.csv"))
df_chatgpt_sharing.to_csv(os.path.join(dir_path, "cleaned", "pr_chatgpt_sharing.csv"))
df_mention.to_csv(os.path.join(dir_path, "cleaned", "pr_mention.csv"))
df_conversation.to_csv(os.path.join(dir_path, "cleaned", "pr_conversation.csv"))

# Combine to a big dataframe

In [80]:
# Merge ChatGptSharing table to PR table.
df_total = pd.merge(
    df, df_chatgpt_sharing,
    left_on=["URL_pr", "RepoName"], 
    right_on=["URL_pr", "RepoName"], 
    how="left"
)



In [83]:
# Merge Mention table to PR table.
df_total = pd.merge(
    df_total, df_mention,
    left_on=["URL_pr", "RepoName"], 
    right_on=["URL_pr", "RepoName"], 
    how="left"
)

In [86]:
# Merge Conversation table to PR table.
df_total = pd.merge(
    df_total, df_conversation,
    left_on=["URL_pr", "RepoName"], 
    right_on=["URL_pr", "RepoName"], 
    how="left"
)

In [87]:
df_total.columns

Index(['Type', 'URL_pr', 'Author', 'RepoName', 'RepoLanguage', 'Number',
       'Title_x', 'Body', 'CreatedAt', 'ClosedAt', 'MergedAt', 'UpdatedAt',
       'State', 'Additions', 'Deletions', 'ChangedFiles', 'CommitsTotalCount',
       'CommitShas', 'CommitSha', 'source_date', 'URL_chatgptsharing_x',
       'Status', 'DateOfConversation', 'DateOfAccess', 'Title_y',
       'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers', 'Model',
       'HTMLContent', 'URL_chatgptsharing_y', 'MentionedURL',
       'MentionedProperty', 'MentionedAuthor', 'MentionedText',
       'MentionedPath', 'URL_chatgptsharing', 'Prompt', 'Answer',
       'ListOfCode'],
      dtype='object')

In [90]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 40 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Type                  246 non-null    object 
 1   URL_pr                246 non-null    object 
 2   Author                246 non-null    object 
 3   RepoName              246 non-null    object 
 4   RepoLanguage          243 non-null    object 
 5   Number                246 non-null    int64  
 6   Title_x               246 non-null    object 
 7   Body                  246 non-null    object 
 8   CreatedAt             246 non-null    object 
 9   ClosedAt              221 non-null    object 
 10  MergedAt              187 non-null    object 
 11  UpdatedAt             246 non-null    object 
 12  State                 246 non-null    object 
 13  Additions             246 non-null    int64  
 14  Deletions             246 non-null    int64  
 15  ChangedFiles          2

In [91]:
# Save to file.
df_conversation.to_csv(os.path.join(dir_path, "cleaned", "pr_total.csv"))