# Pull Request Table - Data Cleaning

In [1]:
import pandas as pd
import helper
import os

In [2]:
# Read all json file paths.
snapshots = [
    "snapshot_20230727", "snapshot_20230803", "snapshot_20230810", 
    "snapshot_20230817", "snapshot_20230824", "snapshot_20230831"
    ]
dir_path = "/Users/teng/UBCO/mds_labs/block4/542/data/DevGPT/"

file_paths = []
for snapshot in snapshots:
    file_path = os.path.join(dir_path, snapshot)
    file_paths += helper.read_filepaths(file_path, "pr")

In [3]:
# Load Json files.
df = helper.load_dataframes(file_paths)

# Rename URL to URL_pr.
df.rename(columns={"URL": "URL_pr"}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 192
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Type               1008 non-null   object
 1   URL_pr             1008 non-null   object
 2   Author             1008 non-null   object
 3   RepoName           1008 non-null   object
 4   RepoLanguage       990 non-null    object
 5   Number             1008 non-null   int64 
 6   Title              1008 non-null   object
 7   Body               1008 non-null   object
 8   CreatedAt          1008 non-null   object
 9   ClosedAt           902 non-null    object
 10  MergedAt           765 non-null    object
 11  UpdatedAt          1008 non-null   object
 12  State              1008 non-null   object
 13  Additions          1008 non-null   int64 
 14  Deletions          1008 non-null   int64 
 15  ChangedFiles       1008 non-null   int64 
 16  CommitsTotalCount  1008 non-null   int64 
 17  C

## ChatGPTSharing Table

In [5]:
# chatgpt_sharing_columns = [
#     "URL", "Mention", "Status", "DateOfConversation", "DateOfAccess", 
#     "NumberOfPrompts", "TokensOfPrompts", "TokensOfAnswers", "Model", 
#     "Conversations", "HTMLContent"
#     ]

# Create ChatGPTSharing table.
# chatgpt_sharing_columns += ["URL_Pr", "RepoName"]
records = []

for idx, row in df[["URL_pr", "RepoName", "ChatgptSharing"]].iterrows():
    for item in row["ChatgptSharing"]:
        obs = {}
        obs["URL_pr"] = row["URL_pr"]
        obs["RepoName"] = row["RepoName"]
        obs.update(item)
        records.append(obs)

df_chatgpt_sharing = pd.DataFrame(records)

# Rename URL to URL_chatgptsharing.
df_chatgpt_sharing.rename(columns={"URL": "URL_chatgptsharing"}, inplace=True)
df_chatgpt_sharing.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1169 entries, 0 to 1168
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   URL_pr              1169 non-null   object 
 1   RepoName            1169 non-null   object 
 2   URL_chatgptsharing  1169 non-null   object 
 3   Mention             1169 non-null   object 
 4   Status              1169 non-null   int64  
 5   DateOfConversation  1090 non-null   object 
 6   DateOfAccess        1090 non-null   object 
 7   Title               1090 non-null   object 
 8   NumberOfPrompts     1090 non-null   float64
 9   TokensOfPrompts     1090 non-null   float64
 10  TokensOfAnswers     1090 non-null   float64
 11  Model               1090 non-null   object 
 12  Conversations       1090 non-null   object 
 13  HTMLContent         1090 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 128.0+ KB


In [6]:
df_chatgpt_sharing["Mention"][0]

{'MentionedURL': 'https://github.com/labdao/plex/pull/469',
 'MentionedProperty': 'body',
 'MentionedAuthor': 'AdamGoyer',
 'MentionedText': 'The Chatgpt Thread used to create this pull request:\r\nhttps://chat.openai.com/share/8bd33825-e8c6-447f-b86e-5827453f3448'}

## Mention table

`MentionedURL` is identical with `URL_pr`

In [7]:
# mention_columns = [
#     "URL_pr", "RepoName", "URL_chatgptsharing",
#     "MentionedURL", "MentionedProperty", "MentionedAuthor", "MentionedText", 
#     "MentionedPath", "MentionedIsAnswer", "MentionedUpvoteCount"
#     ]

records = []

for idx, row in df_chatgpt_sharing[
    ["URL_pr", "RepoName", "URL_chatgptsharing", "Mention"]
    ].iterrows():
    obs = {}
    obs["URL_pr"] = row["URL_pr"]
    obs["RepoName"] = row["RepoName"]
    obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
    obs.update(row["Mention"])
    records.append(obs)

df_mention = pd.DataFrame(records)
df_mention.head()


Unnamed: 0,URL_pr,RepoName,URL_chatgptsharing,MentionedURL,MentionedProperty,MentionedAuthor,MentionedText,MentionedPath
0,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,https://github.com/labdao/plex/pull/469,body,AdamGoyer,The Chatgpt Thread used to create this pull re...,
1,https://github.com/labdao/plex/pull/468,labdao/plex,https://chat.openai.com/share/2c4b0dba-429c-4c...,https://github.com/labdao/plex/pull/468,body,AdamGoyer,Link to the ChatGPT conversation used to creat...,
2,https://github.com/ActivityWatch/aw-webui/pull...,ActivityWatch/aw-webui,https://chat.openai.com/share/0c7588ee-b13b-41...,https://github.com/ActivityWatch/aw-webui/pull...,body,ErikBjare,Came up with this while thinking about the bug...,
3,https://github.com/open-learning-exchange/mypl...,open-learning-exchange/myplanet,https://chat.openai.com/share/be516fdf-e0e6-46...,https://github.com/open-learning-exchange/mypl...,body,Okuro3499,this pull request contains french translations...,
4,https://github.com/open-learning-exchange/mypl...,open-learning-exchange/myplanet,https://chat.openai.com/share/f194daa6-1c52-49...,https://github.com/open-learning-exchange/mypl...,body,Okuro3499,this pull request contains french translations...,


## Conversation Table

In [8]:
# conversation_columns = ["Prompt", "Answer", "ListOfCode"]

# Create Mention table.
# conversation_columns += ["URL_pr", "RepoName", "URL_chatgptsharing"]
records = []

for idx, row in df_chatgpt_sharing[
    ["URL_pr", "RepoName", "URL_chatgptsharing", "Conversations"]
    ].iterrows():
    for item in row["Conversations"]:
        obs = {}
        obs["URL_pr"] = row["URL_pr"]
        obs["RepoName"] = row["RepoName"]
        obs["URL_chatgptsharing"] = row["URL_chatgptsharing"]
        obs.update(item)
        records.append(obs)
    break

df_conversation = pd.DataFrame(records)
df_conversation.head()

Unnamed: 0,URL_pr,RepoName,URL_chatgptsharing,Prompt,Answer,ListOfCode
0,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Good evening Chatgpt,\nI'd like your help to w...",Thanks for sharing the README file for Open Ba...,[]
1,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Here is the PLEX readme, I think this will ans...",Thanks for sharing the README file for the PLE...,[]
2,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,Essencially PLEX loads an openbabel docker con...,"Based on the given information, it looks like ...","[{'ReplaceString': '[CODE_BLOCK_0]', 'Type': '..."
3,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Excellent work chat, I think we should also ex...",These JSON files are indeed configuration file...,[]
4,https://github.com/labdao/plex/pull/469,labdao/plex,https://chat.openai.com/share/8bd33825-e8c6-44...,"Noted,\nTake a look at this docker file, and t...",This Dockerfile is used to create a Docker ima...,[]


## Clean Up
4 tables: `df_pr`, `df_chatgpt_sharing`, `df_mention`, `df_conversation`

In [9]:
# Remove redundant columns
df = df.drop(columns="ChatgptSharing")
df_chatgpt_sharing = df_chatgpt_sharing.drop(columns=["Mention", "Conversations"])

## Save to file

In [10]:
df.to_csv(os.path.join(dir_path, "cleaned", "pr.csv"))
df_chatgpt_sharing.to_csv(os.path.join(dir_path, "cleaned", "pr_chatgpt_sharing.csv"))
df_mention.to_csv(os.path.join(dir_path, "cleaned", "pr_mention.csv"))
df_conversation.to_csv(os.path.join(dir_path, "cleaned", "pr_conversation.csv"))

# Combine to a big dataframe

In [11]:
# Merge ChatGptSharing table to PR table.
df_total = pd.merge(
    df, df_chatgpt_sharing,
    left_on=["URL_pr", "RepoName"], 
    right_on=["URL_pr", "RepoName"], 
    how="left"
)

In [12]:
# Merge Mention table to PR table.
df_total = pd.merge(
    df_total, df_mention,
    left_on=["URL_pr", "RepoName"], 
    right_on=["URL_pr", "RepoName"], 
    how="left"
)

In [13]:
# Merge Conversation table to PR table.
df_total = pd.merge(
    df_total, df_conversation,
    left_on=["URL_pr", "RepoName"], 
    right_on=["URL_pr", "RepoName"], 
    how="left"
)

In [14]:
df_total.columns

Index(['Type', 'URL_pr', 'Author', 'RepoName', 'RepoLanguage', 'Number',
       'Title_x', 'Body', 'CreatedAt', 'ClosedAt', 'MergedAt', 'UpdatedAt',
       'State', 'Additions', 'Deletions', 'ChangedFiles', 'CommitsTotalCount',
       'CommitShas', 'CommitSha', 'source_date', 'URL_chatgptsharing_x',
       'Status', 'DateOfConversation', 'DateOfAccess', 'Title_y',
       'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers', 'Model',
       'HTMLContent', 'URL_chatgptsharing_y', 'MentionedURL',
       'MentionedProperty', 'MentionedAuthor', 'MentionedText',
       'MentionedPath', 'URL_chatgptsharing', 'Prompt', 'Answer',
       'ListOfCode'],
      dtype='object')

In [15]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53415 entries, 0 to 53414
Data columns (total 40 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Type                  53415 non-null  object 
 1   URL_pr                53415 non-null  object 
 2   Author                53415 non-null  object 
 3   RepoName              53415 non-null  object 
 4   RepoLanguage          52767 non-null  object 
 5   Number                53415 non-null  int64  
 6   Title_x               53415 non-null  object 
 7   Body                  53415 non-null  object 
 8   CreatedAt             53415 non-null  object 
 9   ClosedAt              49042 non-null  object 
 10  MergedAt              41371 non-null  object 
 11  UpdatedAt             53415 non-null  object 
 12  State                 53415 non-null  object 
 13  Additions             53415 non-null  int64  
 14  Deletions             53415 non-null  int64  
 15  ChangedFiles       

In [16]:
# Save to file.
df_total.to_csv(os.path.join(dir_path, "cleaned", "pr_total.csv"))