### Importing Libraries and Data, and Initializing Functions

In [1]:
import os

import altair as alt
import numpy as np
import pandas as pd
import spacy
from langdetect import detect

In [2]:
# Load the spaCy pipeline once at module level; count_tokens below reuses it
# for every call instead of reloading the model.
nlp = spacy.load("en_core_web_sm")

def is_english(row):
    """Return True when both the 'Prompt' and 'Answer' texts of `row` are detected as English.

    Args:
        row: Mapping-like record (for example a DataFrame row) that is
            indexed with the keys 'Prompt' and 'Answer'.

    Returns:
        bool: True only if `detect` labels both texts as 'en'. Any failure
        while reading or detecting (missing key, non-text value, text the
        detector cannot classify) yields False.
    """
    # NOTE(review): langdetect results can vary between runs unless
    # DetectorFactory.seed is fixed -- confirm whether a seed is wanted.
    try:
        prompt, answer = row['Prompt'], row['Answer']
        # Non-string cells (e.g. NaN for a missing value) cannot be English text.
        if not isinstance(prompt, str) or not isinstance(answer, str):
            return False
        return detect(prompt) == 'en' and detect(answer) == 'en'
    except Exception:
        # Best-effort filter: rows that cannot be classified count as
        # non-English. Unlike a bare `except:`, this lets KeyboardInterrupt
        # and SystemExit propagate so a long run can still be stopped.
        return False

def count_tokens(sentence):
    """Return the number of tokens the shared `nlp` pipeline produces for `sentence`.

    Args:
        sentence: Text to run through `nlp`.

    Returns:
        int: Length of the object returned by `nlp(sentence)`, i.e. its
        token count.
    """
    # The parsed document supports len() directly, so there is no need to
    # materialize a list of tokens just to count them.
    return len(nlp(sentence))

In [3]:
# Path of the discussion CSV. The original machine-specific location stays as
# the default; set the DEVGPT_DISCUSSION_CSV environment variable to load the
# file from somewhere else without editing the notebook.
DISCUSSION_CSV = os.environ.get(
    'DEVGPT_DISCUSSION_CSV',
    '/Users/red/Desktop/CLONERepo/DevGPT/cleaned/discussion_total.csv',
)
discussion = pd.read_csv(DISCUSSION_CSV)

### Data Overview

In [4]:
# Preview the first row of the loaded frame.
discussion.head(1)

Unnamed: 0,Type,URL_discussion,Author,RepoName,RepoLanguage,Number,Title_x,Body,CreatedAt,ClosedAt,...,MentionedURL,MentionedProperty,MentionedAuthor,MentionedText,MentionedIsAnswer,MentionedUpvoteCount,URL_chatgptsharing,Prompt,Answer,ListOfCode
0,discussion,https://github.com/orgs/deep-foundation/discus...,FreePhoenix888,deep-foundation/Discussions,,27,Should we worry about imports perfomance in ha...,My dialogue with chatgpt about it: https://cha...,2023-07-11T11:31:00Z,,...,https://github.com/deep-foundation/Discussions...,comments.body,Konard,https://chat.openai.com/share/1e0f86ff-2094-44...,False,1.0,https://chat.openai.com/share/1e0f86ff-2094-44...,Can I always use await import instead of plain...,The use of await import(...) and import ... in...,[]


In [5]:
# List every column name available in the loaded frame.
discussion.columns

Index(['Type', 'URL_discussion', 'Author', 'RepoName', 'RepoLanguage',
       'Number', 'Title_x', 'Body', 'CreatedAt', 'ClosedAt', 'UpdatedAt',
       'Closed', 'UpvoteCount', 'source_date', 'URL_chatgptsharing_x',
       'Status', 'DateOfConversation', 'DateOfAccess', 'Title_y',
       'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers', 'Model',
       'HTMLContent', 'URL_chatgptsharing_y', 'MentionedURL',
       'MentionedProperty', 'MentionedAuthor', 'MentionedText',
       'MentionedIsAnswer', 'MentionedUpvoteCount', 'URL_chatgptsharing',
       'Prompt', 'Answer', 'ListOfCode'],
      dtype='object')

### Filtering and preprocessing the data

In [6]:
# Keep only rows from Python repositories whose prompt and answer are both
# detected as English.
python_discussion = discussion.query("RepoLanguage == 'Python'")
english_mask = python_discussion.apply(is_english, axis=1)
# .copy() detaches the result from `discussion`, so adding columns to it later
# does not raise SettingWithCopyWarning.
filtered_discussion = python_discussion[english_mask].copy()

In [7]:
# Summarize row count, dtypes and non-null counts of the filtered frame.
filtered_discussion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 16 to 312
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Type                  61 non-null     object 
 1   URL_discussion        61 non-null     object 
 2   Author                61 non-null     object 
 3   RepoName              61 non-null     object 
 4   RepoLanguage          61 non-null     object 
 5   Number                61 non-null     int64  
 6   Title_x               61 non-null     object 
 7   Body                  61 non-null     object 
 8   CreatedAt             61 non-null     object 
 9   ClosedAt              0 non-null      object 
 10  UpdatedAt             61 non-null     object 
 11  Closed                61 non-null     bool   
 12  UpvoteCount           61 non-null     int64  
 13  source_date           61 non-null     int64  
 14  URL_chatgptsharing_x  61 non-null     object 
 15  Status                6

In [8]:
# Add per-row token counts for the prompt and the answer. `assign` returns a
# new frame instead of writing columns into a filtered slice, which avoids
# SettingWithCopyWarning; re-running the cell simply recomputes both columns.
filtered_discussion = filtered_discussion.assign(
    Prompt_tokens=filtered_discussion['Prompt'].apply(count_tokens),
    Answer_tokens=filtered_discussion['Answer'].apply(count_tokens),
)

In [9]:
# Preview one row of the filtered frame, including the token-count columns.
filtered_discussion.head(1)

Unnamed: 0,Type,URL_discussion,Author,RepoName,RepoLanguage,Number,Title_x,Body,CreatedAt,ClosedAt,...,MentionedAuthor,MentionedText,MentionedIsAnswer,MentionedUpvoteCount,URL_chatgptsharing,Prompt,Answer,ListOfCode,Prompt_tokens,Answer_tokens
16,discussion,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997,dtch1997/gpt-text-gym,Python,7,GPT decomposing missions using functions,Message 1: Environment description\r\nMessage ...,2023-06-26T19:30:34Z,,...,dtch1997,Message 1: Environment description\r\nMessage ...,False,1.0,https://chat.openai.com/share/1ee48447-8296-4a...,You are an agent in a gridworld.\nThe environm...,"Yes, I understand the rules and structure of t...",[],601,13


In [10]:
# Report how many rows survived filtering and how many distinct discussion
# URLs those rows belong to.
row_count = len(filtered_discussion)
url_count = len(filtered_discussion['URL_discussion'].unique())
print("filtered discussion data has")
print(f"{row_count} rows")
print(f"{url_count} unique URLs")

filtered discussion data has
61 rows
8 unique URLs


### Data analysis and visualization

#### Average Length of Tokens for Prompts and Answers per Issue

In [46]:
# Sum the token counts over all rows of each discussion, giving one row per
# (URL_discussion, RepoName) pair with the keys restored as ordinary columns.
issue_keys = ['URL_discussion', 'RepoName']
issue_groups = filtered_discussion.groupby(issue_keys)
total_length_prompt_per_issue = issue_groups['Prompt_tokens'].sum().reset_index()
total_length_answer_per_issue = issue_groups['Answer_tokens'].sum().reset_index()

# Mean of the per-issue totals, i.e. the average number of tokens per issue.
average_length_prompt = total_length_prompt_per_issue['Prompt_tokens'].mean()
average_length_answer = total_length_answer_per_issue['Answer_tokens'].mean()

print(f"average length of prompt: {round(average_length_prompt, 2)}")
print(f"average length of answer: {round(average_length_answer, 2)}")

average length of prompt: 1573.62
average length of answer: 1230.62


In [48]:
# Horizontal bar charts of the per-issue token totals, coloured by repository
# and ordered by descending bar length.
prompt_length_per_issue_chart = (
    alt.Chart(total_length_prompt_per_issue)
    .mark_bar()
    .encode(
        x=alt.X('Prompt_tokens:Q'),
        y=alt.Y('URL_discussion', sort='-x', title='Issue URL'),
        color=alt.Color('RepoName', title='Repository'),
    )
    .properties(title='Prompt length per issue')
)
answer_length_per_issue_chart = (
    alt.Chart(total_length_answer_per_issue)
    .mark_bar()
    .encode(
        x=alt.X('Answer_tokens:Q'),
        y=alt.Y('URL_discussion', sort='-x', title='Issue URL'),
        color=alt.Color('RepoName', title='Repository'),
    )
    .properties(title='Answer length per issue')
)

# `&` concatenates vertically: prompt chart on top, answer chart below.
prompt_length_per_issue_chart & answer_length_per_issue_chart

#### Average Length of Tokens for Prompts and Answers per Repository

In [50]:
# Per-repository view of the token totals. The input frames hold one row per
# issue, so the totals are summed explicitly per repository; without an
# aggregate, a repository with several issues gets several bars drawn on the
# same row.
# NOTE(review): sum mirrors the per-issue totals used above; switch to
# mean(...) if the per-repository average is what should be shown.
# The *_per_issue_chart names are kept from the original cell so any later
# reference still resolves; they hold the per-repository charts from here on.
prompt_length_per_issue_chart = alt.Chart(total_length_prompt_per_issue).mark_bar().encode(
    alt.X('sum(Prompt_tokens):Q', title='Total prompt tokens'),
    alt.Y('RepoName', sort='-x', title='Repository'),
).properties(
    title='Prompt length per repository'
)
answer_length_per_issue_chart = alt.Chart(total_length_answer_per_issue).mark_bar().encode(
    alt.X('sum(Answer_tokens):Q', title='Total answer tokens'),
    alt.Y('RepoName', sort='-x', title='Repository'),
).properties(
    title='Answer length per repository'
)

prompt_length_per_issue_chart & answer_length_per_issue_chart