### Importing Libraries and Data and Initialize Functions

In [1]:
import altair as alt
import pandas as pd
import numpy as np
from langdetect import detect
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

def is_english(row):
    try:
        return detect(row['Prompt']) == 'en' and detect(row['Answer']) == 'en'
    except:
        return False

def count_tokens(sentence):
    doc = nlp(sentence)
    return len([token for token in doc])

In [3]:
discussion = pd.read_csv('/Users/red/Desktop/CLONERepo/DevGPT/cleaned/discussion_total.csv')

### Data Overview

In [4]:
discussion.head(1)

Unnamed: 0,Type,URL_discussion,Author,RepoName,RepoLanguage,Number,Title_x,Body,CreatedAt,ClosedAt,...,MentionedURL,MentionedProperty,MentionedAuthor,MentionedText,MentionedIsAnswer,MentionedUpvoteCount,URL_chatgptsharing,Prompt,Answer,ListOfCode
0,discussion,https://github.com/orgs/deep-foundation/discus...,FreePhoenix888,deep-foundation/Discussions,,27,Should we worry about imports perfomance in ha...,My dialogue with chatgpt about it: https://cha...,2023-07-11T11:31:00Z,,...,https://github.com/deep-foundation/Discussions...,comments.body,Konard,https://chat.openai.com/share/1e0f86ff-2094-44...,False,1.0,https://chat.openai.com/share/1e0f86ff-2094-44...,Can I always use await import instead of plain...,The use of await import(...) and import ... in...,[]


In [5]:
discussion.columns

Index(['Type', 'URL_discussion', 'Author', 'RepoName', 'RepoLanguage',
       'Number', 'Title_x', 'Body', 'CreatedAt', 'ClosedAt', 'UpdatedAt',
       'Closed', 'UpvoteCount', 'source_date', 'URL_chatgptsharing_x',
       'Status', 'DateOfConversation', 'DateOfAccess', 'Title_y',
       'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers', 'Model',
       'HTMLContent', 'URL_chatgptsharing_y', 'MentionedURL',
       'MentionedProperty', 'MentionedAuthor', 'MentionedText',
       'MentionedIsAnswer', 'MentionedUpvoteCount', 'URL_chatgptsharing',
       'Prompt', 'Answer', 'ListOfCode'],
      dtype='object')

### Filtering and preprocessing the data

In [6]:
filtered_discussion = discussion.query("RepoLanguage == 'Python'")
filtered_discussion = filtered_discussion[filtered_discussion.apply(is_english, axis=1)]

In [7]:
filtered_discussion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 16 to 312
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Type                  60 non-null     object 
 1   URL_discussion        60 non-null     object 
 2   Author                60 non-null     object 
 3   RepoName              60 non-null     object 
 4   RepoLanguage          60 non-null     object 
 5   Number                60 non-null     int64  
 6   Title_x               60 non-null     object 
 7   Body                  60 non-null     object 
 8   CreatedAt             60 non-null     object 
 9   ClosedAt              0 non-null      object 
 10  UpdatedAt             60 non-null     object 
 11  Closed                60 non-null     bool   
 12  UpvoteCount           60 non-null     int64  
 13  source_date           60 non-null     int64  
 14  URL_chatgptsharing_x  60 non-null     object 
 15  Status                6

In [8]:
filtered_discussion['Prompt_tokens'] = filtered_discussion['Prompt'].apply(count_tokens)
filtered_discussion['Answer_tokens'] = filtered_discussion['Answer'].apply(count_tokens)

In [9]:
filtered_discussion.head(1)

Unnamed: 0,Type,URL_discussion,Author,RepoName,RepoLanguage,Number,Title_x,Body,CreatedAt,ClosedAt,...,MentionedAuthor,MentionedText,MentionedIsAnswer,MentionedUpvoteCount,URL_chatgptsharing,Prompt,Answer,ListOfCode,Prompt_tokens,Answer_tokens
16,discussion,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997,dtch1997/gpt-text-gym,Python,7,GPT decomposing missions using functions,Message 1: Environment description\r\nMessage ...,2023-06-26T19:30:34Z,,...,dtch1997,Message 1: Environment description\r\nMessage ...,False,1.0,https://chat.openai.com/share/1ee48447-8296-4a...,You are an agent in a gridworld.\nThe environm...,"Yes, I understand the rules and structure of t...",[],601,13


In [10]:
print("filtered discussion data has")
print(str(len(filtered_discussion)) + " rows")
print(str(len(filtered_discussion['URL_discussion'].unique())) + " unique URLs")

filtered discussion data has
60 rows
8 unique URLs


In [20]:
filtered_discussion = filtered_discussion[['URL_discussion', 'RepoName', 'CreatedAt', 'ClosedAt', 'UpdatedAt', 'Closed']]
filtered_discussion.query('Closed == True')

Unnamed: 0,URL_discussion,RepoName,CreatedAt,ClosedAt,UpdatedAt,Closed
16,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
17,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
18,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
19,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
20,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
21,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
22,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
23,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
24,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False
25,https://github.com/dtch1997/gpt-text-gym/discu...,dtch1997/gpt-text-gym,2023-06-26T19:30:34Z,,2023-06-26T19:36:50Z,False


In [24]:
filtered_discussion.query('Closed == True')

Unnamed: 0,URL_discussion,RepoName,CreatedAt,ClosedAt,UpdatedAt,Closed


In [23]:
filtered_discussion.dropna(subset=['ClosedAt'])

Unnamed: 0,URL_discussion,RepoName,CreatedAt,ClosedAt,UpdatedAt,Closed


### Data analysis and visualization

In [11]:
interactions_per_issue = filtered_discussion['URL_discussion'].value_counts()
avergae_interactions_per_issue = interactions_per_issue.mean()

print("average interactions per issue: " + str(avergae_interactions_per_issue))

average interactions per issue: 7.5
