# Extract all conversations

In [None]:
from conversation_extractor import ConversationExtractor
import os

# Run the conversation extractor over every file below ../data/raw that the
# extractor does not classify as a CSV file.
extractor = ConversationExtractor()

for root, _dirnames, filenames in os.walk("../data/raw"):
    for filename in filenames:
        # CSV files are skipped; everything else is extracted and saved.
        if not extractor.is_csv_file(filename):
            path = os.path.join(root, filename)
            print(f"Extracting conversations from {path}...")
            conversations_by_url = extractor.extract_conversations_by_url(
                path,
                print_process=False,
            )
            # The source path is handed to save_conversations together with
            # the extracted conversations.
            extractor.save_conversations(path, conversations_by_url)

# Filter conversations

In [None]:
from conversation_filter import ConversationFilter
import os

# Filter every extracted conversation file into five categories, merge the
# results across files, and save one output per category.
conversation_filter = ConversationFilter()
print_process = False

# One accumulator per category, merged across all input files.
conversations_with_code = {}
python_conversation = {}
js_conversation = {}
ts_conversation = {}
java_conversation = {}

# Output label -> (filter method, accumulator it feeds).
# Replaces five copy-pasted filter/merge/save sequences with a single table.
categories = {
    'with-code': (conversation_filter.get_conversations_with_code,
                  conversations_with_code),
    'python': (conversation_filter.get_python_conversations,
               python_conversation),
    'javascript': (conversation_filter.get_js_conversations,
                   js_conversation),
    'typescript': (conversation_filter.get_ts_conversations,
                   ts_conversation),
    'java': (conversation_filter.get_java_conversations,
             java_conversation),
}

# Read every file of the data/interim/conversations folder.
for subdir, dirs, files in os.walk("../data/interim/conversations"):
    # os.walk yields entries in arbitrary (filesystem) order. The accumulators
    # are merged with dict.update, so for a key present in several files the
    # last file visited wins; sorting makes that outcome reproducible.
    dirs.sort()
    for file in sorted(files):
        path = os.path.join(subdir, file)
        print(f"Filtering conversations from {path}...")
        conversations_by_url = conversation_filter.load_conversations(path)

        for select, accumulator in categories.values():
            accumulator.update(select(conversations_by_url, print_process))

# Save each merged category under its label (dicts preserve insertion order,
# so the save order matches the table above).
for label, (_, accumulator) in categories.items():
    conversation_filter.save_conversations(accumulator, label)

# Extract Code Snippets as Source Code files

In [2]:
from conversation_io import ConversationIO
from source_code_extractor import SourceCodeExtractor

# Load the filtered conversations for one language, extract their source
# codes, export them, then delete the files the extractor deems invalid.
conversation_io = ConversationIO()
source_code_extractor = SourceCodeExtractor()

# Run configuration.
# NOTE(review): `type` shadows the Python builtin for the rest of the kernel
# session; kept as-is because other cells use the same name.
origin = "chatgpt"
type = "python"
url = "../data/interim/filtered-conversations"
print_process = False  # not referenced in this cell

conversations_path = f"{url}/conversations-{type}.json"
conversations_by_url = conversation_io.load_conversations(conversations_path)

source_codes = source_code_extractor.extract(conversations_by_url)
source_code_extractor.export_source_code(origin, source_codes, type)
source_code_extractor.delete_invalid_files(origin, type)


Exporting 653 source codes to ../data/interim/chatgpt/src/python
Deleting 126.py
Deleting 127.py
Deleting 128.py
Deleting 129.py
Deleting 130.py
Deleting 131.py
Deleting 132.py
Deleting 176.py
Deleting 203.py
Deleting 212.py
Deleting 223.py
Deleting 226.py
Deleting 303.py
Deleting 307.py
Deleting 308.py
Deleting 309.py
Deleting 311.py
Deleting 33.py
Deleting 368.py
Deleting 399.py
Deleting 400.py
Deleting 401.py
Deleting 440.py
Deleting 465.py
Deleting 488.py
Deleting 489.py
Deleting 55.py
Deleting 572.py
Deleting 573.py
Deleting 624.py
Deleting 635.py
Deleting 72.py
Deleting 76.py
Deleting 77.py
Deleting 78.py
Deleting 79.py
Deleting 95.py
Deleting 96.py


# Fetch Questions from Stack Overflow

In [1]:
from so_extractor import StackOverflowExtractor

# Directories handed to the Stack Overflow extractor.
raw_dir = "../data/raw/stackoverflow"
interim_dir = "../data/interim/stackoverflow"
so_extractor = StackOverflowExtractor(raw_dir, interim_dir)

# Search settings; all of them are passed positionally to fetch_search below.
type = "python"
number_of_answer = 5
has_accepted_answer = True
nb_of_views = 1000

# Page range for the search.
start_page, max_page = 1, 50

so_extractor.fetch_search(
    type,
    start_page,
    max_page,
    number_of_answer,
    has_accepted_answer,
    nb_of_views,
)

# Extract Answers from fetched Questions

In [3]:
from so_extractor import StackOverflowExtractor

# Directories handed to the Stack Overflow extractor.
raw_dir = "../data/raw/stackoverflow"
interim_dir = "../data/interim/stackoverflow"
so_extractor = StackOverflowExtractor(raw_dir, interim_dir)

# Language tag and page range used for this run.
type = "python"
start_page, max_page = 1, 50

# Collect the question ids for the language, then fetch their answers.
question_ids = so_extractor.extract_question_ids(type)
so_extractor.fetch_answers(
    start_page,
    max_page,
    question_ids,
    type,
)

# Extract Code Snippets from Answers

In [1]:
from so_extractor import StackOverflowExtractor
import os

# Run the extractor's code extraction over every fetched answers file.
raw_dir = "../data/raw/stackoverflow"
interim_dir = "../data/interim/stackoverflow"
so_extractor = StackOverflowExtractor(raw_dir, interim_dir)

type = "python"

for subdir, dirs, files in os.walk(f"{raw_dir}/answers"):
    # os.walk lists entries in arbitrary (filesystem) order. The index `i` is
    # passed to extract_code_from_answers, so sort to give each file the same
    # index on every platform and every run.
    dirs.sort()
    for i, file in enumerate(sorted(files)):
        # NOTE(review): `i` restarts at 0 in every sub-directory; this assumes
        # the answers folder is flat - confirm.
        path = os.path.join(subdir, file)
        print(f"Extracting code from {path}...")

        so_extractor.extract_code_from_answers(path, i, type)

Extracting code from ../data/raw/stackoverflow/answers\python_0.json...
Extracting code from ../data/raw/stackoverflow/answers\python_1.json...
Extracting code from ../data/raw/stackoverflow/answers\python_10.json...
Extracting code from ../data/raw/stackoverflow/answers\python_11.json...
Extracting code from ../data/raw/stackoverflow/answers\python_12.json...
Extracting code from ../data/raw/stackoverflow/answers\python_13.json...
Extracting code from ../data/raw/stackoverflow/answers\python_14.json...
Extracting code from ../data/raw/stackoverflow/answers\python_15.json...
Extracting code from ../data/raw/stackoverflow/answers\python_16.json...
Extracting code from ../data/raw/stackoverflow/answers\python_17.json...
Extracting code from ../data/raw/stackoverflow/answers\python_18.json...
Extracting code from ../data/raw/stackoverflow/answers\python_19.json...
Extracting code from ../data/raw/stackoverflow/answers\python_2.json...
Extracting code from ../data/raw/stackoverflow/answers

# Export Valid Code Snippets to Python Files

In [3]:
from itertools import chain
from source_code_extractor import SourceCodeExtractor
import json
import os

# Read every snippets file, keep the snippets the extractor accepts as valid
# source code, export them, then delete the files it deems invalid.
source_code_extractor = SourceCodeExtractor()
raw_dir = "../data/raw/stackoverflow"
interim_dir = "../data/interim/stackoverflow"

origin = "stackoverflow"
type = "python"

for subdir, dirs, files in os.walk(f"{interim_dir}/snippets"):
    # os.walk lists entries in arbitrary (filesystem) order. The index `i` is
    # passed to export_source_code, so sort to keep it stable across platforms
    # and runs.
    dirs.sort()
    for i, file in enumerate(sorted(files)):
        path = os.path.join(subdir, file)
        print(f"Exporting code from {path}...")

        # Read JSON as UTF-8 explicitly: without `encoding`, open() falls back
        # to the locale's preferred encoding, which is platform-dependent.
        # The handle is only needed for the read, so close it before the
        # filtering and export work.
        with open(path, encoding="utf-8") as f:
            snippets = json.load(f)

        # Each file holds a sequence of snippet groups; flatten one level.
        snippets = list(chain.from_iterable(snippets))

        valid_snippets = source_code_extractor.filter_valid_source_code(snippets, type)

        source_code_extractor.export_source_code(origin, valid_snippets, type, i)

source_code_extractor.delete_invalid_files(origin, type)

Exporting code from ../data/interim/stackoverflow/snippets\python_0.json...
Exporting 75 source codes to ../data/interim/stackoverflow/src/python
Exporting code from ../data/interim/stackoverflow/snippets\python_1.json...
Exporting 89 source codes to ../data/interim/stackoverflow/src/python
Exporting code from ../data/interim/stackoverflow/snippets\python_10.json...
Exporting 115 source codes to ../data/interim/stackoverflow/src/python
Exporting code from ../data/interim/stackoverflow/snippets\python_11.json...
Exporting 103 source codes to ../data/interim/stackoverflow/src/python
Exporting code from ../data/interim/stackoverflow/snippets\python_12.json...
Exporting 84 source codes to ../data/interim/stackoverflow/src/python
Exporting code from ../data/interim/stackoverflow/snippets\python_13.json...
Exporting 115 source codes to ../data/interim/stackoverflow/src/python
Exporting code from ../data/interim/stackoverflow/snippets\python_14.json...
Exporting 78 source codes to ../data/int