In [1]:
! pip install clean-text[gpl] emoji



In [2]:
import re
import nltk
from nltk.corpus import stopwords
# Download the NLTK stopword corpus (NLTK reports it as up-to-date when it is
# already present locally, so re-running this cell is harmless).
nltk.download('stopwords')
# English stopwords held in a set for fast membership tests during filtering.
stop_words = set(stopwords.words('english'))

# Preprocessing function for issue text: noisy spans (code, links, ids, ...)
# are replaced by placeholder tokens and stopwords are dropped.
def preprocess_issue(text, stopword_set=None):
    """Normalize raw issue/comment text into a cleaned, space-joined string.

    Steps, in order: strip HTML-like tags, replace code blocks / inline code,
    links, e-mails, issue references, mentions and numbers with placeholder
    tokens, unwrap Markdown emphasis, drop punctuation (keeping ``<`` / ``>``
    so placeholders survive) and finally remove stopwords.

    Args:
        text: The raw text. ``None`` or an empty string yields ``''``.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    # Null bodies (e.g. an issue without a description) are treated as empty.
    if not text:
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Strip HTML or custom tags. This must run before any placeholder is
    # inserted, otherwise the placeholders themselves would be removed.
    text = re.sub(r'<.*?>', '', text)

    # Fenced Markdown code blocks (``` or ~~~) -> <CODE_BLOCK>
    text = re.sub(r'(```.*?```|~~~.*?~~~)', '<CODE_BLOCK>', text, flags=re.DOTALL)

    # Inline code (delimited by `) -> <CODE_BLOCK>
    text = re.sub(r'`[^`]+`', '<CODE_BLOCK>', text)

    # Links -> <LINK>. Done before '#'/'@' handling so URL fragments and
    # user-info parts are consumed as part of the link.
    text = re.sub(r'http[s]?://\S+', '<LINK>', text)

    # E-mails -> <EMAIL>. Must precede the mention rule: otherwise the
    # '@domain' part is turned into <MENTION> and the address never matches.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '<EMAIL>', text)

    # Issue / PR references (e.g. #123) -> <ISSUE_REF>
    text = re.sub(r'#\d+', '<ISSUE_REF>', text)

    # Mentions (@username) -> <MENTION>
    text = re.sub(r'@\w+', '<MENTION>', text)

    # Large numbers or ids (4+ digits) -> <CODE>
    text = re.sub(r'\b\d{4,}\b', '<CODE>', text)

    # Remaining numbers -> <NUMBER>
    text = re.sub(r'\b\d+\b', '<NUMBER>', text)

    # Unwrap Markdown bold (**text** or __text__).
    text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
    # Unwrap Markdown italic (_text_). Only underscores that are not glued to
    # a word character on the outside count as delimiters, so snake_case
    # identifiers and placeholders such as <CODE_BLOCK> are left intact.
    text = re.sub(r'(?<!\w)_(.*?)_(?!\w)', r'\1', text)

    # Drop punctuation while keeping '<' and '>' so placeholders survive.
    text = re.sub(r'[^\w\s<>]', '', text)

    # Remove stopwords (case-insensitive) and normalize whitespace.
    kept_words = [word for word in text.split() if word.lower() not in stopword_set]
    return ' '.join(kept_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\belfo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import json

# Load the previously saved raw issues. The encoding is given explicitly so
# the read does not depend on the platform's default locale encoding.
with open("raw_issues.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Process each repository's issues and their comments
refined_data = []
for repo, issues in data.items():
    print(f"Processing {repo}...")
    for issue in issues:
        # Preprocess the comments; a missing or null comment list is treated
        # as empty, and a null comment body as an empty string.
        refined_comments = [
            {
                "body": preprocess_issue(comment["body"] or ""),
                "createdAt": comment["createdAt"],
            }
            for comment in issue.get("comments") or []
        ]

        # Preprocess the issue fields; a null body is treated as empty text.
        refined_data.append({
            "repo": repo,
            "createdAt": issue["createdAt"],
            "title": preprocess_issue(issue["title"]),
            "body": preprocess_issue(issue["body"] or ""),
            "comments": refined_comments,
        })

# Save the refined data back to JSON, preserving order
with open("refined_issues.json", "w", encoding="utf-8") as file:
    json.dump(refined_data, file, indent=4)

print("Refined data saved to refined_issues.json")


Processing oasisprotocol/oasis-core...
Processing klaytn/klaytn...
Processing harmony-one/harmony...
Processing parallel-finance/parallel...
Processing freeverseio/laos...
Processing nucypher/nucypher...
Processing ethereum/go-ethereum...
Processing witnet/witnet-rust...
Processing sora-xor/sora2-network...
Processing Agoric/agoric-sdk...
Processing ainblockchain/ain-blockchain...
Processing bigbangcore/BigBang...
Processing massalabs/massa...
Processing ton-blockchain/TEPs...
Processing red/red...
Processing cryptoblades/cryptoblades...
Processing Veil-Project/veil...
Processing spartan-protocol/SpartanProtocol-DAppV2...
Processing celo-org/celo-monorepo...
Processing OriginProtocol/origin-dollar...
Processing starcoinorg/starcoin...
Processing KlimaDAO/klimadao...
Processing peercoin/peercoin...
Processing DeFiCh/ain...
Processing open-chat-labs/open-chat...
Processing PIVX-Project/PIVX...
Processing steemit/steem...
Processing omgnetwork/elixir-omg...
Processing trustwallet/assets..