In [21]:
# Cell 1: Import Libraries and Set Up
import os
import json
import re
import numpy as np
import nltk
import enchant
from collections import Counter
from pygments import lex
from pygments.lexers import get_lexer_by_name, find_lexer_class
from pygments.token import Token
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# NLTK resources used by this notebook. `word_tokenize` loads the 'punkt_tab'
# tables on NLTK >= 3.8.2 and the older pickled 'punkt' model on earlier
# releases, so both are fetched to keep the notebook runnable on either.
for _nltk_package in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'words'):
    nltk.download(_nltk_package)

# Shared helpers read by preprocess_text in the next cell.
stop_words = set(stopwords.words('english'))   # English stop words to drop
lemmatizer = WordNetLemmatizer()               # reduces kept tokens to their lemma
english_dict = enchant.Dict("en_US")           # spell-check dictionary used as a word filter



[nltk_data] Downloading package stopwords to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [63]:
# Cell 2: Define Functions for Comment Extraction and Preprocessing

def _notebook_code_source(raw_text):
    """Return the concatenated source of all code cells in a notebook file.

    `raw_text` is the text of an .ipynb file. Both the current layout
    (top-level "cells", code in "source") and the older layout
    ("worksheets" -> "cells", code in "input") are read. If the text is not
    a JSON object, it is returned unchanged so the caller can still lex it.
    """
    try:
        notebook = json.loads(raw_text)
    except ValueError:
        return raw_text
    if not isinstance(notebook, dict):
        return raw_text

    cells = list(notebook.get("cells") or [])
    for worksheet in notebook.get("worksheets") or []:
        if isinstance(worksheet, dict):
            cells.extend(worksheet.get("cells") or [])

    sources = []
    for cell in cells:
        if not isinstance(cell, dict) or cell.get("cell_type") != "code":
            continue
        source = cell.get("source", cell.get("input", ""))
        # A cell's source may be stored as a list of lines or as one string.
        if isinstance(source, list):
            source = ''.join(str(line) for line in source)
        sources.append(str(source))
    return '\n'.join(sources)


def extract_comments_from_file(file_path, language):
    """Return the stripped text of every comment token found in a source file.

    Parameters
    ----------
    file_path : str
        Path of the file to read (decoded as UTF-8, undecodable bytes ignored).
    language : str
        Language name passed to Pygments' ``get_lexer_by_name``. The value
        "Jupyter Notebook" is handled specially: the notebook JSON is parsed,
        only its code cells are kept, and they are lexed as Python.

    Returns
    -------
    list[str]
        One entry per token whose type falls under ``Token.Comment``.

    Raises
    ------
    Whatever ``get_lexer_by_name`` raises for an unknown language name.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        code = file.read()

    if language == "Jupyter Notebook":
        # An .ipynb file is JSON; lexing it raw as Python hides every cell
        # comment inside a JSON string literal, so extract the cell code first.
        # NOTE(review): cells are assumed to contain Python - confirm for
        # notebooks that use another kernel.
        code = _notebook_code_source(code)
        language = "Python"

    lexer = get_lexer_by_name(language)
    return [
        token_value.strip()
        for token_type, token_value in lex(code, lexer)
        if token_type in Token.Comment
    ]

def preprocess_text(text):
    """Turn raw comment text into a space-separated string of lemmas.

    Runs of non-word characters become single spaces, the text is lower-cased
    and tokenised with ``word_tokenize``. A token is kept only when it is not
    in ``stop_words`` and ``english_dict.check`` accepts it; kept tokens are
    lemmatised and joined with single spaces.
    """
    cleaned = re.sub(r'\W+', ' ', text)
    kept = []
    for word in word_tokenize(cleaned.lower()):
        if word in stop_words:
            continue
        if not english_dict.check(word):
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

def get_extensions(language):
    """Look up the file extensions associated with a language name.

    Returns a list of extensions (each with its leading dot) for a supported
    language, or an empty list when the language is not in the table.
    """
    # Define extensions for supported languages
    supported = dict([
        ("Python", [".py"]),
        ("JavaScript", [".js"]),
        ("Java", [".java"]),
        ("Jupyter Notebook", [".ipynb"]),
        ("C++", [".cpp", ".h"]),
        ("R", [".r"]),
        ("LiveScript", [".ls"]),
        ("Fortran", [".f", ".f90"]),
        ("C", [".c", ".h"]),
        ("C#", [".cs"]),
        ("Go", [".go"]),
        ("Ruby", [".rb"]),
        ("Rust", [".rs"]),
        ("Swift", [".swift"]),
        ("Kotlin", [".kt"]),
        ("Scala", [".scala"]),
        ("PHP", [".php"]),
        ("TypeScript", [".ts"]),
        ("Mathematica", [".nb"]),
        ("Perl", [".pl"]),
        ("Haskell", [".hs"]),
        ("Lua", [".lua"]),
        ("Julia", [".jl"]),
        ("Shell", [".sh"]),
        ("Objective-C", [".m"]),
        # Add more languages and their extensions here
    ])
    if language in supported:
        return supported[language]
    return []

In [64]:
# Cell 3: Process Repository

def process_repository(directory, language, repo_name):
    """Collect and preprocess every comment in a repository's source files.

    Parameters
    ----------
    directory : str
        Root of the repository checkout; walked recursively.
    language : str
        Language name understood by ``get_extensions`` and
        ``extract_comments_from_file``.
    repo_name : str
        Key used in the returned mapping.

    Returns
    -------
    dict
        ``{repo_name: text}`` where ``text`` is the result of
        ``preprocess_text`` on all comments, or
        ``{repo_name: "Can't process"}`` when the language has no known
        extensions.
    """
    # Resolve the extensions once; lower-cased so that e.g. "analysis.R" or
    # "solver.F90" match the lower-case entries in the extension table.
    extensions = tuple(ext.lower() for ext in get_extensions(language))
    if not extensions:
        return {repo_name: "Can't process"}

    collected_comments = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(extensions):
                file_path = os.path.join(root, file)
                collected_comments.extend(extract_comments_from_file(file_path, language))

    # Join once with a separator so the last word of one file cannot fuse
    # with the first word of the next, then preprocess the combined text.
    flat_comments = preprocess_text(' '.join(collected_comments))

    return {repo_name: flat_comments}

In [65]:
#Cell 4: Analyze Repositories

def analyze_repositories(base_directory, repo_languages):
    """Process every repository folder directly under ``base_directory``.

    ``repo_languages`` maps a folder name to its language; folders missing
    from it are treated as "UnsupportedLanguage". Progress is printed per
    folder. Returns a dict keyed by folder name whose value is either the
    entry produced by ``process_repository`` or the string "Can't process"
    when ``get_extensions`` knows no extensions for the language.
    Non-directory entries are ignored.
    """
    results = {}

    for entry in os.listdir(base_directory):
        entry_path = os.path.join(base_directory, entry)
        if not os.path.isdir(entry_path):
            continue

        # Log
        print(f"Processing {entry}")
        language = repo_languages.get(entry, "UnsupportedLanguage")
        supported = get_extensions(language) != []
        print(f"Language: {language}")

        if supported:
            results.update(process_repository(entry_path, language, entry))
            continue

        print(f"Can't process {entry} as it is not a supported language")
        results[entry] = "Can't process"

    return results


In [66]:
# Cell 5: Main Execution


# Folder whose immediate sub-directories are the repositories to analyse.
base_directory = '../data/repos/'

# Load repository languages from JSON file.
# Read explicitly as UTF-8 (the same encoding used for the output below)
# instead of relying on the platform default encoding.
with open('repo-languages.json', 'r', encoding='utf-8') as f:
    repo_languages = json.load(f)


results = analyze_repositories(base_directory, repo_languages)

# Persist one entry per repository; ensure_ascii=False keeps non-ASCII text readable.
with open('repository_comments.json', 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, ensure_ascii=False, indent=4)

print("Analysis complete. Results saved to repository_comments.json")

Processing 0tt3r
Language: C


Processing 1-part
Language: UnsupportedLanguage
Can't process 1-part as it is not a supported language
Processing 1234zou
Language: Fortran
Processing 3T
Language: Jupyter Notebook
Processing 510004015
Language: JavaScript
Processing 7enTropy7
Language: Jupyter Notebook
Processing 89996462
Language: JavaScript
Processing aa75017730
Language: JavaScript
Processing aarontrowbridge
Language: Unknown
Can't process aarontrowbridge as it is not a supported language
Processing abdullahkhalids
Language: Jupyter Notebook
Processing absolute-quantum
Language: Python
Processing acheong08
Language: Go
Processing AdhesionTek
Language: JavaScript
Processing advancesoftcorp
Language: Fortran
Processing AgnostiqHQ
Language: Python
Processing AI4Finance-Foundation
Language: Python
Processing aiidateam
Language: Python
Processing ajz34
Language: Jupyter Notebook
Processing alejomonbar
Language: Jupyter Notebook
Processing alexandersgreen
Language: Haskell
Processing alibaba
Language: Python
Processing A

In [106]:
# Cell 6: Step 1: Define keywords v1
# Category label -> keywords/phrases whose presence in a repository's comment
# text counts towards that category. Two misspelled keyword values were
# corrected ("quantum key distribution", "secure communication").
keywords = {
    "General Quantum Algorithms": [
        "algorithm", "search", "walk", "Fourier transform", "phase estimation",
        "hidden subgroup problem", "speedup", "optimization", "teleportation",
    ],
    "Grover's Algorithm": ["grover", "amplitude amplification", "search", "algorithm"],
    "Shor's Algorithm": [
        "shor", "algorithm", "shor's algorithm", "fraction", "fractals", "factorization",
        "integer factorization", "period finding", "order finding", "discrete logarithm",
        "RSA", "elliptic curve", "cryptography", "quantum-safe", "quantum-resistant",
    ],
    "Variational Quantum Eigensolver (VQE)": [
        "variational eigensolver", "algorithm", "vqe", "chemistry", "simulation",
    ],
    "Quantum Approximate Optimization Algorithm (QAOA)": [
        "approximate optimization algorithm", "qaoa", "optimization", "algorithm", "maxcut",
        "traveling salesman", "tsp", "combinatorial optimization", "graph partitioning",
        "scheduling", "routing",
    ],
    "HHL Algorithm": ["hhl algorithm", "linear systems", "matrix inversion", "algorithm"],
    "Deutsch-Jozsa Algorithm": [
        "deutsch-jozsa", "balanced function", "constant function", "algorithm",
    ],
    "Simon's Algorithm": [
        "simon", "simon’s algorithm", "periodicity", "hidden period", "algorithm",
    ],
    "General Quantum Cryptography": [
        "security", "cryptography", "encryption", "secure direct communication",
        "post-cryptography", "network", "repeater", "hacking", "side-channel attack",
        "bit commitment", "digital signatures", "secret sharing", "secure communication",
        "quantum-safe", "quantum-resistant",
    ],
    # NOTE(review): this label is kept byte-for-byte (misspelling and trailing
    # space included) because it is emitted in the classification results;
    # rename it only together with anything that consumes those results.
    "Quantum key distirbution ": [
        "quantum key distribution", "security", "bb84", "b92", "single-photon QKD",
        "cryptography", "key distribution", "qkd", "encryption", "Alice", "Bob", "Eve",
        "e91", "entanglement-based QKD", "entanglement", "ekert protocol",
    ],
    "Quantum Simulation": [
        "hamiltonian simulation", "wavefunction", "many-body systems", "phase transition",
        "dynamics", "Monte Carlo", "lattice models", "fermionic simulation",
        "bosonic simulation", "chemistry", "variational eigensolver", "vqe",
        "unitary coupled cluster", "molecular dynamics", "field theory",
        "digital simulation", "analog simulation", "chaos", "adiabatic simulation",
        "quantum walk", "quantum cellular automata",
    ],
    "Quantum Hardware": [
        "superconducting", "trapped ions", "topological qubit", "decoherence",
        "error correction", "surface code", "chip", "Josephson junction", "annealer",
        "processor", "transducer", "hybrid systems", "memristor",
    ],
    "Quantum Machine Learning": [
        "neural networks", "qnn", "support vector machine", "qsvm", "clustering",
        "principal component analysis", "qpca", "Boltzmann machine", "qbms",
        "generative adversarial networks", "qgans", "hybrid classical machine learning",
        "data encoding", "feature mapping", "kernel estimation", "unsupervised learning",
        "reinforcement learning", "autoencoders", "classifier", "regression", "data",
    ],
    "Quantum Communication": [
        "communication protocols", "coherent communication", "modulation",
        "communication complexity", "quantum internet", "quantum repeater",
        "quantum router", "quantum network", "quantum channel", "quantum link",
        "quantum relay", "quantum bridge", "quantum satellite",
    ],
    "Quantum Computer Tooling and Frameworks": [
        "development kit", "QDK", "SDK", "IDE", "programming environment", "emulation",
        "circuit design", "circuit optimization", "gate library", "error correction tools",
        "debugging", "visualization tools", "benchmarking", "hardware integration",
        "cloud service", "API",
    ],
    "Quantum Computer Simulation": [
        "quantum computing simulator", "computer simulation", "state vector simulation",
        "computer emulation", "circuit simulation", "hardware simulation",
        "system simulation", "software simulation", "virtual computer",
        "computing simulator", "emulator", "computing emulation", "circuit emulator",
        "state simulator", "gate simulator", "processor simulation", "algorithm simulation",
    ],
    "Learning Project": [
        "tutorial", "example", "learning project", "educational", "beginner", "introduction",
        "getting started", "hands-on", "course", "workshop", "training", "sample code",
        "demo", "guide", "instructional", "lesson", "walkthrough", "coding exercises",
        "teaching", "learning module",
    ],
}





In [107]:
import re
from collections import Counter
from itertools import chain
import jellyfish

def preprocess_text(text):
    """Split text into lower-case word tokens.

    The text is lower-cased, every run of non-word characters is replaced by a
    single space, and the result is split on whitespace. Returns a list of
    tokens (empty for empty or punctuation-only input).

    Note: this rebinds the name ``preprocess_text`` that the comment-extraction
    cell defined earlier with a string return value; after this cell runs, the
    name refers to this list-returning version.
    """
    return re.sub(r'\W+', ' ', text.lower()).split()

def generate_ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens joined by a single space.

    Yields an empty list when ``n`` is not positive or exceeds the number of
    tokens.
    """
    if n <= 0:
        return []
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]

def extract_ngrams(text):
    """Return the unigrams, bigrams and trigrams of ``text`` as one list.

    The text is tokenised with ``preprocess_text``; the result lists all
    single tokens first, then all bigrams, then all trigrams.
    """
    tokens = preprocess_text(text)
    longer_grams = [gram for size in (2, 3) for gram in generate_ngrams(tokens, size)]
    return tokens + longer_grams

def classify_text(text, keyword_dict):
    """Score ``text`` against each category by exact n-gram/keyword matches.

    Every n-gram from ``extract_ngrams`` that appears verbatim in a category's
    keyword list adds one to that category's score. Returns
    ``(best_category, category_scores)``, with "No match found" in place of the
    category when the highest score is zero.

    Note: a later definition of ``classify_text`` in this same cell (the
    similarity-threshold version) rebinds the name, so this exact-match
    variant is not the one callers reach once the cell has run.
    """
    category_scores = dict.fromkeys(keyword_dict, 0)

    for gram in extract_ngrams(text):
        for category in keyword_dict:
            if gram in keyword_dict[category]:
                category_scores[category] += 1

    # Find the category with the highest score
    top_category = max(category_scores, key=category_scores.get)

    # Report "No match found" when nothing scored at all
    label = "No match found" if category_scores[top_category] == 0 else top_category
    return label, category_scores



def aggregate_comments(repo_comments):
    """Combine a repository's comments into a single string.

    Parameters
    ----------
    repo_comments : str or iterable of str
        Either an already-combined text (the shape ``process_repository``
        produces) or a collection of individual comment strings.

    Returns
    -------
    str
        The text itself when a string is given; otherwise the comments joined
        with single spaces.
    """
    # Joining a plain string would insert a space between every character,
    # so an already-combined text is returned untouched.
    if isinstance(repo_comments, str):
        return repo_comments
    return ' '.join(repo_comments)


def classify_text(text, keyword_dict, threshold=0.90):
    """Score ``text`` against each category using fuzzy keyword matching.

    Parameters
    ----------
    text : str
        Text to classify; split into uni-, bi- and trigrams by ``extract_ngrams``.
    keyword_dict : dict
        Maps a category label to a list of keyword strings.
    threshold : float
        Minimum Jaro-Winkler similarity for an n-gram/keyword pair to count.

    Returns
    -------
    tuple
        ``(best_category, category_scores)``. ``best_category`` is the label
        with the highest score (the first such label in dict order on ties),
        or "No match found" when no pair reached the threshold or
        ``keyword_dict`` is empty. ``category_scores`` maps every label to the
        number of matching n-gram occurrences.
    """
    category_scores = {category: 0 for category in keyword_dict}
    if not category_scores:
        # max() on an empty mapping would raise; there is nothing to match.
        return "No match found", category_scores

    # The n-grams are lower-cased with punctuation stripped, so keywords are
    # normalised the same way; otherwise entries such as "RSA", "Monte Carlo"
    # or "deutsch-jozsa" are compared against text they can never equal.
    normalized_keywords = {
        category: [" ".join(preprocess_text(keyword)) for keyword in keywords]
        for category, keywords in keyword_dict.items()
    }

    # Score each distinct n-gram once and weight by its number of occurrences;
    # the totals equal scoring every occurrence separately.
    ngram_counts = Counter(extract_ngrams(text))
    for ngram, occurrences in ngram_counts.items():
        for category, keywords in normalized_keywords.items():
            for keyword in keywords:
                similarity = jellyfish.jaro_winkler_similarity(ngram, keyword)
                if similarity >= threshold:
                    category_scores[category] += occurrences

    # Find the category with the highest score
    best_category = max(category_scores, key=category_scores.get)

    # Check if no keywords were matched
    if category_scores[best_category] == 0:
        return "No match found", category_scores

    return best_category, category_scores



In [108]:
def classify_repositories(repositories, keyword_dict, threshold=0.85):
    """Classify each repository from its comments.

    ``repositories`` maps a repository name to its comments. For each entry the
    comments are combined with ``aggregate_comments`` and passed to
    ``classify_text`` with ``keyword_dict`` and ``threshold``. Returns a dict
    mapping each repository name to a ``(category_name, scores)`` tuple.
    """
    classified = {}
    for repo_name in repositories:
        combined_text = aggregate_comments(repositories[repo_name])
        label, scores = classify_text(combined_text, keyword_dict, threshold)
        classified[repo_name] = (label, scores)
    return classified

# Example comments for a repository: one already-preprocessed comment text,
# kept as a single string and reused as the Repo3 sample below.
repo_comments = "coding 8 0 7 0 0 3 0 0 34 2 amazon 1 18 0 coding 8 0 7 0 0 3 0 0 34 2 amazon 1 18 0 coding 8 set attribute product device set method simulator simulator stabilizer simulator simulator complex imaginary zero maybe coding 8 c library following must equal definition h kind circuit visualization 1 1 1 parameter 2 2 1 parameter measurement coding 8 coding 8 non selective measurement selective measurement mat mat entropy mutual information relative entropy p dot conjugate operate gate operate c library coding 8 stabilizer coding 8 coding 8 self c library coding 8 get stats measurement self change change measurement operate gate operate quantum circuit coding 8 ex x 1 ex x 1 z 0 constant term include product coding 8 c library observable coding 8 coding 8 show method show method append single gate append group 1 gate padding line 0 line 1 line 2 3 line 3 4 header include file definition description gate operation random generation 1 gate 1 1 parameter gate 2 gate 2 1 parameter gate delete extra gate kind must int must list int para none para must list float c none c must int none must int self kind para c check check 1 gate p gate 2 gate non unitary gate operate gate gate c gate operate quantum circuit c library coding 8 1 gate 2 gate 3 gate n gate target register controlled psi psi pi 2 1 unit pi radian operate product reset measure coding 8 none measurement operate gate operate quantum circuit c library coding 8 coding 8 measurement 1st last 1 measurement last measurement operate gate operate quantum circuit c library stabilizer coding 8 number check number check number check number check number check number check number check id number check number check number check number check number check number check number check id 0 number read check section check key raise exception invalid key defined raise exception required key defined coding 8 coding 8 1 0 parameter gate 1 1 parameter gate 2 0 parameter gate 2 1 parameter gate supported unitary gate exist non unitary gate measure rest 1 mean measured 0 measured store c classical register allowed operating unitary gate measured limitation ex 0 1 2 2 1 0 2 1 0 1 1 gate gate redefine measured info ex 0 1 2 2 1 0 2 1 0 1 1 1 2 1 0 1 1 2 0 0 1 1 1 1 none 0 1 2 2 1 0 1 1 marginal frequency measured coding 8 set execute circuit measurement gate included 1 0 parameter gate 1 1 parameter gate 2 1 parameter gate coding 8 coding 8 coding 8 coding 8 1 0 parameter gate 1 1 parameter gate 1 2 parameter gate 1 3 parameter gate 2 0 parameter gate 2 1 parameter gate 2 2 parameter gate 2 3 parameter gate initialize measurement gate measurement gate supported defined gate sign definition rotation gate error check error check error check list binary vector bit integer ex 5 0 2 0 2 5 0 1 2 2 0 1 bit list probability frequency last quantum state coding 8 coding 8"

# Example repositories with comments (each value is a list of comment strings)
repositories = {
    "Repo1": ["This code uses Grover's algorithm for database search.", "We implemented the HHL algorithm for solving linear systems.", "The quantum circuit design is optimized for superconducting qubits."],
    "Repo2": ["Quantum key distribution (QKD) ensures secure communication.", "Implemented BB84 protocol for QKD.", "Encryption and secure communication with QKD."],
    # Repo3 holds the preprocessed sample text above as its only comment.
    "Repo3": [repo_comments],
}

# Classify repositories
repo_classifications = classify_repositories(repositories, keywords)
# Each value is a (category_name, scores) tuple; it is printed whole.
for repo, classification in repo_classifications.items():
    print(f"Repository: {repo}, Category: {classification}")

Repository: Repo1, Category: ('Quantum Communication', {'General Quantum Algorithms': 14, "Grover's Algorithm": 15, "Shor's Algorithm": 11, 'Variational Quantum Eigensolver (VQE)': 9, 'Quantum Approximate Optimization Algorithm (QAOA)': 11, 'HHL Algorithm': 17, 'Deutsch-Jozsa Algorithm': 9, "Simon's Algorithm": 9, 'General Quantum Cryptography': 2, 'Quantum key distirbution ': 3, 'Quantum Simulation': 2, 'Quantum Hardware': 3, 'Quantum Machine Learning': 2, 'Quantum Communication': 18, 'Quantum Computer Tooling and Frameworks': 4, 'Quantum Computer Simulation': 13, 'Learning Project': 0})
Repository: Repo2, Category: ('Quantum Communication', {'General Quantum Algorithms': 0, "Grover's Algorithm": 0, "Shor's Algorithm": 3, 'Variational Quantum Eigensolver (VQE)': 0, 'Quantum Approximate Optimization Algorithm (QAOA)': 0, 'HHL Algorithm': 0, 'Deutsch-Jozsa Algorithm': 0, "Simon's Algorithm": 0, 'General Quantum Cryptography': 20, 'Quantum key distirbution ': 18, 'Quantum Simulation': 3,