In [23]:
# Cell 1: Import Libraries and Set Up
import os
import json
import re
import numpy as np
import nltk
import enchant
from collections import Counter
from pygments import lex
from pygments.lexers import get_lexer_by_name, find_lexer_class
from pygments.token import Token
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch each NLTK resource this notebook relies on, in a fixed order.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4', 'words'):
    nltk.download(resource)

# Shared, module-level helpers used by preprocess_text().
english_dict = enchant.Dict("en_US")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Vladimir
[nltk_data]     Filipovic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# Cell 2: Define Functions for Comment Extraction and Preprocessing

def _notebook_code(raw_text):
    """Return the concatenated source of every code cell in a .ipynb document.

    Handles the nbformat 4 layout (top-level ``cells`` with ``source``) and the
    older nbformat 3 layout (``worksheets[*].cells`` with ``input``). Returns an
    empty string when ``raw_text`` is not a JSON object.
    """
    try:
        notebook = json.loads(raw_text)
    except ValueError:
        # Truncated or otherwise malformed notebook: nothing to lex.
        return ""
    if not isinstance(notebook, dict):
        return ""

    cells = []
    top_level_cells = notebook.get("cells")
    if isinstance(top_level_cells, list):
        cells.extend(top_level_cells)
    worksheets = notebook.get("worksheets")
    if isinstance(worksheets, list):
        for worksheet in worksheets:
            if isinstance(worksheet, dict) and isinstance(worksheet.get("cells"), list):
                cells.extend(worksheet["cells"])

    chunks = []
    for cell in cells:
        if not isinstance(cell, dict) or cell.get("cell_type") != "code":
            continue
        source = cell.get("source", cell.get("input", ""))
        # The notebook format stores source either as one string or as a list of lines.
        if isinstance(source, list):
            source = "".join(str(part) for part in source)
        if isinstance(source, str):
            chunks.append(source)
    return "\n".join(chunks)


def extract_comments_from_file(file_path, language):
    """Collect the comment tokens found in one source file.

    Parameters:
        file_path: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are dropped.
        language: Lexer name passed to ``get_lexer_by_name``. The value
            "Jupyter Notebook" is special-cased: the file is parsed as a
            notebook and only its code cells are lexed, using the Python lexer.

    Returns:
        A list of comment strings with surrounding whitespace stripped, in the
        order the lexer produced them.

    Raises:
        OSError: If the file cannot be opened.
        Whatever ``get_lexer_by_name`` raises for a name it does not know.
    """
    is_notebook = language == "Jupyter Notebook"
    if is_notebook:
        language = "Python"

    # Read first and release the handle before the (potentially slow) lexing.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        code = file.read()

    if is_notebook:
        # A raw .ipynb is JSON: comments live inside JSON string literals, so
        # lexing the raw text as Python would never yield Comment tokens.
        code = _notebook_code(code)

    lexer = get_lexer_by_name(language)
    return [
        token_value.strip()
        for token_type, token_value in lex(code, lexer)
        if token_type in Token.Comment
    ]

def preprocess_text(text):
    """Reduce a raw comment to a space-separated string of lemmatized words.

    Runs of non-word characters are replaced by a single space, the text is
    lower-cased and tokenized, and each token is kept only if it is not in
    ``stop_words`` and ``english_dict.check`` accepts it. Kept tokens are
    lemmatized with ``lemmatizer`` and joined with single spaces.
    """
    cleaned = re.sub(r'\W+', ' ', text)
    kept = []
    for word in word_tokenize(cleaned.lower()):
        if word in stop_words:
            continue
        if not english_dict.check(word):
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

def get_extensions(language):
    """Return the file extensions associated with a language name.

    The lookup is an exact, case-sensitive match on ``language``. An
    unrecognized name yields an empty list.
    """
    # (language name, extensions) pairs for every supported language.
    known_languages = dict([
        ("Python", [".py"]),
        ("JavaScript", [".js"]),
        ("Java", [".java"]),
        ("Jupyter Notebook", [".ipynb"]),
        ("C++", [".cpp", ".h"]),
        ("R", [".r"]),
        ("LiveScript", [".ls"]),
        ("Fortran", [".f", ".f90"]),
        ("C", [".c", ".h"]),
        ("C#", [".cs"]),
        ("Go", [".go"]),
        ("Ruby", [".rb"]),
        ("Rust", [".rs"]),
        ("Swift", [".swift"]),
        ("Kotlin", [".kt"]),
        ("Scala", [".scala"]),
        ("PHP", [".php"]),
        ("TypeScript", [".ts"]),
        ("Mathematica", [".nb"]),
        ("Perl", [".pl"]),
        ("Haskell", [".hs"]),
        ("Lua", [".lua"]),
        ("Julia", [".jl"]),
        ("Shell", [".sh"]),
        ("Objective-C", [".m"]),
        # Add more languages and their extensions here
    ])
    try:
        return known_languages[language]
    except KeyError:
        return []

In [22]:
# Cell 3: Process Repository
def process_repository(directory, language, repo_name):
    """Extract and preprocess every comment in a repository checkout.

    Parameters:
        directory: Root directory of the repository; it is walked recursively.
        language: Language name used both to select file extensions (via
            ``get_extensions``) and to lex the files.
        repo_name: Key under which the result is returned.

    Returns:
        ``{repo_name: text}`` where ``text`` is all preprocessed comments
        joined by single spaces, or ``{repo_name: "Can't process"}`` when the
        language has no known extensions.
    """
    # Look the extensions up once; str.endswith accepts a tuple of suffixes.
    extensions = tuple(get_extensions(language))
    if not extensions:
        return {repo_name: "Can't process"}

    all_comments = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(extensions):
                file_path = os.path.join(root, file)
                all_comments.extend(extract_comments_from_file(file_path, language))

    processed_comments = (preprocess_text(comment) for comment in all_comments)
    # preprocess_text already returns a string, so join the strings themselves
    # (joining each one again would insert a space between every character).
    # Comments that preprocess to nothing are skipped to avoid runs of spaces.
    return {repo_name: " ".join(text for text in processed_comments if text)}



In [20]:
# Cell 4: Analyze Repositories

def analyze_repositories(base_directory, repo_languages):
    """Process every repository directory found directly under a base folder.

    Parameters:
        base_directory: Folder whose immediate sub-directories are treated as
            repositories; plain files in it are ignored.
        repo_languages: Mapping of repository directory name to language name.
            Repositories missing from it are treated as "UnsupportedLanguage".

    Returns:
        A dict keyed by repository name. The value is whatever
        ``process_repository`` returned for that repository, or the string
        "Can't process" when the language has no known extensions.
    """
    collected = {}
    for repo in os.listdir(base_directory):
        entry_path = os.path.join(base_directory, repo)
        if not os.path.isdir(entry_path):
            continue

        # Log
        print(f"Processing {repo}")
        language = repo_languages.get(repo, "UnsupportedLanguage")
        is_supported = get_extensions(language) != []
        print(f"Language: {language}")
        print(f"Supported: {is_supported}")

        if is_supported:
            print(f"Processing {repo}")
            collected.update(process_repository(entry_path, language, repo))
            continue

        print(f"Can't process {repo} as it is not a supported language")
        collected[repo] = "Can't process"
    return collected


In [21]:
# Cell 5: Main Execution


base_directory = '../data/repos/'

# Load repository languages from JSON file.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which would mis-decode non-ASCII repository
# names in a UTF-8 JSON file. It also matches the UTF-8 output written below.
with open('repo-languages.json', 'r', encoding='utf-8') as f:
    repo_languages = json.load(f)

# Walk every repository under base_directory and collect its processed comments.
results = analyze_repositories(base_directory, repo_languages)

# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open('repository_comments.json', 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, ensure_ascii=False, indent=4)

print("Analysis complete. Results saved to repository_comments.json")

Processing 0tt3r
Language: C
Supported: True
Processing 0tt3r
Processing 1-part
Language: UnsupportedLanguage
Supported: False
Can't process 1-part as it is not a supported language
Processing 1234zou
Language: Fortran
Supported: True
Processing 1234zou
Processing 3T
Language: Jupyter Notebook
Supported: True
Processing 3T
Processing 510004015
Language: JavaScript
Supported: True
Processing 510004015
Processing 7enTropy7
Language: Jupyter Notebook
Supported: True
Processing 7enTropy7
Processing 89996462
Language: JavaScript
Supported: True
Processing 89996462
Processing aa75017730
Language: JavaScript
Supported: True
Processing aa75017730
Processing aarontrowbridge
Language: Unknown
Supported: False
Can't process aarontrowbridge as it is not a supported language
Processing abdullahkhalids
Language: Jupyter Notebook
Supported: True
Processing abdullahkhalids
Processing absolute-quantum
Language: Python
Supported: True
Processing absolute-quantum
Processing acheong08
Language: Go
Support

KeyboardInterrupt: 