In [1]:
!pip install javalang

Collecting javalang
  Downloading javalang-0.13.0-py3-none-any.whl (22 kB)
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [2]:
import pandas as pd
import subprocess
import javalang
import zipfile
import shutil
import csv
import os

from urllib.parse import urlparse
from javalang.tree import MethodDeclaration, MethodInvocation

In [3]:
# Mount Google Drive: the repository list is read from it and the final
# archive is moved into it later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Define variables

In [4]:
# Bounds of the slice of the repository list handled by this run; they are
# passed to Pipeline and used as a Python slice (start inclusive, end exclusive).
start_range = 500
end_range = 600

# Directory to clone repositories
target_dir = '/content/repos'

# Repository metadata table; SelectRepo reads its 'Name' and 'HTMLURL' columns.
repo_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data_info.csv')
repo_df.head()

Unnamed: 0.1,Unnamed: 0,ProjectID,Name,FullName,Forks,Stars,Watchers,URL,HTMLURL
0,0,132464395,JavaGuide,Snailclimb/JavaGuide,45411,144730,4522,https://api.github.com/repos/Snailclimb/JavaGuide,https://github.com/Snailclimb/JavaGuide
1,1,206462776,GitHub-Chinese-Top-Charts,GrowingGit/GitHub-Chinese-Top-Charts,12600,94520,2577,https://api.github.com/repos/GrowingGit/GitHub...,https://github.com/GrowingGit/GitHub-Chinese-T...
2,2,22790488,java-design-patterns,iluwatar/java-design-patterns,26207,88316,3780,https://api.github.com/repos/iluwatar/java-des...,https://github.com/iluwatar/java-design-patterns
3,3,561730219,hello-algo,krahets/hello-algo,10168,79878,459,https://api.github.com/repos/krahets/hello-algo,https://github.com/krahets/hello-algo
4,4,127988011,mall,macrozheng/mall,28527,76479,2232,https://api.github.com/repos/macrozheng/mall,https://github.com/macrozheng/mall


In [5]:
# Summarise the column dtypes and non-null counts of the repository table.
repo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1020 non-null   int64 
 1   ProjectID   1020 non-null   int64 
 2   Name        1020 non-null   object
 3   FullName    1020 non-null   object
 4   Forks       1020 non-null   int64 
 5   Stars       1020 non-null   int64 
 6   Watchers    1020 non-null   int64 
 7   URL         1020 non-null   object
 8   HTMLURL     1020 non-null   object
dtypes: int64(5), object(4)
memory usage: 71.8+ KB


In [6]:
# Make sure the clone directory exists; exist_ok=True keeps this cell safe to
# re-run (the message below is printed whether or not the directory was new).
os.makedirs(target_dir, exist_ok=True)
print(f"{target_dir} dir created")

/content/repos dir created


### Define Functions

In [7]:
def clone_repos(repo_url, target_dir):
    """Clone a Git repository into ``target_dir`` unless it is already there.

    Args:
        repo_url: Clone URL of the repository, e.g.
            ``https://github.com/owner/name``. A trailing ``/`` or a
            ``.git`` suffix is tolerated.
        target_dir: Directory under which the repository folder is created.
            It is created if it does not exist.

    Returns:
        The path ``<target_dir>/<repository name>``. The path is returned
        even when ``git clone`` fails, so callers should check that it exists
        before using it.
    """
    os.makedirs(target_dir, exist_ok=True)

    # The repository name is the last path segment of the URL. Drop a trailing
    # slash first so the name is not empty, and remove ".git" only when it is
    # a suffix so names such as "user.github.io" are left intact.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    repo_path = os.path.join(target_dir, repo_name)

    if os.path.exists(repo_path):
        print(f"Repository {repo_name} already exists. Skipping clone.")
        return repo_path

    try:
        print(f"Cloning {repo_url} into {repo_path}...")
        subprocess.run(['git', 'clone', repo_url, repo_path], check=True)
        print(f"Successfully cloned {repo_url}")
    except subprocess.CalledProcessError as e:
        # Best effort: report the failure and let the caller decide what to do.
        print(f"Failed to clone {repo_url}: {e}")

    return repo_path

In [8]:
def extract_repo_name(repo_url):
    """Return the final path segment of ``repo_url``.

    The argument is parsed with ``urlparse``; leading and trailing slashes of
    its path component are dropped and whatever follows the last remaining
    ``/`` is returned (an empty string when the path is empty).
    """
    trimmed_path = urlparse(repo_url).path.strip('/')
    # rpartition yields the text after the last '/', or the whole string when
    # there is no '/' at all.
    _, _, last_segment = trimmed_path.rpartition('/')
    return last_segment

In [9]:
def _method_source(node, source_lines):
    """Return the source text of a method declaration, or None if it has no position."""
    if node.position is None:
        return None
    start_line = node.position.line - 1
    # javalang records only where a node starts, so the end of the method is
    # approximated by the start line of its last top-level statement (a final
    # multi-line statement and the closing brace are therefore not included).
    # ``default`` covers bodies whose statements carry no position at all.
    end_line = max(
        (stmt.position.line - 1 for stmt in (node.body or []) if stmt.position),
        default=start_line,
    )
    return '\n'.join(source_lines[start_line:end_line + 1])


def _is_test_method(node):
    """Heuristically decide whether a method declaration is a test method."""
    if 'test' in node.name.lower():
        return True

    # javalang stores annotation names without the leading '@' and possibly
    # package-qualified (e.g. "org.junit.Test"), so compare the simple name.
    junit_annotations = ('Test', 'ParameterizedTest', 'RepeatedTest',
                         'TestFactory', 'TestTemplate')
    for annotation in (node.annotations or []):
        if annotation.name.split('.')[-1] in junit_annotations:
            return True

    # Assertion calls (assertEquals, assertThat, ...) anywhere in the method.
    for _, call in node.filter(javalang.tree.MethodInvocation):
        if 'assert' in call.member.lower():
            return True
    # Java's built-in ``assert`` statement.
    for _ in node.filter(javalang.tree.AssertStatement):
        return True
    return False


def extract_methods_and_tests(repo_dir):
    """Extract Java methods and same-file test methods from a repository.

    Every ``.java`` file under ``repo_dir`` is parsed with javalang. Each
    method found in a file is paired with every method of that same file that
    looks like a test (name contains "test", carries a JUnit test annotation,
    or contains an assertion). The flattened (method, test) pairs are written
    to ``<repo name>.csv`` in the current working directory.

    Files that cannot be read, decoded as UTF-8 or parsed are skipped.

    Args:
        repo_dir: Path of the checked-out repository.

    Returns:
        A list of dicts with keys ``method_name``, ``method_code`` and
        ``test_cases`` (a list of dicts with ``test_name`` and ``test_code``).
    """
    methods_and_tests = []
    total_methods = 0
    total_test_cases = 0

    repo_name = extract_repo_name(repo_dir)
    print(f"Repository: {repo_name}")

    for root, _, files in os.walk(repo_dir):
        for file in files:
            if not file.endswith('.java'):
                continue

            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    code = f.read()
                tree = javalang.parse.parse(code)
            except (OSError, UnicodeDecodeError,
                    javalang.parser.JavaSyntaxError,
                    javalang.tokenizer.LexerError):
                # Best effort: one unreadable or unparsable file must not
                # abort the extraction for the whole repository.
                continue

            source_lines = code.splitlines()
            methods = [node for _, node in tree.filter(javalang.tree.MethodDeclaration)]

            # The set of test methods is the same for every method in the
            # file, so collect it once instead of once per method.
            test_cases = []
            for test_node in methods:
                if not _is_test_method(test_node):
                    continue
                test_code = _method_source(test_node, source_lines)
                if test_code is None:
                    continue
                test_cases.append({
                    'test_name': test_node.name,
                    'test_code': test_code
                })

            for node in methods:
                method_code = _method_source(node, source_lines)
                if method_code is None:
                    continue
                methods_and_tests.append({
                    'method_name': node.name,
                    'method_code': method_code,
                    # Copy so every entry owns its list, as callers may mutate it.
                    'test_cases': list(test_cases)
                })
                total_methods += 1
                total_test_cases += len(test_cases)

    print(f"Extraction finished from {repo_name}")
    print(f"Total methods extracted: {total_methods}")
    print(f"Total test cases extracted: {total_test_cases}")

    data = []
    for item in methods_and_tests:
        for test in item['test_cases']:
            data.append({
                'method_name': item['method_name'],
                'method_code': item['method_code'],
                'test_name': test['test_name'],
                'test_code': test['test_code']
            })

    # Explicit columns keep the header present even when no pair was found.
    df_methods_and_tests = pd.DataFrame(
        data, columns=['method_name', 'method_code', 'test_name', 'test_code'])
    df_methods_and_tests.to_csv(f"{repo_name}.csv", index=False, encoding='utf-8-sig', escapechar='\\', errors='ignore')

    print(f"CSV file saved for {repo_name}")
    return methods_and_tests

In [10]:
def zip_csv_files(directory, zip_filename=None):
    """Bundle every ``.csv`` file found directly in ``directory`` into a zip archive.

    Args:
        directory: Folder to scan (non-recursively) for ``.csv`` files.
        zip_filename: Path of the archive to create. When omitted it defaults
            to ``Range_<start_range>_<end_range>.zip`` in the current working
            directory, built from the notebook-level range variables.

    Returns:
        The path of the archive that was written, or ``None`` when the
        directory contains no CSV files.
    """
    csv_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.csv') and os.path.isfile(os.path.join(directory, name))
    )
    if not csv_files:
        print("No CSV files found in the directory.")
        return None

    if zip_filename is None:
        zip_filename = f'Range_{start_range}_{end_range}.zip'

    # ZipFile stores members uncompressed unless a method is requested;
    # DEFLATE shrinks the text-heavy CSV files considerably.
    with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for csv_file in csv_files:
            file_path = os.path.join(directory, csv_file)
            # Store each file under its bare name, without the directory prefix.
            zipf.write(file_path, csv_file)

    print(f"Zip file '{zip_filename}' created successfully.")
    return zip_filename

In [11]:
def SelectRepo(start,end):
    """Return the names and HTML URLs of the repositories in rows ``start:end``.

    Both lists are taken from the notebook-level ``repo_df`` table using
    ordinary Python slice semantics, and are returned as a
    ``(names, urls)`` pair.
    """
    def column_slice(column):
        # Convert the whole column to a list, then cut out the requested range.
        return repo_df[column].tolist()[start:end]

    return column_slice('Name'), column_slice('HTMLURL')

In [12]:
def Pipeline(start,end):
    """Clone, mine and delete each repository in rows ``start:end`` of the list.

    For every selected repository the checkout is created under the
    notebook-level ``target_dir``, its methods and tests are extracted to a
    CSV file, and the checkout is removed again to free disk space.

    Args:
        start: First row of the repository list to process (inclusive).
        end: Row at which to stop (exclusive).
    """
    repo_names,repo_urls = SelectRepo(start,end)
    print(f"Selected Repos Name : {repo_names}")

    for repo in repo_urls:
        path = clone_repos(repo,target_dir)

        # clone_repos returns the target path even when the clone failed, so
        # make sure there is a checkout before mining or deleting it;
        # otherwise one unreachable repository would abort the whole batch.
        if not os.path.isdir(path):
            print(f"Skipping {repo}: no checkout found at '{path}'.")
            continue

        try:
            print(f"Looking into {path} ...")
            extract_methods_and_tests(path)
        finally:
            # Always remove the checkout, even if extraction raised, so that
            # failed runs do not fill up the disk.
            shutil.rmtree(path)
            print(f"Folder '{path}' deleted.")

In [None]:
# Run the clone / extract / delete loop over the configured slice of repositories.
Pipeline(start_range,end_range)

Selected Repos Name : ['truth', 'Create', 'proguard', 'cucumber-jvm', 'commons-lang', 'classgraph', 'cruise-control', 'Omni-Notes', 'PlayEdu', 'HackerRank_solutions', 'configuration-as-code-plugin', 'SikuliX1', 'easy_javadoc', 'spring-batch', 'TTS', 'javacpp-presets', 'JavaVerbalExpressions', 'code-examples', 'HaE', 'mongo-java-driver', 'netbeans', 'commafeed', 'incubator-hugegraph', 'assertj', 'cordova-plugin-local-notifications', 'BigDataGuide', 'teavm', 'TIMSDK', 'reactor-netty', 'sonic-server', 'pdfbox', 'spring-ai', 'webdrivermanager', 'symphony', 'android-chat', 'parquet-java', 'material-menu', 'lucene', 'erupt', 'rest.li', 'dependency-track', 'http-kit', 'Lealone', 'jimfs', 'logstash-logback-encoder', 'connectbot', 'freeplane', 'sofa-bolt', 'android-samples', 'pac4j', 'alldata', 'eclipse-collections', 'Smack', 'material-ripple', 'jcasbin', 'brave', 'jeromq', 'LakeSoul', 'xpipe', 'hypersistence-utils', 'java-sec-code', 'kafka-streams-examples', 'autopsy', 'spring-integration-samp

In [None]:
# Bundle the CSV files found directly in /content/ into a single zip archive.
zip_csv_files('/content/')

In [None]:
# zip_csv_files names its archive Range_<start_range>_<end_range>.zip and
# writes it to the working directory, so move that file to Drive
# (no 'csv_files.zip' is ever produced).
shutil.move(f'Range_{start_range}_{end_range}.zip', '/content/drive/MyDrive/Colab Notebooks')

In [None]:
# Fixed completion message; it does not itself verify that the earlier cells succeeded.
print('Success')