<a href="https://colab.research.google.com/github/WaliMuhammadAhmad/Llama-2-7B-Fine-Tuning-for-Unit-Test-Generation/blob/main/Dataset%5BColab_Version%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install javalang

Collecting javalang
  Downloading javalang-0.13.0-py3-none-any.whl (22 kB)
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [72]:
import pandas as pd
import subprocess
import javalang
import zipfile
import shutil
import glob
import csv
import sys
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Define variables

In [74]:
start_range = 1
end_range = 100

zip_filename = f'Range_{start_range}_{end_range}.zip'

target_dir = '/content/repos'

In [7]:
repo_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/RepoInfo.csv')
repo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ProjectID  1020 non-null   int64 
 1   Name       1020 non-null   object
 2   FullName   1020 non-null   object
 3   Forks      1020 non-null   int64 
 4   Stars      1020 non-null   int64 
 5   Watchers   1020 non-null   int64 
 6   URL        1020 non-null   object
 7   HTMLURL    1020 non-null   object
dtypes: int64(4), object(4)
memory usage: 63.9+ KB


In [47]:
repo_df.head(start_range)

Unnamed: 0,ProjectID,Name,FullName,Forks,Stars,Watchers,URL,HTMLURL
0,132464395,JavaGuide,Snailclimb/JavaGuide,45411,144730,4522,https://api.github.com/repos/Snailclimb/JavaGuide,https://github.com/Snailclimb/JavaGuide
1,206462776,GitHub-Chinese-Top-Charts,GrowingGit/GitHub-Chinese-Top-Charts,12600,94520,2577,https://api.github.com/repos/GrowingGit/GitHub...,https://github.com/GrowingGit/GitHub-Chinese-T...
2,22790488,java-design-patterns,iluwatar/java-design-patterns,26207,88316,3780,https://api.github.com/repos/iluwatar/java-des...,https://github.com/iluwatar/java-design-patterns
3,561730219,hello-algo,krahets/hello-algo,10168,79878,459,https://api.github.com/repos/krahets/hello-algo,https://github.com/krahets/hello-algo


In [9]:
os.makedirs(target_dir, exist_ok=True)
print(f"{target_dir} created")

/content/repos dir created


### Define Functions

In [42]:
def clone_repos(repo_url, target_dir):

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    try:
        repo_name = repo_url.split('/')[-1].replace('.git', '')
        repo_path = os.path.join(target_dir, repo_name)

        if os.path.exists(repo_path):
            print(f"Repository {repo_name} already exists. Skipping clone.")
            return repo_path

        print(f"Cloning {repo_url} into {repo_path}...")
        subprocess.run(['git', 'clone', repo_url, repo_path], check=True)
        print(f"Successfully cloned {repo_url}")
    except subprocess.CalledProcessError as e:
        print(f"Failed to clone {repo_url}: {e}")

    return repo_path

In [64]:
def extract_methods_and_tests(repo_dir):

    methods_and_tests = []
    total_methods = 0
    total_test_cases = 0
    sys.setrecursionlimit(1000)

    repo_name = last_name = os.path.basename(repo_dir)

    print(f"Looking into Repository : {repo_name}")

    for root, _, files in os.walk(repo_dir):
        for file in files:
            if file.endswith('.java'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    code = f.read()
                    try:
                        tree = javalang.parse.parse(code)
                        for path, node in tree.filter(javalang.tree.MethodDeclaration):
                            method_name = node.name
                            start_line = node.position.line - 1
                            end_line = start_line
                            if node.body:
                                end_line = max(statement.position.line for statement in node.body if statement.position) - 1

                            # Extracting class declaration
                            class_decl = None
                            for _, n in tree.filter(javalang.tree.ClassDeclaration):
                                if n.position.line <= start_line + 1:
                                    class_decl = n
                                else:
                                    break

                            if class_decl:
                                class_start_line = class_decl.position.line - 1
                                class_end_line = end_line
                                class_code = '\n'.join(code.splitlines()[class_start_line:class_end_line + 1])
                            else:
                                class_code = ''

                            method_code = '\n'.join(code.splitlines()[start_line:end_line + 1])
                            method_body = method_code

                            test_cases = []
                            for _, test_node in tree.filter(javalang.tree.MethodDeclaration):
                                if (method_name.lower() in test_node.name.lower() and
                                    any('assert' in statement if isinstance(statement, javalang.tree.StatementExpression) else False for statement in (test_node.body or []))):

                                    test_start_line = test_node.position.line - 1
                                    test_end_line = test_start_line
                                    if test_node.body:
                                        test_end_line = max(statement.position.line for statement in test_node.body if statement.position) - 1

                                    test_code = '\n'.join(code.splitlines()[test_start_line:test_end_line + 1])
                                    test_cases.append(test_code)

                            if test_cases:
                                methods_and_tests.append({
                                    'method_name': method_name,
                                    'method_full_class': class_code,
                                    'method_body': method_body,
                                    'test_cases': test_cases
                                })
                                total_methods += 1
                                total_test_cases += len(test_cases)
                    except (javalang.parser.JavaSyntaxError, javalang.tokenizer.LexerError) as e:
#                         print(f"Error parsing {file_path}: {e}")
                        continue

    print("<-- Extraction finished -->")
    print(f"Total methods extracted: {total_methods}")
    print(f"Total test cases extracted: {total_test_cases}")

    if not methods_and_tests:
        print(f"No methods or test cases found in the {repo_name}... Moving to next One")
        return

    data = []
    for item in methods_and_tests:
        for test in item['test_cases']:
            data.append({
                'fn_name': item['method_name'],
                'fn_fc': item['method_full_class'],
                'fn': item['method_body'],
                'target': test
            })

    df_methods_and_tests = pd.DataFrame(data)
    df_methods_and_tests.to_csv(f"{repo_name}.csv", index=False, encoding='utf-8-sig', escapechar='\\', errors='ignore')

    print(f"CSV file saved for {repo_name}")
    return methods_and_tests

In [17]:
def zip_csv_files(directory):

    csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
    if not csv_files:
        print("No CSV files found in the directory.")
        return None

    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for csv_file in csv_files:
            file_path = os.path.join(directory, csv_file)
            zipf.write(file_path, csv_file)

    print(f"Zip file '{zip_filename}' created successfully.")

In [18]:
def SelectRepo(start,end):
    repo_names = repo_df['Name'].tolist()[start:end]
    repo_urls = repo_df['HTMLURL'].tolist()[start:end]

    return repo_names,repo_urls

In [19]:
def Pipeline(start,end):
    repo_names,repo_urls = SelectRepo(start,end)
    print(f"Selected Repos Name : {repo_names}")

    for repo in repo_urls:
        path = clone_repos(repo,target_dir)
        extract_methods_and_tests(path)
        shutil.rmtree(path)
        print(f"Folder '{path}' deleted.")

    zip_csv_files('/content/')

In [65]:
Pipeline(start_range,end_range)

Selected Repos Name : ['java-design-patterns']
Cloning https://github.com/iluwatar/java-design-patterns into /content/repos/java-design-patterns...
Successfully cloned https://github.com/iluwatar/java-design-patterns
Looking into /content/repos/java-design-patterns ...
Looking into Repository : java-design-patterns
<-- Extraction finished -->
Total methods extracted: 0
Total test cases extracted: 0
No methods or test cases found in the java-design-patterns... Moving to next One
Folder '/content/repos/java-design-patterns' deleted.


In [82]:
zip_exists = any(glob.glob('/content/*.zip'))
print(zip_exists)

False


In [83]:
if zip_exists:
  shutil.move(f'/content/{zip_filename}', '/content/drive/MyDrive/Colab Notebooks/Dataset')
  print(f"{zip_filename} moved to Drive")
else:
  csv_files = [f for f in os.listdir('/content/') if f.endswith('.csv')]
  for csv_file in csv_files:
    shutil.move(f'/content/{csv_file}', '/content/drive/MyDrive/Colab Notebooks/Dataset')
    print(f"{csv_file} moved to Drive")

mall.csv moved to Drive
java-design-patterns.csv moved to Drive


In [86]:
!rmdir /content/repos

In [87]:
from google.colab import drive
drive.flush_and_unmount()

In [84]:
print('Success')

Success
