Dataset Preparation from the GPTCloneBench Dataset

In [None]:
import os
from pathlib import Path
import re
import pandas as pd
import numpy as np

In [None]:
# Split file
def split_file(content):
    """Split the text of one dataset file into its two code fragments.

    The text is cut at the first occurrence of three consecutive newline
    characters (two blank lines). Everything before that separator is the
    first fragment, everything after it is the second; surrounding whitespace
    is removed from both. When the separator is absent the second fragment is
    an empty string.

    Args:
        content: Full text of the file.

    Returns:
        A ``(clone1, clone2)`` tuple of stripped strings.
    """
    first, _separator, second = content.partition("\n\n\n")
    # str.strip() with no argument already removes newlines along with other whitespace.
    return first.strip(), second.strip()

In [None]:
# Load the files from the dataset
def load_clone_files(root_dir, languages, verbose=True):
    """Load every clone-pair file under ``<root_dir>/standalone`` into a DataFrame.

    The function walks ``true_semantic_clones/<lang>`` and
    ``false_semantic_clones/<lang>`` for each requested language, keeps the
    files whose path matches the expected naming scheme
    (``[prompt_<n>/][<clonetype>/]Clone_<id>.<ext>`` or
    ``Gpt_false_pair_<id>.<ext>``), and splits each file into two fragments
    with ``split_file``.

    Files are visited in sorted path order so the resulting row order does not
    depend on the order in which the filesystem happens to list them;
    downstream seeded sampling relies on a stable row order.

    Args:
        root_dir: Directory that contains the ``standalone`` folder.
        languages: Iterable of language folder names (e.g. ``('py', 'java', 'cs')``).
            Only files whose extension equals the language name are read.
        verbose: When True (default) one or more lines are printed per file.
            Pass False to keep only directory-level messages and warnings.

    Returns:
        pandas.DataFrame with one row per file and the columns ``file_path``,
        ``language``, ``clone_category``, ``clone1``, ``clone2``, ``prompt``
        and ``clonetype`` (the last two are None when the path has no such
        component).

    Raises:
        FileNotFoundError: ``<root_dir>/standalone`` does not exist.
        ValueError: no file could be loaded.
    """
    base_path = Path(root_dir) / "standalone"

    if not base_path.exists():
        raise FileNotFoundError(f"Base directory not found: {base_path}")
    print("Base Path:", base_path.absolute())

    def log(message):
        # Per-file chatter; silenced with verbose=False.
        if verbose:
            print(message)

    file_pattern = re.compile(
        r'.*(true_semantic_clones|false_semantic_clones)[\\/]'
        r'(?P<language>py|java|cs)[\\/]'
        r'(?:prompt_(?P<prompt>\d+)[\\/])?'
        r'(?:(?P<clonetype>MT3|T4|T1|T2)[\\/])?'
        r'(?:Clone_|Gpt_false_pair_)(?P<id>\d+)\.(py|java|cs)$',
        re.IGNORECASE
    )

    clones = []
    processed_files = 0

    for clone_type in ['true_semantic_clones', 'false_semantic_clones']:
        type_path = base_path / clone_type

        if not type_path.exists():
            print(f"Directory not found - {type_path}")
            continue
        print(f"\nProcessing {clone_type} at {type_path}")

        for lang in languages:
            lang_path = type_path / lang

            if not lang_path.exists():
                print(f"  Directory not found - {lang_path}")
                continue
            print(f"  Processing {lang} files at {lang_path}")

            expected_suffix = f".{lang.lower()}"

            # Sort so that the row order is reproducible across runs and machines.
            for file in sorted(lang_path.glob("**/*.*")):
                # The glob also matches directories whose name contains a dot.
                if not file.is_file():
                    continue

                file_str = str(file)
                log(f"    Processing: {file_str}")

                if not file_str.lower().endswith(expected_suffix):
                    log(f"    Skipping non-{lang} file")
                    continue

                match = file_pattern.search(file_str)
                if not match:
                    log(f"    File path doesn't match expected pattern")
                    continue

                groups = match.groupdict()
                log(f"    Matched groups: {groups}")

                try:
                    with open(file, 'r', encoding='utf-8') as f:
                        content = f.read()
                except (OSError, UnicodeDecodeError) as e:
                    # Unreadable or non-UTF-8 files are reported and skipped.
                    print(f"    Failed to read file: {str(e)}")
                    continue

                clone1, clone2 = split_file(content)

                if not clone1 or not clone2:
                    print(f"Could not split {lang} content into two clones")
                    continue

                clones.append({
                    'file_path': file_str,
                    'language': lang.lower(),
                    'clone_category': clone_type.lower(),
                    'clone1': clone1,
                    'clone2': clone2,
                    'prompt': groups['prompt'],
                    'clonetype': groups['clonetype'],
                })
                processed_files += 1
                log(f"    File {processed_files} processed successfully!")

    if not clones:
        raise ValueError("No valid clone files were processed")

    print("Clones", len(clones))
    df = pd.DataFrame(clones)
    print(f"\nSuccessfully processed {len(df)} clone pairs")
    return df

In [None]:
# Root folder of the dataset; load_clone_files expects a "standalone"
# sub-directory beneath it. NOTE(review): the path looks like a mounted
# Google Drive in Colab - adjust when running elsewhere.
dataset_path = r"/content/drive/MyDrive/SEM3_DISSERTATION"

print("Loading clone files...")
# C# ('cs') is loaded in addition to Python and Java.
clones_df = load_clone_files(dataset_path, languages=('py', 'java', 'cs'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    Matched groups: {'language': 'cs', 'prompt': None, 'clonetype': None, 'id': '2426'}
    File 42310 processed successfully!
    Processing: /content/drive/MyDrive/SEM3_DISSERTATION/standalone/false_semantic_clones/cs/Gpt_false_pair_4462.cs
    Matched groups: {'language': 'cs', 'prompt': None, 'clonetype': None, 'id': '4462'}
    File 42311 processed successfully!
    Processing: /content/drive/MyDrive/SEM3_DISSERTATION/standalone/false_semantic_clones/cs/Gpt_false_pair_4173.cs
    Matched groups: {'language': 'cs', 'prompt': None, 'clonetype': None, 'id': '4173'}
    File 42312 processed successfully!
    Processing: /content/drive/MyDrive/SEM3_DISSERTATION/standalone/false_semantic_clones/cs/Gpt_false_pair_1917.cs
    Matched groups: {'language': 'cs', 'prompt': None, 'clonetype': None, 'id': '1917'}
    File 42313 processed successfully!
    Processing: /content/drive/MyDrive/SEM3_DISSERTATION/standalone/false_seman

In [None]:
# Preview the first loaded record to check the column layout.
clones_df.head(1)

Unnamed: 0,file_path,language,clone_category,clone1,clone2,prompt,clonetype
0,/content/drive/MyDrive/SEM3_DISSERTATION/stand...,py,true_semantic_clones,"def __getitem__(self, key) :\n\tif isinstance(...","def __getitem__(self, key) :\n\tif isinstance(...",2,MT3


Balance the dataset

In [None]:
def balance_clone_data(df):
    """Down-sample the clone pairs so that every language is equally represented.

    * False semantic clones are balanced per language: each language is
      sampled down to the size of the smallest language.
    * True semantic clones are balanced per ``(prompt, clonetype)``
      combination: within each combination every language present is sampled
      down to the size of the smallest language in that combination. The
      minimum is taken over the languages that occur in the combination, so a
      language that is missing from a combination does not reduce the others.

    Sampling uses ``random_state=42`` and is therefore repeatable for a given
    row order of ``df``.

    Args:
        df: DataFrame with at least the columns ``clone_category``,
            ``language``, ``prompt`` and ``clonetype``.

    Returns:
        A new DataFrame (true clones first, then false clones) with a fresh
        0..n-1 index. When ``df`` contains no rows of either category an empty
        DataFrame with the same columns is returned.
    """
    true_df = df[df['clone_category'] == 'true_semantic_clones']
    false_df = df[df['clone_category'] == 'false_semantic_clones']

    # Process false semantic clones (balance by language only)
    balanced_false = []
    if len(false_df) > 0:
        false_groups = false_df.groupby('language')
        false_min_size = false_groups.size().min()
        print(f"Balancing false semantic clones to {false_min_size} per language")

        # Every group holds at least `false_min_size` rows because that value
        # is the minimum over these same groups.
        balanced_false = [
            group.sample(false_min_size, random_state=42)
            for _, group in false_groups
        ]
    else:
        print("No false semantic clones found")

    # Process true semantic clones (balance by prompt+clonetype across languages)
    balanced_true = []
    if len(true_df) > 0:
        # groupby drops rows whose key is missing; make that loss visible.
        unlabeled = int(true_df[['prompt', 'clonetype']].isna().any(axis=1).sum())
        if unlabeled:
            print(f"Warning: {unlabeled} true semantic clones have no prompt/clonetype and are excluded")

        for (prompt, clonetype), group in true_df.groupby(['prompt', 'clonetype']):
            lang_groups = group.groupby('language')
            min_size = lang_groups.size().min()

            print(f"Balancing (prompt={prompt}, clonetype={clonetype}) to {min_size} per language")

            balanced_true.extend(
                lang_group.sample(min_size, random_state=42)
                for _, lang_group in lang_groups
            )
    else:
        print("No true semantic clones found")

    # Combine results
    frames = balanced_true + balanced_false
    if not frames:
        # pd.concat refuses an empty list; hand back an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    balanced_df = pd.concat(frames).reset_index(drop=True)

    # Verification
    print("\nFinal Distribution:")
    if len(balanced_true) > 0:
        print("True semantic clones:")
        print(balanced_df[balanced_df['clone_category'] == 'true_semantic_clones']
              .groupby(['language', 'prompt', 'clonetype']).size())
    if len(balanced_false) > 0:
        print("\nFalse semantic clones:")
        print(balanced_df[balanced_df['clone_category'] == 'false_semantic_clones']
              .groupby('language').size())

    return balanced_df

In [None]:
# Down-sample the loaded pairs so each language contributes equally.
balanced_df = balance_clone_data(clones_df)

Balancing false semantic clones to 4231 per language
Balancing (prompt=1, clonetype=MT3) to 721 per language
Balancing (prompt=1, clonetype=T4) to 310 per language
Balancing (prompt=2, clonetype=MT3) to 2389 per language
Balancing (prompt=2, clonetype=T4) to 5196 per language

Final Distribution:
True semantic clones:
language  prompt  clonetype
cs        1       MT3           721
                  T4            310
          2       MT3          2389
                  T4           5196
java      1       MT3           721
                  T4            310
          2       MT3          2389
                  T4           5196
py        1       MT3           721
                  T4            310
          2       MT3          2389
                  T4           5196
dtype: int64

False semantic clones:
language
cs      4231
java    4231
py      4231
dtype: int64


In [None]:
# Rows per language in the balanced dataset (true and false clones together);
# 'cs' is counted alongside Python and Java.
python_count, java_count, cs_count = (
    len(balanced_df[balanced_df['language'] == code])
    for code in ('py', 'java', 'cs')
)

print("Python entries:", python_count)
print("Java entries:", java_count)
print("C# entries:", cs_count)

Python entries: 12847
Java entries: 12847
C# entries: 12847


In [None]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

Non-Clone Pair Creation using Normalized Levenshtein Distance

In [None]:
import Levenshtein

# Normalized Levenshtein distance ([0,1])
def normalized_levenshtein(s1, s2):
    """Return the Levenshtein distance of two strings scaled by the longer length.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        ``Levenshtein.distance(s1, s2) / max(len(s1), len(s2))``, or 0 when
        both strings are empty.
    """
    edit_distance = Levenshtein.distance(s1, s2)
    longest = max(len(s1), len(s2))
    if longest > 0:
        return edit_distance / longest
    # Two empty strings: avoid dividing by zero.
    return 0
# Create Dissimilar Pairs
def create_dissimilar_pairs(df, similarity_threshold=0.5, languages=('py', 'java', 'cs')):
    """Build non-clone pairs by cross-matching fragments from different files.

    For each language, row ``i`` is paired with row ``n - 1 - i`` (the mirror
    position in that language's rows): ``clone1`` of row ``i`` is combined
    with ``clone2`` of the mirrored row. A pair is kept only when the two rows
    come from different files, the same two texts have not already been kept,
    and their normalized Levenshtein distance is strictly greater than
    ``similarity_threshold``.

    Args:
        df: DataFrame with the columns ``language``, ``clone1``, ``clone2``
            and ``file_path``.
        similarity_threshold: Despite the name this is a *distance* cut-off;
            pairs at or below this normalized distance are discarded.
        languages: Language codes to process, in output order.

    Returns:
        DataFrame with the columns ``clone1``, ``clone2``, ``language``,
        ``is_semantic_clone`` (always 0) and ``levenshtein_distance``. An
        empty DataFrame with those columns is returned when nothing qualifies.
    """
    dissimilar_pairs = []

    for lang in languages:
        print(f"\nCreating dissimilar pairs for {lang}")
        lang_df = df[df['language'] == lang]

        print("Initial:", lang, len(lang_df))
        if lang_df.empty:
            print(f"No data for {lang}, skipping dissimilar pair creation")
            continue

        # Pull the columns out once; a .iloc[row] lookup per field inside the
        # loop builds a fresh Series every time.
        first_fragments = lang_df['clone1'].tolist()
        second_fragments = lang_df['clone2'].tolist()
        file_paths = lang_df['file_path'].tolist()
        num_pairs = len(first_fragments)

        pairs = []
        used_pairs = set()

        for i in range(num_pairs):
            j = num_pairs - 1 - i
            # Skip the middle row (paired with itself) and rows from the same file.
            if i == j or file_paths[i] == file_paths[j]:
                continue

            clone1 = first_fragments[i]
            clone2 = second_fragments[j]

            # Order-insensitive key so (a, b) and (b, a) count as the same pair.
            pair_key = tuple(sorted((clone1, clone2)))
            if pair_key in used_pairs:
                continue

            distance = normalized_levenshtein(clone1, clone2)
            if distance > similarity_threshold:
                pairs.append({
                    'clone1': clone1,
                    'clone2': clone2,
                    'language': lang,
                    'is_semantic_clone': 0,
                    'levenshtein_distance': distance
                })
                used_pairs.add(pair_key)

        pairs_df = pd.DataFrame(pairs)

        if pairs_df.empty:
            print(f"  {lang}: No dissimilar pairs generated")
            continue

        pairs_df = pairs_df.drop_duplicates(subset=['clone1', 'clone2'])
        print(f"  {lang}: Generated {len(pairs_df)} pairs")
        print(f"  - Confirmed dissimilar (0): {len(pairs_df)}")
        print(f"  - Average Levenshtein distance: {pairs_df['levenshtein_distance'].mean():.2f}")

        dissimilar_pairs.append(pairs_df)

    if not dissimilar_pairs:
        print("No dissimilar pairs generated")
        return pd.DataFrame(columns=['language', 'clone1', 'clone2', 'is_semantic_clone', 'levenshtein_distance'])

    return pd.concat(dissimilar_pairs, ignore_index=True)

In [None]:
# Build the non-clone (label 0) pairs from the balanced clone pairs.
dissimilar_pairs = create_dissimilar_pairs(balanced_df)


Creating dissimilar pairs for py
Initial: py 12847
  py: Generated 12830 pairs
  - Confirmed dissimilar (0): 12830
  - Average Levenshtein distance: 0.80

Creating dissimilar pairs for java
Initial: java 12847
  java: Generated 12787 pairs
  - Confirmed dissimilar (0): 12787
  - Average Levenshtein distance: 0.75

Creating dissimilar pairs for cs
Initial: cs 12847
  cs: Generated 12810 pairs
  - Confirmed dissimilar (0): 12810
  - Average Levenshtein distance: 0.76


Combine Clone + Non-Clone Pairs

In [None]:
def combine_dataset(df, dissimilar_pairs):
    """Stack the clone pairs (label 1) on top of the dissimilar pairs.

    Every row of ``df`` is labelled ``is_semantic_clone = 1``; the rows of
    ``dissimilar_pairs`` keep the label they already carry. Neither input
    DataFrame is modified.

    Args:
        df: Clone pairs with the columns ``language``, ``clone1``, ``clone2``.
        dissimilar_pairs: Pairs with the columns ``language``, ``clone1``,
            ``clone2`` and ``is_semantic_clone``.

    Returns:
        DataFrame with the columns ``language``, ``clone1``, ``clone2`` and
        ``is_semantic_clone`` and a fresh 0..n-1 index; rows from ``df`` come
        first.

    Raises:
        ValueError: both inputs are empty.
    """
    if df.empty and dissimilar_pairs.empty:
        raise ValueError("Both input DataFrames are empty")

    pair_columns = ['language', 'clone1', 'clone2']

    # assign() returns a new frame, so the caller's DataFrame keeps its columns.
    original_df = df[pair_columns].assign(is_semantic_clone=1)
    dissimilar_df = dissimilar_pairs[pair_columns + ['is_semantic_clone']]

    final_df = pd.concat([original_df, dissimilar_df], ignore_index=True)

    print(f"Final dataset size: {len(final_df)} pairs")
    print(f"  Original pairs: {len(original_df)}")
    print(f"  Dissimilar pairs: {len(dissimilar_pairs)}")

    return final_df

In [None]:
print("\nCreating final dataset...")
# Label the balanced pairs as clones and append the dissimilar pairs.
final_df = combine_dataset(balanced_df, dissimilar_pairs)


Creating final dataset...
Final dataset size: 76968 pairs
  Original pairs: 38541
  Dissimilar pairs: 38427


In [None]:
# Persist the combined dataset and print a short summary of it.

# Destination CSV file
output_path = "/content/drive/MyDrive/SEM3_DISSERTATION/output/Updated/clone_dataset.csv"

# Folder that will hold the CSV
output_dir = Path(output_path).parent

# Make sure the folder, and any missing parent folders, exist before writing
output_dir.mkdir(parents=True, exist_ok=True)

# Write the rows without the index column
final_df.to_csv(output_path, index=False)
print(f"\nDataset successfully saved to {output_path}")

# First rows as a preview
print("\nSample of the dataset:")
print(final_df.head())

# Number of pairs carrying each label
print("\nLabel distribution:")
print(final_df['is_semantic_clone'].value_counts())


Dataset successfully saved to /content/drive/MyDrive/SEM3_DISSERTATION/output/Updated/clone_dataset.csv

Sample of the dataset:
  language                                             clone1  \
0       cs  public void doWork () {\n    int h = 0;\n    d...   
1       cs  private static bool GetFileNameFromHandle (Int...   
2       cs  private void button6_Click (object sender, Eve...   
3       cs  public static string Generate () {\n    var pw...   
4       cs  public override int Read (char [] buffer, int ...   

                                              clone2  is_semantic_clone  
0  public void doWork()\n{\n    int h = 0;\n    w...                  1  
1  private static bool GetFileNameFromHandle (Int...                  1  
2  private void button6_Click (object sender, Eve...                  1  
3  public static string Generate () {\n    var pw...                  1  
4  public override int Read(char [] buffer, int i...                  1  

Label distribution:
is_semantic_clo