In [None]:
import pandas as pd

# Merge SC_4label.csv into SC_Vuln_8label.csv, export the duplicated rows for
# manual inspection, and save the de-duplicated result.

# Load datasets
sc_vuln_8label = pd.read_csv("SC_Vuln_8label.csv")
sc_4label = pd.read_csv("SC_4label.csv")

# Print original dataset sizes
print(f"Original SC_Vuln_8label.csv rows: {len(sc_vuln_8label)}")
print(f"Original SC_4label.csv rows: {len(sc_4label)}")

# Translate the label paths found in SC_4label.csv into the "./Dataset/..." form
# (presumably the form used by SC_Vuln_8label.csv -- confirm against that file).
label_mapping = {
    "/content/drive/My Drive/SC_Dataset/dangerous delegatecall (DE)/": "./Dataset/dangerous delegatecall (DE)/",
    "/content/drive/My Drive/SC_Dataset/integer overflow (OF)/": "./Dataset/integer overflow (OF)/",
    "/content/drive/My Drive/SC_Dataset/reentrancy (RE)/": "./Dataset/reentrancy (RE)/",
    "/content/drive/My Drive/SC_Dataset/timestamp dependency (TP)/": "./Dataset/timestamp dependency (TP)/"
}

# Integer code assigned to each translated label path.
label_encoded_mapping = {
    "./Dataset/dangerous delegatecall (DE)/": 1,
    "./Dataset/integer overflow (OF)/": 4,
    "./Dataset/reentrancy (RE)/": 5,
    "./Dataset/timestamp dependency (TP)/": 6
}

# Series.map() silently turns every value that is missing from the dict into NaN.
# Such rows would end up with NaN in both "label" and "label_encoded", and since
# duplicated() treats NaN keys as equal they could be collapsed together below.
# Fail loudly instead of corrupting the merged dataset.
unmapped_rows = ~sc_4label["label"].isin(list(label_mapping))
if unmapped_rows.any():
    unknown_labels = sorted(sc_4label.loc[unmapped_rows, "label"].astype(str).unique())
    raise ValueError(
        f"SC_4label.csv contains labels with no entry in label_mapping: {unknown_labels}"
    )

# Update labels in SC_4label.csv
sc_4label["label"] = sc_4label["label"].map(label_mapping)
sc_4label["label_encoded"] = sc_4label["label"].map(label_encoded_mapping)

# Merge datasets
merged_df = pd.concat([sc_vuln_8label, sc_4label], ignore_index=True)
print(f"Total rows after merging (before removing duplicates): {len(merged_df)}")

# Flag every row whose (filename, label_encoded) pair already appeared earlier.
# The mask is computed once and reused for both the report and the removal.
is_duplicate = merged_df.duplicated(subset=["filename", "label_encoded"], keep="first")
duplicates = merged_df[is_duplicate]

# Print and save duplicate rows for manual inspection
print(f"Duplicate rows detected: {len(duplicates)}")
duplicates.to_csv("Duplicate_Rows.csv", index=False)
print("Duplicate rows saved as 'Duplicate_Rows.csv' for manual inspection.")

# Remove duplicates (keeps the first occurrence of each key pair)
merged_no_duplicates = merged_df[~is_duplicate]

# Print final dataset size
print(f"Total rows after removing duplicates: {len(merged_no_duplicates)}")
print(f"Number of duplicate rows removed: {len(merged_df) - len(merged_no_duplicates)}")

# Save the merged dataset
merged_no_duplicates.to_csv("Merged_SC_Dataset.csv", index=False)

print("Merging complete! Saved as 'Merged_SC_Dataset.csv'.")


In [None]:
import pandas as pd

# Build a side-by-side comparison of the "code" values of rows that share the
# same (filename, label_encoded) pair, and save the first 200 comparisons.

# Load datasets
sc_vuln_8label = pd.read_csv("SC_Vuln_8label.csv")
sc_4label = pd.read_csv("SC_4label.csv")

# Translate the label paths found in SC_4label.csv into the "./Dataset/..." form
# (presumably the form used by SC_Vuln_8label.csv -- confirm against that file).
label_mapping = {
    "/content/drive/My Drive/SC_Dataset/dangerous delegatecall (DE)/": "./Dataset/dangerous delegatecall (DE)/",
    "/content/drive/My Drive/SC_Dataset/integer overflow (OF)/": "./Dataset/integer overflow (OF)/",
    "/content/drive/My Drive/SC_Dataset/reentrancy (RE)/": "./Dataset/reentrancy (RE)/",
    "/content/drive/My Drive/SC_Dataset/timestamp dependency (TP)/": "./Dataset/timestamp dependency (TP)/"
}

# Integer code assigned to each translated label path.
label_encoded_mapping = {
    "./Dataset/dangerous delegatecall (DE)/": 1,
    "./Dataset/integer overflow (OF)/": 4,
    "./Dataset/reentrancy (RE)/": 5,
    "./Dataset/timestamp dependency (TP)/": 6
}

# Series.map() silently turns every value that is missing from the dict into NaN,
# which would leave rows without a label or label code. Fail loudly instead.
unmapped_rows = ~sc_4label["label"].isin(list(label_mapping))
if unmapped_rows.any():
    unknown_labels = sorted(sc_4label.loc[unmapped_rows, "label"].astype(str).unique())
    raise ValueError(
        f"SC_4label.csv contains labels with no entry in label_mapping: {unknown_labels}"
    )

# Update labels in SC_4label.csv
sc_4label["label"] = sc_4label["label"].map(label_mapping)
sc_4label["label_encoded"] = sc_4label["label"].map(label_encoded_mapping)

# Merge datasets
merged_df = pd.concat([sc_vuln_8label, sc_4label], ignore_index=True)

# Select every row whose (filename, label_encoded) pair occurs more than once;
# keep=False marks all members of a duplicate group, so their code can be compared.
duplicates = merged_df[merged_df.duplicated(subset=["filename", "label_encoded"], keep=False)]

# Group by filename and label_encoded and collect each group's code values in a list
comparison_df = duplicates.groupby(["filename", "label_encoded"])["code"].apply(list).reset_index()

# Keep only the first two code values of each group: anything beyond the second
# is ignored, and a missing value is filled with None.
comparison_df["code_1"] = comparison_df["code"].apply(lambda codes: codes[0] if len(codes) > 0 else None)
comparison_df["code_2"] = comparison_df["code"].apply(lambda codes: codes[1] if len(codes) > 1 else None)

# Add a "match" column telling whether the two code values are equal.
# An element-wise comparison is used instead of DataFrame.apply(axis=1): on an
# empty frame (no duplicates at all) row-wise apply returns a DataFrame rather
# than a Series, which cannot be assigned to a single column and would raise.
comparison_df["match"] = [
    first == second
    for first, second in zip(comparison_df["code_1"], comparison_df["code_2"])
]

# Drop the intermediate list column
comparison_df = comparison_df.drop(columns=["code"])

# Only save the first 200 rows
comparison_df.head(200).to_csv("Duplicate_Code_Comparison.csv", index=False)

print("Saved first 200 duplicate code comparisons as 'Duplicate_Code_Comparison.csv'.")


In [None]:

# Show and export the first 100 duplicate rows, ordered by filename.

# Read back the duplicate rows stored in Duplicate_Rows.csv
duplicates_df = pd.read_csv("Duplicate_Rows.csv")

# Order the rows by filename
duplicates_sorted = duplicates_df.sort_values("filename")

# Take the first 100 rows once and reuse the slice for display and export
top_100_duplicates = duplicates_sorted.head(100)

print(top_100_duplicates)

top_100_duplicates.to_csv("Top100_Duplicate_Rows.csv", index=False)
print("Saved first 100 duplicate rows as 'Top100_Duplicate_Rows.csv'.")