In [None]:
import os
import shutil
import requests
from github import Github
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Configuration for the GitHub download step.
# The token is taken from the GITHUB_TOKEN environment variable so that a real
# credential never has to be written into the notebook; when the variable is
# not set, the original placeholder string is used unchanged.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "git_token")
g = Github(GITHUB_TOKEN)

# Folder that receives the downloaded .java files (created if missing).
OUTPUT_FOLDER = "data/java_train/"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Repository search query and the cap on the number of files to save.
QUERY = "language:Java stars:>100"
MAX_FILES = 5000

def _unique_file_name(name, used_names):
    """Return a file name not yet present in ``used_names`` and record it.

    ``name`` is returned unchanged when it has not been used; otherwise a
    numeric suffix is inserted before the extension (``Foo_1.java``,
    ``Foo_2.java``, ...). Names are compared in lower case so that two names
    differing only by case do not map to the same file on case-insensitive
    filesystems.
    """
    stem, ext = os.path.splitext(name)
    candidate = name
    suffix = 0
    while candidate.lower() in used_names:
        suffix += 1
        candidate = f"{stem}_{suffix}{ext}"
    used_names.add(candidate.lower())
    return candidate


def download_java_files(max_files=None, timeout=30):
    """Download ``.java`` files from repositories matching ``QUERY``.

    Repositories come from ``g.search_repositories`` sorted by stars in
    descending order. Each repository is walked directory by directory via
    ``repo.get_contents``; every entry whose name ends in ``.java`` is fetched
    from its ``download_url`` and written as UTF-8 text into ``OUTPUT_FOLDER``.

    Files that share a base name with one already saved during this run get a
    numeric suffix instead of overwriting the earlier file, so the number of
    files on disk matches the number counted.

    Args:
        max_files: Maximum number of files to save. ``None`` (the default)
            uses the module-level ``MAX_FILES``. A value <= 0 saves nothing.
        timeout: Seconds to wait on each HTTP download before giving up.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    limit = MAX_FILES if max_files is None else max_files
    if limit <= 0:
        return

    count = 0
    used_names = set()
    for repo in g.search_repositories(query=QUERY, sort="stars", order="desc"):
        try:
            contents = repo.get_contents("")
            while contents:
                file_content = contents.pop(0)
                if file_content.type == "dir":
                    contents.extend(repo.get_contents(file_content.path))
                elif file_content.name.endswith(".java"):
                    if not file_content.download_url:
                        # Nothing to fetch for this entry.
                        continue
                    try:
                        response = requests.get(file_content.download_url, timeout=timeout)
                        # Do not save an HTTP error page as Java source.
                        response.raise_for_status()
                    except requests.RequestException as e:
                        # A single failed file should not abandon the rest of the repo.
                        print(f"Error downloading {file_content.path}: {e}")
                        continue
                    file_name = _unique_file_name(file_content.name, used_names)
                    file_path = os.path.join(OUTPUT_FOLDER, file_name)
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(response.text)
                    print(f"Downloaded: {file_content.name}")
                    count += 1
                    if count >= limit:
                        return
        except Exception as e:
            # Best effort: report the failure and move on to the next repository.
            print(f"Error processing repo {repo.full_name}: {e}")

# Run the download; download_java_files prints one line per saved file and
# one line per repository that raised an error.
download_java_files()
print("Download complete.")

In [None]:
input_csv = "class.csv"
output_csv = "class_modified.csv"

# Load the input CSV, drop the 'file', 'class' and 'type' columns, and write
# the remaining columns to a second CSV without the index.
df = pd.read_csv(input_csv).drop(columns=['file', 'class', 'type'])
df.to_csv(output_csv, index=False)

print(f"Modified CSV saved as {output_csv}")

In [None]:
# Reload the reduced CSV and print, in order: summary statistics, the number
# of null values per column, and the column index.
df = pd.read_csv("class_modified.csv")

print(df.describe(), df.isnull().sum(), df.columns, sep="\n")

In [None]:
# One histogram per column of df.
df.hist(bins=30, figsize=(12, 10))
plt.show()

# Box plot of every column on a single axes, with vertical x tick labels.
box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=box_ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
# 90th percentile of each selected metric column.
metric_columns = [
    'wmc', 'dit', 'cbo', 'rfc', 'loc',
    'fanin', 'fanout', 'assignmentsQty', 'loopQty', 'tryCatchQty',
]
print(df[metric_columns].quantile(q=0.90))

In [None]:
# Upper bound per metric column; a value strictly above its bound flags the row.
CODE_SMELL_THRESHOLDS = {
    'wmc': 18,
    'dit': 3,
    'cbo': 8,
    'rfc': 15,
    'loc': 120,
    'fanin': 5,
    'fanout': 10,
    'assignmentsQty': 20,
    'loopQty': 3,
    'tryCatchQty': 1,
}


def label_code_smell(row, thresholds=None):
    """Return 1 if any metric in ``row`` exceeds its threshold, else 0.

    Args:
        row: Mapping-like object indexable by metric name (for example a
            ``dict`` or a DataFrame row passed by ``df.apply(..., axis=1)``).
        thresholds: Optional ``{metric_name: upper_bound}`` mapping. When
            ``None`` (the default), ``CODE_SMELL_THRESHOLDS`` is used. Only
            the metrics listed in the mapping are inspected.

    Returns:
        int: ``1`` when at least one inspected metric is strictly greater than
        its bound, ``0`` otherwise.
    """
    limits = CODE_SMELL_THRESHOLDS if thresholds is None else thresholds
    # any() stops at the first exceeded metric, in the mapping's insertion order.
    return int(any(row[metric] > limit for metric, limit in limits.items()))

In [None]:
# Attach a 0/1 label to every row using label_code_smell.
df['label'] = df.apply(label_code_smell, axis="columns")

# Write the labelled frame to <dataclass_path>/dataclass_metrics.csv,
# creating the folder first if it does not exist.
dataclass_path = "data/dataset/"
os.makedirs(dataclass_path, exist_ok=True)
metrics_csv_path = os.path.join(dataclass_path, "dataclass_metrics.csv")
df.to_csv(metrics_csv_path, index=False)
print("Dataset Metrics saved")

In [None]:
# Re-read the original input CSV (which still has 'file' and 'class'),
# label each row, and keep only the 'file', 'class' and 'label' columns.
df = pd.read_csv(input_csv)
df = df.assign(label=df.apply(label_code_smell, axis=1))[['file', 'class', 'label']]

labels_csv_path = os.path.join(dataclass_path, "dataclass_labels.csv")
df.to_csv(labels_csv_path, index=False)
print("Dataset Labels saved")

In [None]:
def delete_file_if_exists(filepath):
    """Delete ``filepath`` if it exists; do nothing otherwise.

    A real directory is removed recursively with ``shutil.rmtree``. Anything
    else that exists at the path (a regular file or a symbolic link) is
    removed with ``os.remove``, because ``shutil.rmtree`` raises when given a
    path that is not a directory.

    Args:
        filepath: Path of the file or directory to delete.

    Returns:
        None.
    """
    if os.path.isdir(filepath) and not os.path.islink(filepath):
        shutil.rmtree(filepath)
    elif os.path.lexists(filepath):
        os.remove(filepath)

# Remove the intermediate CSV files, in this order.
for ck_output_file in ('class.csv', 'method.csv', 'class_modified.csv'):
    delete_file_if_exists(ck_output_file)
print("Deleted ck_output files")