In [None]:
# Mount Google Drive at /content/drive; the data and result directories used by
# the cells below live under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

import torch
from tqdm.auto import trange

import numpy as np
import pandas as pd
import os
import torch
from numpy import float64
import json
from tqdm.auto import trange
import sys


Mounted at /content/drive


In [None]:
def sammon_error(x, y, smooth=1e-8, batched_input=False, batch_size=None, d_x=None):
    '''
    Sammon-style stress between the pairwise distances of ``x`` and ``y``.

    The value computed is ``sum((d_x - d_y) ** 2 / (d_x + smooth)) / sum(d_x)``, where ``d_x`` and
    ``d_y`` are the pairwise Euclidean distance matrices (``torch.cdist``) of the rows of ``x`` and
    ``y``. The distances of ``x`` serve as the reference: they normalise the result and appear in
    the denominator of every term.

    :param x:             Original data -> Tensor[#n classes/functions, #k features]
    :param y:             Input data  -> Tensor[#n classes/functions, #l features]
    :param smooth:        Smooth to avoid div by 0
    :param batched_input: If True, the n x n distance matrices are processed block by block
                          instead of being materialised at once.
    :param batch_size:    Number of rows/columns per block in batched mode. ``None`` means a
                          single block covering all rows.
    :param d_x:           If we have precalculated d_x with shape
                          Tensor[#n classes/functions, #n classes/functions]
    :return:              0-dim tensor holding the error.
    '''

    if not batched_input:
        d_y = torch.cdist(y, y)
        if d_x is None:
            d_x = torch.cdist(x, x)

        return 1 / torch.sum(d_x) * torch.sum(torch.square(d_x - d_y) / (d_x + smooth))

    n_rows = x.shape[0]
    if batch_size is None:
        # No block size given: fall back to one block spanning the whole matrix.
        batch_size = max(n_rows, 1)

    total_scale = 0
    total_error = 0
    for row_start in range(0, n_rows, batch_size):
        row_stop = min(n_rows, row_start + batch_size)
        x_rows = x[row_start:row_stop, :]
        y_rows = y[row_start:row_stop, :]

        for col_start in range(0, n_rows, batch_size):
            col_stop = min(n_rows, col_start + batch_size)

            if d_x is None:
                d_x_batch = torch.cdist(x_rows, x[col_start:col_stop, :])
            else:
                # Reuse the caller's precomputed reference distances for this block.
                d_x_batch = d_x[row_start:row_stop, col_start:col_stop]

            d_y_batch = torch.cdist(y_rows, y[col_start:col_stop, :])

            # Accumulate in floating point: casting each block sum to an integer would truncate
            # the normaliser (down to 0 for small distances) and disagree with the full-matrix path.
            total_scale += torch.sum(d_x_batch)
            total_error += torch.sum(torch.square(d_x_batch - d_y_batch) / (d_x_batch + smooth))

    return 1 / total_scale * total_error



In [None]:
class FireflyAlgorithm:
    """
    Firefly-style search for a fixed-size subset of feature columns.

    Each firefly is a position vector with one entry per feature. The subset a firefly
    encodes is the set of indices of its ``n_selected_features`` largest entries, and a
    subset is scored with ``sammon_error`` (lower is better).
    """

    def __init__(self, n_fireflies, n_features, n_selected_features, alpha=1.0, beta=2.0, gamma=1.0,
                 max_iter=100, device='cuda'):
        # Swarm size, feature count and size of the subset each firefly encodes.
        self.n_fireflies, self.n_features = n_fireflies, n_features
        self.n_selected_features = n_selected_features

        # Movement coefficients.
        self.alpha = alpha  # randomness coefficient
        self.beta = beta  # attraction coefficient
        self.gamma = gamma  # decay coefficient

        self.max_iter = max_iter
        self.device = device

        # Best subset found so far and its score; nothing has been evaluated yet.
        self.best_solution = None
        self.best_score = float('inf')

        # Firefly positions, drawn uniformly at random, one row per firefly.
        self.pheromone = torch.rand(n_fireflies, n_features, device=device)

    def run(self, data, batch_size=None, use_tqdm=True):
        """
        Run ``max_iter`` rounds of evaluate-then-move and return the best subset seen.

        :param data:       Tensor whose columns are the candidate features; moved to ``self.device``.
        :param batch_size: Forwarded to ``evaluate_solution``.
        :param use_tqdm:   Show a progress bar over the iterations when True.
        :return:           ``(best_solution, best_score)``.
        """
        data = data.to(self.device)
        steps = trange(self.max_iter, leave=False) if use_tqdm else range(self.max_iter)

        for _ in steps:
            scores = []
            for k in range(self.n_fireflies):
                candidate = self.construct_solution(self.pheromone[k])
                candidate_score = self.evaluate_solution(candidate, data, batch_size)
                scores.append(candidate_score)

                # Keep track of the best subset found so far.
                if candidate_score < self.best_score:
                    self.best_score = candidate_score
                    self.best_solution = candidate

            self.update_fireflies(torch.tensor(scores, device=self.device))

        return self.best_solution, self.best_score

    def construct_solution(self, firefly_position):
        """
        Pick the top N features for the given firefly position.

        :return: Indices of the ``n_selected_features`` largest entries of ``firefly_position``.
        """
        return torch.topk(firefly_position, self.n_selected_features).indices

    def evaluate_solution(self, selected_features, data, batch_size):
        """
        Sammon error for the selected features.

        The batched code path of ``sammon_error`` is used whenever ``batch_size`` is not None.
        """
        use_batches = batch_size is not None
        # NOTE(review): the selected-feature view is passed as `x` and the full data as `y`;
        # confirm this matches the argument roles sammon_error expects.
        return sammon_error(data[:, selected_features], data, batched_input=use_batches, batch_size=batch_size)

    def update_fireflies(self, scores):
        """
        Move fireflies according to their mutual attraction.

        For every ordered pair ``(i, j)`` with ``scores[j] < scores[i]``, firefly ``i`` is moved
        towards firefly ``j`` and receives a random offset. Positions are updated in place, so
        later pairs see the already-moved positions.
        """
        swarm_size = self.n_fireflies
        for i in range(swarm_size):
            for j in range(swarm_size):
                # Only a better (lower-scoring) firefly j attracts firefly i.
                if not scores[j] < scores[i]:
                    continue

                gap = torch.norm(self.pheromone[i] - self.pheromone[j])
                pull = self.beta * torch.exp(-self.gamma * gap ** 2)
                jitter = self.alpha * (torch.rand_like(self.pheromone[i]) - 0.5)

                self.pheromone[i] += pull * (self.pheromone[j] - self.pheromone[i]) + jitter

        # Keep firefly positions within [0, 1].
        self.pheromone = self.pheromone.clamp(0.0, 1.0)


In [None]:
# Define metrics for classes and methods
class_metrics = [
    'CBO', 'CLOC', 'DIT', 'LCOM5', 'LLOC', 'LOC', 'NLM', 'NM', 'NOA', 'NOC',
    'NOD', 'NOI', 'NOP', 'NOS', 'RFC', 'TLLOC', 'TLOC', 'TNLM', 'TNM',
    'TNOS', 'WMC'
]

method_metrics = [
    'CLOC', 'DLOC', 'LLOC', 'LOC', 'McCC', 'NOI', 'NOS', 'NUMPAR', 'TCLOC',
    'TLLOC', 'TLOC', 'TNOS'
]

# Constants for Firefly Algorithm
FIREFLY_COUNTS = 30
FIREFLY_ITERATIONS = 30
# Block size passed to FireflyAlgorithm.run for the batched Sammon error computation.
SAMMON_BATCH_SIZE = 9000

# Paths for data and results
data_dir = '/content/drive/My Drive/Statistics/Metrics'  # Path to the data on Google Drive
results_dir = '/content/drive/My Drive/Statistics/Results'  # Folder where results are saved

# Ensure results directory exists
os.makedirs(results_dir, exist_ok=True)

# sys.argv may not carry a user-supplied flag (e.g. inside a notebook kernel), so guard the index.
useCuda = len(sys.argv) > 1 and sys.argv[1] == "True"

# Use the GPU when one is available and fall back to the CPU otherwise, so the cell also runs
# on CPU-only runtimes instead of failing on the first tensor transfer.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

methods_dict = {}
class_dict = {}


def _select_metric_subsets(df, sanitize):
    """
    Run the firefly search for every subset size from 2 up to (number of columns - 1).

    :param df:       DataFrame already restricted to the metric columns of interest.
    :param sanitize: When True, NaN, infinite and negative values are replaced by 0 first.
    :return:         One dict per subset size with the selected column names and the best
                     Sammon error found.
    """
    np_data = df.to_numpy(dtype=float64)
    if sanitize:
        np_data[np.isnan(np_data)] = 0
        np_data[np.isinf(np_data)] = 0
        np_data[np_data < 0] = 0
    data = torch.from_numpy(np_data).to(device)
    n_features = data.shape[1]

    results = []
    for n_selected in trange(2, n_features):
        firefly = FireflyAlgorithm(
            n_fireflies=FIREFLY_COUNTS,
            n_features=n_features,
            n_selected_features=n_selected,
            max_iter=FIREFLY_ITERATIONS,
            device=device
        )
        selected_metrics, cost = firefly.run(data, use_tqdm=True, batch_size=SAMMON_BATCH_SIZE)
        # Convert the selected column indices into metric names.
        selected_metric_names = df.columns[selected_metrics.cpu().numpy()].tolist()
        results.append({
            "selected_metrics": selected_metric_names,
            "sammon_error": float(cost.cpu())
        })
    return results


# Walk through the data directory
for address, dirs, files in os.walk(data_dir):
    for name in files:
        path = os.path.join(address, name)
        # NOTE(review): every file that does not end with "-Class.csv" is treated as a method
        # file; confirm the data directories contain no other files.
        is_class_file = path.endswith("-Class.csv")
        key = address.replace(data_dir, "").strip(os.sep)  # Strip the base path
        result_path = f"{results_dir}/{key}.json" if is_class_file else f"{results_dir}/{key}_m.json"

        # Results already on disk are not recomputed.
        if os.path.exists(result_path):
            print("Skip: " + key + " CL: ", is_class_file)
            continue

        print(f"Processing: {key}. Is Class: {is_class_file}")

        df = pd.read_csv(path)

        if is_class_file:
            results = _select_metric_subsets(df[class_metrics], sanitize=False)
            class_dict[key] = results
        else:
            # Only the method-level data is sanitized, as before.
            results = _select_metric_subsets(df[method_metrics], sanitize=True)
            methods_dict[key] = results

        # A key made of nested directories needs its parent folder to exist.
        os.makedirs(os.path.dirname(result_path), exist_ok=True)
        # Store only this repository's results; the file is later read back via data[key].
        with open(result_path, "w") as f_metrics:
            json.dump({key: results}, f_metrics, indent=4)


Skip: ansible_ansible CL:  True
Skip: ansible_ansible CL:  False
Skip: AUTOMATIC1111_stable-diffusion-webui CL:  False
Skip: AUTOMATIC1111_stable-diffusion-webui CL:  True
Skip: certbot_certbot CL:  False
Skip: certbot_certbot CL:  True
Skip: aleju_imgaug CL:  False
Skip: aleju_imgaug CL:  True
Skip: babysor_MockingBird CL:  False
Skip: babysor_MockingBird CL:  True
Skip: aws_aws-cli CL:  True
Skip: aws_aws-cli CL:  False
Skip: apache_tvm CL:  False
Skip: apache_tvm CL:  True
Skip: apollos_opencv-practice CL:  True
Skip: apollos_opencv-practice CL:  False
Skip: borgbackup_borg CL:  True
Skip: borgbackup_borg CL:  False
Skip: aio-libs_aiohttp CL:  False
Skip: aio-libs_aiohttp CL:  True
Skip: django_django CL:  False
Skip: django_django CL:  True
Skip: CMSgov_bluebutton-web-server CL:  True
Skip: CMSgov_bluebutton-web-server CL:  False
Skip: dragonpilot-community_dp-devel CL:  False
Skip: dragonpilot-community_dp-devel CL:  True
Skip: commaai_openpilot CL:  True
Skip: commaai_openpilot C

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: hankcs_HanLP. Is Class: True


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: gradio-app_gradio. Is Class: False


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: gradio-app_gradio. Is Class: True


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: huggingface_pytorch-image-models. Is Class: False


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: huggingface_pytorch-image-models. Is Class: True


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: huggingface_diffusers. Is Class: True


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: huggingface_diffusers. Is Class: False


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Processing: huggingface_transformers. Is Class: False


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Mount Google Drive at /content/drive. This repeats the mount done in the first cell of the
# notebook, so the aggregation cell below can be run without the earlier cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import os

# Path to the Statistics/Results directory on Google Drive
RESULTS_DIR = '/content/drive/My Drive/Statistics/Results'

# A metric subset is "accepted" when its Sammon error is below this threshold.
ACCEPTANCE_THRESHOLD = 1
# Number of most-voted metrics kept for classes and for methods.
TOP_CLASS_METRICS = 12
TOP_METHOD_METRICS = 9

# Accumulators: per subset size, the summed error and the number of files that contributed;
# per metric name, the number of acceptance votes.
sum_sammon_error_c = {}
sum_sammon_error_m = {}
count_sammon_error_c = {}
count_sammon_error_m = {}
acceptance_by_sammon_error_votes_c = {}
acceptance_by_sammon_error_votes_m = {}
# Total number of result files read (class and method files together).
number_of_repos = 0


def accumulate_results(metric_list, error_sums, error_counts, votes, threshold=ACCEPTANCE_THRESHOLD):
    """
    Fold the entries of one result file into the running accumulators.

    :param metric_list:  List of dicts with "selected_metrics" and "sammon_error".
    :param error_sums:   Dict {subset size: summed Sammon error}, updated in place.
    :param error_counts: Dict {subset size: number of entries summed}, updated in place.
    :param votes:        Dict {metric name: vote count}, updated in place. Only the first entry
                         (in list order) whose error is below ``threshold`` casts votes.
    :param threshold:    Acceptance threshold for the Sammon error.
    """
    accepted = False
    for entry in metric_list:
        selected = entry["selected_metrics"]
        s_error = entry["sammon_error"]
        size = len(selected)

        error_sums[size] = error_sums.get(size, 0) + s_error
        error_counts[size] = error_counts.get(size, 0) + 1

        if s_error < threshold and not accepted:
            accepted = True
            for metric in selected:
                votes[metric] = votes.get(metric, 0) + 1


def average_errors(error_sums, error_counts):
    """Return {subset size: mean error}, dividing each sum by its own number of contributions."""
    return {size: total / error_counts[size] for size, total in error_sums.items()}


# Read the files from the Statistics/Results directory
for address, dirs, files in os.walk(RESULTS_DIR):
    for name in files:
        if not name.endswith(".json"):  # Only JSON files are result files
            continue

        path = os.path.join(address, name)
        is_class_metric = not name.endswith("_m.json")
        # Strip only the trailing suffix so that a repository name containing ".json" or
        # "_m.json" elsewhere is left intact.
        suffix = ".json" if is_class_metric else "_m.json"
        key = os.path.relpath(path, RESULTS_DIR)[:-len(suffix)]

        # Read the JSON file
        with open(path, 'r') as file:
            data = json.load(file)
        metric_list = data[key]
        number_of_repos += 1

        if is_class_metric:
            accumulate_results(metric_list, sum_sammon_error_c, count_sammon_error_c,
                               acceptance_by_sammon_error_votes_c)
        else:
            accumulate_results(metric_list, sum_sammon_error_m, count_sammon_error_m,
                               acceptance_by_sammon_error_votes_m)

# Average the Sammon errors. Class and method results are averaged over their own files;
# dividing both by the combined file count would understate every mean.
sum_sammon_error_c_final = average_errors(sum_sammon_error_c, count_sammon_error_c)
sum_sammon_error_m_final = average_errors(sum_sammon_error_m, count_sammon_error_m)

# Results
print("Sum Sammon Error (Method):", sum_sammon_error_m_final)
print("Sum Sammon Error (Class):", sum_sammon_error_c_final)

# Best metrics: the most frequently voted ones
class_metrics = sorted(acceptance_by_sammon_error_votes_c, key=acceptance_by_sammon_error_votes_c.get, reverse=True)[:TOP_CLASS_METRICS]
method_metrics = sorted(acceptance_by_sammon_error_votes_m, key=acceptance_by_sammon_error_votes_m.get, reverse=True)[:TOP_METHOD_METRICS]

print("Top Class Metrics:", class_metrics)
print("Top Method Metrics:", method_metrics)


Sum Sammon Error (Method): {2: 416847.1033995305, 3: 70513.33183590906, 4: 15523.76895011798, 5: 4586.198095543372, 6: 675.7651210440664, 7: 70.53805313235588, 8: 3.730942114885327, 9: 0.0662582579125693, 10: 0.01341229511586617, 11: 0.0006040187915601512}
Sum Sammon Error (Class): {2: 98890.71466907623, 3: 15791.611708416172, 4: 3394.3041303723853, 5: 981.2152807516969, 6: 397.17636558660456, 7: 116.73044109337043, 8: 35.663772792815095, 9: 5.063214236336063, 10: 1.5140175161073648, 11: 0.1829791138850134, 12: 0.02413752487687369, 13: 0.009163179347372891, 14: 0.0006074847311713547, 15: 0.0003909352570688941, 16: 3.078668264828875e-05, 17: 6.049689914307356e-06, 18: 1.7429224248921185e-06, 19: 4.055230145657001e-07, 20: 7.90405868265449e-08}
Top Class Metrics: ['CLOC', 'WMC', 'TLOC', 'CBO', 'NOS', 'LOC', 'LLOC', 'NOD', 'NOC', 'TNOS', 'RFC', 'NM']
Top Method Metrics: ['NUMPAR', 'NOI', 'McCC', 'TLOC', 'DLOC', 'TNOS', 'TCLOC', 'TLLOC', 'LLOC']
