## Dependencies

In [None]:
import warnings
import time

# Suppress all warnings (applies to every warning category for the whole session)
warnings.filterwarnings("ignore")

## Load Datasets & Utils Functions

In [None]:
def write_dict_to_csv(filename, data_dict):
    """Append ``data_dict`` as one row of the CSV file ``filename``.

    The dictionary keys are used as the column names. A header row is
    written first whenever the file does not exist yet or exists but is
    still empty, so the resulting file always starts with a header.

    Parameters
    ----------
    filename : str
        Path of the CSV file. Missing parent directories are created.

    data_dict : dict
        Mapping of column name -> value for the row to append.
    """
    import csv
    import os

    # Opening in append mode fails if the target directory is missing,
    # so create it on demand (no-op when it already exists).
    parent = os.path.dirname(filename)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # An existing but empty file (e.g. created by `touch`) still needs a header.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Append mode so that rows written by earlier calls are preserved
    with open(filename, mode='a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=list(data_dict))

        if needs_header:
            writer.writeheader()

        writer.writerow(data_dict)

In [None]:
def time_stamp():
    """Return the current local time formatted as ``HHMM_DDMMYYYY``."""
    from time import localtime, strftime

    now = localtime()
    return strftime("%H%M_%d%m%Y", now)


In [None]:
def load_susy(debug=False):
    """Download the SUSY dataset and split it into features and target.

    The zip archive is streamed from the UCI repository into memory, the
    ``SUSY.csv.gz`` member is decompressed, and the CSV is parsed with the
    column names ``'0'`` .. ``'18'``.

    Parameters
    ----------

    debug : bool = False
        If True, print the shape and the first rows of the feature frame

    Returns
    -------
    (data, target) : tuple of (pd.DataFrame, pd.Series)
        ``data`` holds columns ``'1'`` .. ``'18'``; ``target`` is column
        ``'0'`` renamed to ``'target'``.

    None
        Returned (after printing the status code) when the server does not
        answer with HTTP 200. Callers must check for this before unpacking.
    """

    import gzip
    import io
    import zipfile

    import pandas as pd
    import requests
    from tqdm import tqdm

    url = 'https://archive.ics.uci.edu/static/public/279/susy.zip'
    block_size = 1024
    file_buffer = io.BytesIO()

    # (connect, read) timeouts keep a stalled server from hanging the notebook;
    # the context manager releases the connection even if the download fails.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print(f"Failed to download the file. Status code: {response.status_code}")
            return None

        total_size = int(response.headers.get('content-length', 0))

        # The progress bar is closed automatically, also on error
        with tqdm(total=total_size, unit='iB', unit_scale=True) as progress:
            for chunk in response.iter_content(block_size):
                progress.update(len(chunk))
                file_buffer.write(chunk)

    file_buffer.seek(0)

    with zipfile.ZipFile(file_buffer) as the_zip:
        with the_zip.open('SUSY.csv.gz') as gz_file:
            with gzip.open(gz_file) as the_file:
                # Load the full CSV into a DataFrame
                df = pd.read_csv(the_file, names=[str(i) for i in range(19)])

    # The first column is the label; expose it as a Series named 'target'
    target = df.iloc[:, 0].rename('target')

    # The remaining columns are the features
    data = df.iloc[:, 1:]

    if debug:
        print("===========LOAD DATASET============")
        print("Data list shape: ", data.shape)
        print(data.head())

    return data, target

In [None]:
def load_data(data, debug=False):
    """
    Read a space-separated, header-less text file into a DataFrame

    Parameters
    ----------

    data : str
        Path to the file to read

    debug : bool = False
        If True, print the shape and the first rows of the loaded frame

    Returns
    -------
    frame : pd.DataFrame
        The parsed file, with default integer column labels

    """
    frame = pd.read_csv(data, sep=' ', header=None)

    if not debug:
        return frame

    print("===========LOAD DATA============")
    print("Data list shape: ", frame.shape)
    print(frame.head())
    return frame

## Build Dataset Class

In [None]:
import pandas as pd
import numpy as np




class make_kmeans_dataset():
    """Turn precomputed k-means results into cluster-label feature columns.

    Each row of ``thread_list`` is read with the following layout:

    * the first ``dim`` columns hold the labels of the attributes
      (columns of ``data``) that were clustered,
    * the next ``dim * k`` columns hold the ``k`` centroids, ``dim``
      coordinates each,
    * the last ``k`` columns are the values used by the ``'gini'``
      selection method.

    Parameters
    ----------

    data : pd.DataFrame
        The dataset whose rows are assigned to clusters

    thread_list : pd.DataFrame
        One k-means result per row, in the layout described above

    dim : int
        Number of attributes used by each k-means result

    k : int
        Number of centroids of each k-means result
    """

    def __init__(self, data, thread_list, dim, k):
        self.data = data
        self.thread_list = thread_list
        self.dim = dim
        self.k = k

    @staticmethod
    def _index_to_list(index):
        """Return the labels of ``index`` as a plain Python list."""
        # The original code used the Arrow route; a plain pandas Index has
        # no to_arrow(), so fall back to tolist() when it is missing.
        if hasattr(index, "to_arrow"):
            return index.to_arrow().to_pylist()
        return index.tolist()

    @staticmethod
    def _column_label(value):
        """Return the ``data`` column label for an attribute id read from a row."""
        # A row mixing integer ids with float centroids is upcast to float,
        # so 14 is read back as 14.0; str() would then give '14.0'.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    def get_cluster(self,att: list, centroids: list = None ,):
        """Give a list of centroids points and a list of attributes, return for
        each row of the data the label of the cluster it belongs to.
        Uses numpy for faster computation

            Parameters
            ----------

            centroids : list
                Flat list of centroid coordinates
                [c1_x, c1_y, ..., c2_x, c2_y, ...]
                It is reshaped to one centroid per row, with as many
                coordinates per centroid as there are entries in att

            att: list
                The labels of the columns of the data that are compared
                against the centroids

            Returns
            -------
            cluster : list
                For each row of the data, the index of the nearest centroid
                (Euclidean distance)

            Raises
            ------
            ValueError
                If centroids is None

        """

        data_values = self.data[att]

        if centroids is None:
            raise ValueError("Centroids is None")

        # Reshape the centroids to a 2D array with one column per attribute
        centroids = np.array(centroids).reshape(-1, data_values.shape[1])

        data_values = np.array(data_values.values)

        # Broadcast (n_rows, 1, dim) - (n_centroids, dim) and take the norm
        # over the coordinate axis -> (n_rows, n_centroids) distances
        distances = np.linalg.norm(data_values[:, None] - centroids, axis=2)

        # Get the index of the minimum distance for each point
        clusters = np.argmin(distances, axis=1)

        # NumPy arrays have no .get(); only call it when the array type
        # provides it (as the previous device-array version relied on).
        if hasattr(clusters, "get"):
            clusters = clusters.get()

        return clusters.tolist()

    def select_rows(self, n_rows: int = 1, method = 'random' , row_id=None, debug=False):
        """
        Select the rows that will be used to create the new columns

        Parameters
        ----------

        n_rows : int = 1
            The number of rows that will be selected

        method : str = 'random'
            Used only when row_id is None.
            'random'     : sample n_rows rows, resampling until no two of
                           them share the same attribute combination
            'sequential' : take the first n_rows rows
            'gini'       : order the rows by the last k columns (compared
                           column by column) and take the first n_rows with
                           distinct attribute combinations
                           NOTE(review): an earlier comment described this
                           as "smallest sum of gini"; the ordering here is
                           the column-wise one the code always used

        debug : bool = False
            If True, print debug information

        row_id : list = None
            Positions (as used by iloc) of the rows to select.
            If None, the rows are chosen according to method

        Returns
        -------
        thread_row : pd.DataFrame
            The selected rows

        row_id : list
            The row id that was used to create the new columns

        Raises
        ------
        ValueError
            If method is unknown, or if 'random' is asked for more rows than
            there are distinct attribute combinations

        """

        if row_id is None:
            if method == 'random':
                attributes = self.thread_list.iloc[:, 0:self.dim]
                n_distinct = int((~attributes.duplicated()).sum())
                if n_rows > n_distinct:
                    # Without this guard the resampling loop below could
                    # never find a duplicate-free sample.
                    raise ValueError(
                        f"Cannot select {n_rows} rows with distinct attributes: "
                        f"only {n_distinct} distinct combinations available"
                    )

                thread_row = self.thread_list.sample(n=n_rows)
                while thread_row.iloc[:, 0:self.dim].duplicated().any():
                    thread_row = self.thread_list.sample(n=n_rows)

            elif method == 'sequential':
                thread_row = self.thread_list.iloc[0:n_rows]

            elif method == 'gini':
                gini_columns = list(self.thread_list.columns[-self.k:])
                attribute_columns = list(self.thread_list.columns[0:self.dim])

                # Same ordering as nsmallest(n_rows, gini_columns), but the
                # duplicates are removed before taking the head. Retrying a
                # deterministic nsmallest in a loop can never terminate.
                ranked = self.thread_list.sort_values(by=gini_columns, kind='mergesort')
                thread_row = ranked.drop_duplicates(subset=attribute_columns).head(n_rows)

            else:
                raise ValueError(f"Unknown selection method: {method!r}")

            row_id = self._index_to_list(thread_row.index)

        else:
            thread_row = self.thread_list.iloc[row_id]

        if debug:
            print("Row id: ", row_id)
            print("=====================================")

        return thread_row, row_id

    def build_new_columns(self, n_columns: int =1, row_id = None, method = 'random', debug=False):
        """Build the new columns for the dataset

        One column is built per selected row of the thread list: the row's
        attributes and centroids are passed to get_cluster and the resulting
        labels are stored as column 'cluster_<i>'.

        Parameters
        ----------

        n_columns : int = 1
            The number of columns that will be created

        row_id : list = None
            The row id that will be used to create the new columns.
            Must contain at least n_columns entries.
            If None, the rows are chosen according to method

        method : str = 'random'
            Selection method forwarded to select_rows

        debug : bool = False
            If True, print debug information

        Returns
        -------
        new_data : pd.DataFrame
            The new dataset with the new columns

        row_id : list
            The row id that was used to create the new columns

        """

        new_data   = pd.DataFrame()
        thread_row, row_id = self.select_rows(n_columns, debug=debug,
                                              row_id=row_id, method = method)

        if debug:
            print("============BUILD COLUMN=========")
            print(thread_row)

        centroid_end = self.dim + (self.dim * self.k)

        for i in range(n_columns):
            # Get the centroids
            centroids = thread_row.iloc[i, self.dim:centroid_end].values.tolist()
            # Get the used attributes as column labels of the data
            att       = [self._column_label(value)
                         for value in thread_row.iloc[i, 0:self.dim].values.tolist()]

            if debug:
                print("Centroids: ", centroids)
                print("Attributes: ", att)

            labels = self.get_cluster(att, centroids)

            # Create the new columns
            new_data[f'cluster_{i}'] = labels

        return new_data,row_id




## Import Data

In [None]:
# Load the dataset
# thread_list = load_data('./misc/data/Kmedoids10k.csv', debug=True)
_susy = load_susy(debug=True)
if _susy is None:
    # load_susy signals a failed download by returning None; fail here with a
    # clear message instead of an opaque tuple-unpacking TypeError.
    raise RuntimeError("SUSY dataset could not be downloaded")
data, target = _susy
target = target.astype('float32')



922MiB [03:26, 4.46MiB/s] 


Data list shape:  (5000000, 18)
          1         2         3         4         5         6         7  \
0  0.972861  0.653855  1.176225  1.157156 -1.739873 -0.874309  0.567765   
1  1.667973  0.064191 -1.225171  0.506102 -0.338939  1.672543  3.475464   
2  0.444840 -0.134298 -0.709972  0.451719 -1.613871 -0.768661  1.219918   
3  0.381256 -0.976145  0.693152  0.448959  0.891753 -0.677328  2.033060   
4  1.309996 -0.690089 -0.676259  1.589283 -0.693326  0.622907  1.087562   

          8         9        10        11        12        13        14  \
0 -0.175000  0.810061 -0.252552  1.921887  0.889637  0.410772  1.145621   
1 -1.219136  0.012955  3.775174  1.045977  0.568051  0.481928  0.000000   
2  0.504026  1.831248 -0.431385  0.526283  0.941514  1.587535  2.024308   
3  1.533041  3.046260 -1.005285  0.569386  1.015211  1.582217  1.551914   
4 -0.381742  0.589204  1.365479  1.179295  0.968218  0.728563  0.000000   

         15        16        17        18  
0  1.932632  0.994464 

In [None]:
# Load the precomputed k-means results. `target` was already cast to float32
# when the SUSY data was loaded, so the cast is not repeated here.
thread_list = load_data('./misc/data/kmeans10k.csv', debug=True)

Data list shape:  (10240, 35)
   0   1   2    3    4    5         6         7         8         9   ...  \
0  14   5  15  0.0  0.0  0.0  0.525004 -1.337430  0.969126  0.237978  ...   
1   2  17  14  0.0  0.0  0.0 -0.958958  1.215950  0.364479  0.509009  ...   
2   6   1   8  0.0  0.0  0.0 -0.571164  1.226540 -1.139080  1.083150  ...   
3  14   9  15  0.0  0.0  0.0  0.634961  0.783559  2.415120  1.645030  ...   
4  11   1  18  0.0  0.0  0.0  0.000000  0.000000  0.000000  2.612770  ...   

         25        26   27        28        29        30        31        32  \
0 -0.849737  0.909070  0.0  0.459118  0.491110  0.433444  0.493343  0.417547   
1  1.043310  1.893040  0.0  0.490649  0.498953  0.480121  0.497741  0.498214   
2  0.844672  0.940628  0.0  0.498010  0.479752  0.483604  0.499915  0.493842   
3  0.266001  0.501657  0.0  0.470673  0.014858  0.429709  0.495462  0.436205   
4  1.099090  0.258224  0.0  0.000000  0.294411  0.000000  0.445393  0.296207   

         33        34  
0 

## Build Dataset

In [None]:
# Create the k-means dataset builder (k clusters over dim attributes)
_k, _dim = 8,3
# Rows of thread_list whose first three columns (0, 1, 2) equal 0, 6 and 14
_row_id = thread_list[(thread_list[0] == 0) & (thread_list[1] == 6) & (thread_list[2] == 14)].index.to_list()
_debug = False
# The constructor signature is (data, thread_list, dim, k); pass both by
# keyword so that k and dim cannot be swapped.
kmeans = make_kmeans_dataset(data, thread_list, dim=_dim, k=_k)

new_data, index = kmeans.build_new_columns(n_columns=3, row_id=_row_id,
                                               method='random', debug=_debug)

## Run RandomForest

In [None]:
def run_random_forest(new_data, target, _n_estimators, _max_depth, debug=False):
    """Fit a cuML random forest on an 80/20 split and return the test accuracy.

    Parameters
    ----------

    new_data : DataFrame
        Feature columns; handed to the splitter unchanged

    target : Series
        Labels; wrapped in a cudf.Series before splitting

    _n_estimators : int
        Forwarded as n_estimators to the classifier

    _max_depth : int
        Forwarded as max_depth to the classifier

    debug : bool = False
        If True, print the accuracy

    Returns
    -------
    accuracy
        The value returned by cuml.metrics.accuracy_score on the test split
    """
    from cuml.ensemble import RandomForestClassifier
    from cuml.metrics import accuracy_score
    from cuml.model_selection import train_test_split
    import cudf

    # Only the target is converted; the features are used as received
    features = new_data
    labels = cudf.Series(target)

    # Fixed seed so that every call uses the same train/test partition
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Build and train the forest
    forest = RandomForestClassifier(
        n_estimators=_n_estimators,
        max_depth=_max_depth,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    # Score the predictions made on the held-out split
    predictions = forest.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    if debug:
        print(f'Acurácia do modelo RandomForest {_n_estimators},{_max_depth}: {accuracy}')

    return accuracy

In [None]:
# Load the JSON configuration
import json
import os
with open('config.json','r') as file:
    config_json = file.read()

config = json.loads(config_json)

# Extract the benchmark parameters from the JSON
_id_start            = config['id_start']
_k                   = config['k']
_dim                 = config['dim']
_num_bench           = config['num_bench']
# Every entry of 'n_coluns' is repeated num_bench times
_n_columns_benchmark = (config['n_coluns']*_num_bench)
# NOTE(review): the row id is hard-coded to a single row (429) per run, so the
# None fallback below never triggers, and build_new_columns can only build
# one column from it - confirm against the 'n_coluns' values in config.json.
_row_id_benchmark    = [[429] for _ in range(10000)]
_method              = config['method']
# List of (n_estimators, max_depth) pairs
_benchmark           = config['n_estimators']
_file_name           = config['file_name']
_debug               = config['debug']

_n_columns_benchmark.sort()

# The constructor signature is (data, thread_list, dim, k); pass both by
# keyword so that k and dim cannot be swapped.
kmeans = make_kmeans_dataset(data, thread_list, dim=_dim, k=_k)

data_original = pd.DataFrame(data)

# The results file is opened in append mode, which needs an existing directory
os.makedirs("./results", exist_ok=True)

# Accuracy on the original features depends only on (n_estimators, depth),
# not on the generated columns, so train that baseline once per pair.
_baseline_acc = {}

if not _row_id_benchmark:
    _row_id_benchmark =  [None for i in range(len(_n_columns_benchmark)*_num_bench)]
# Loop over the benchmark parameters
for _n_columns, _row_id in zip(_n_columns_benchmark, _row_id_benchmark):
    # Build the new columns and measure how long it takes
    start = time.time()
    new_data, index = kmeans.build_new_columns(n_columns=_n_columns, row_id=_row_id,
                                               method=_method, debug=_debug)
    end = time.time()

    new_data = new_data.astype('float32')


    ref_string = f"\t\t====Starting Random Forest Benchmark {_id_start}===="
    print(ref_string)


    for _n_estimators, _n_depth in _benchmark:
        print(f'====n_estimators: {_n_estimators}, n_depth: {_n_depth}====')
        start_rf = time.time()
        acc = run_random_forest(new_data, target, _n_estimators, _n_depth, debug=_debug)
        end_rf = time.time()

        _baseline_key = (_n_estimators, _n_depth)
        if _baseline_key not in _baseline_acc:
            _baseline_acc[_baseline_key] = run_random_forest(
                data_original, target, _n_estimators, _n_depth, debug=_debug)
        acc_real = _baseline_acc[_baseline_key]

        print(f"acc: {acc}, time: {(end_rf - start_rf)}")
        diff = (acc_real - acc)
        point = {
                "id": _id_start,
                'k': _k,
                'dim': _dim,
                'n_columns': _n_columns,
                'att': index,
                'method': _method,
                "n_estimators": _n_estimators,
                "n_depth": _n_depth,
                "acc": acc,
                "n_nodes": '-',
                # Column-building time plus the training/scoring time on the new columns
                "time": (end-start)+ (end_rf-start_rf),
                'diff': diff
        }



        write_dict_to_csv(f"./results/{_file_name}", point)

    target_string = "=========================================="
    formatted_string = target_string.center(len(ref_string), '=')
    print(formatted_string)


    _id_start += 1