In [119]:
from collections import defaultdict
from tqdm.notebook import tqdm
import re
import pandas as pd

In [138]:


def __process_line(line, is_data=False):
    """
    Parse a single line of an ARFF file.

    :param line: One line of the file (surrounding whitespace is tolerated).
    :param is_data: True when the line belongs to the section after ``@DATA``;
        the line is then split on commas without any further interpretation.
    :return: One of
        * ``{'features': [...]}`` for a data row (``is_data=True``),
        * ``{'labels': [...]}`` for the ``@ATTRIBUTE class ...`` declaration,
        * a ``str`` with the sanitised attribute name for any other
          ``@ATTRIBUTE`` declaration,
        * ``{'data': [...]}`` for the ``@DATA`` marker,
        * ``None`` for every other line and for malformed ``@ATTRIBUTE`` lines.
    """
    if is_data:
        return {'features': line.strip().split(",")}

    # Tokenise once: keyword, attribute name, remainder of the declaration.
    tokens = line.split(maxsplit=2)
    if not tokens:
        return None

    # ARFF declarations are case-insensitive, so compare the keyword in upper case.
    keyword = tokens[0].upper()

    if keyword == "@ATTRIBUTE":
        if len(tokens) < 3:
            # A declaration needs both a name and a type; really skip it instead
            # of falling through and indexing into a too-short token list.
            print(f"Warning: Skipping malformed attribute line: {line.strip()}")
            return None

        if tokens[1] == "class":
            # Compare the whole name so that attributes such as "classification"
            # are not mistaken for the class declaration.
            labels = tokens[2].strip().split(",")
            # The first entry still carries the type token in front of the first
            # label; drop it, whatever amount of whitespace separates the two.
            head = labels[0].split(None, 1)
            if len(head) > 1:
                labels[0] = head[1]
            return {'labels': labels}

        # Replace special characters in attribute names with underscores
        return re.sub(r'[^a-zA-Z0-9_]', '_', tokens[1])

    if keyword == "@DATA":
        # Marker line: the caller switches to data-row parsing after this.
        return {'data': tokens}

    return None


def preprocess_arff_file(arff_file_path):
    """
    Read an .arff file and split it into attribute names and parsed lines.

    Blank lines are dropped. Every remaining line is handed to
    ``__process_line``; as soon as one of them yields a dict with a ``'data'``
    key, all following lines are parsed as data rows.

    :param arff_file_path: Path to the .arff file
    :return: Tuple ``(atributes_names, processed_lines)``. The first element
        collects every non-dict result (the attribute names) and the second
        every dict result except the ``'data'`` marker, both in file order.
    """
    with open(arff_file_path, 'r') as file:
        lines = [stripped for stripped in (raw.strip() for raw in file) if stripped]

    processed_lines = []
    atributes_names = []
    is_data = False
    for line in tqdm(lines):
        parsed = __process_line(line, is_data=is_data)
        if parsed is None or parsed == ' ':
            # Nothing useful came out of this line.
            continue
        if not isinstance(parsed, dict):
            atributes_names.append(parsed)
        elif 'data' in parsed:
            # Marker reached: every later line is a data row.
            is_data = True
        else:
            processed_lines.append(parsed)

    return atributes_names, processed_lines

In [139]:
# Input ARFF file, given relative to the notebook's working directory.
arff_file_path = "struc_GO.test.arff"

In [149]:
# Parse the file into the collected attribute names and the list of parsed line dicts.
atributes_names, processed_lines = preprocess_arff_file(arff_file_path)

  0%|          | 0/20937 [00:00<?, ?it/s]

In [169]:
# 'features' list of the third parsed entry, kept for inspection.
# NOTE(review): `data` is not used by the cells shown here — confirm it is still needed.
data = processed_lines[2]['features']

In [170]:
# Number of parsed entries returned by preprocess_arff_file.
len(processed_lines)

1307

In [151]:
# Accumulators: the finished examples and the example currently being assembled.
separated_examples, temp_example = [], []

In [171]:
# Start from empty accumulators so that re-running this cell does not append
# the same examples a second time.
separated_examples = []
temp_example = []

for line in processed_lines:
    if 'features' in line:
        for feature in line['features']:
            temp_example.append(feature)
            # An example is closed by its categorical value, recognised here by "@".
            # NOTE(review): a value without "@" never closes an example and is
            # carried over into the next one — adjust this condition to your pattern.
            if "@" in feature:
                separated_examples.append(temp_example)
                temp_example = []

In [172]:
# Number of attribute names collected from the file.
len(atributes_names)

19628

In [173]:
# Peek at the second collected attribute name.
atributes_names[1]

'struc2'

In [174]:
# Accumulators for the integer feature rows and their category values
features, categories = [], []

# Split every example into its features and its trailing category
for row in tqdm(separated_examples):
    features.append(list(map(int, row[:-1])))  # everything but the last value, converted to int
    categories.append(row[-1])  # last value, left unconverted


In [175]:
# Feature count of the last example; the DataFrame built below uses
# atributes_names as its column list for these rows.
len(features[-1])

19628

In [176]:
# Build the DataFrame: one column per attribute name, plus the category column
df = pd.DataFrame(features, columns=atributes_names).assign(Categories=categories)

# Show the DataFrame
print(df)

      struc1  struc2  struc3  struc4  struc5  struc6  struc7  struc8  struc9  \
0          0       0       0       0       0       0       0       0       0   
1          0       0       0       0       0       0       0       0       0   
2          0       0       1       0       0       0       0       0       0   
3          0       0       1       0       0       0       0       0       0   
4          0       0       0       0       0       0       0       0       0   
...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
1302       0       0       0       0       0       0       0       0       0   
1303       0       0       0       0       0       0       0       0       0   
1304       0       0       0       0       0       0       0       0       0   
1305       0       0       0       0       0       0       0       0       0   
1306       0       0       0       0       0       0       0       0       0   

      struc10  ...  struc19620  struc19

In [177]:
# Display the assembled DataFrame as the cell output.
df

Unnamed: 0,struc1,struc2,struc3,struc4,struc5,struc6,struc7,struc8,struc9,struc10,...,struc19620,struc19621,struc19622,struc19623,struc19624,struc19625,struc19626,struc19627,struc19628,Categories
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,GO0004519@GO0005739@GO0008150
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,GO0004519@GO0005739@GO0008150
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,GO0004519@GO0005739@GO0006314@GO0008380
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,GO0004519@GO0005739@GO0006316
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,GO0008121@GO0005750@GO0005739@GO0006122@GO0009060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,GO0003702@GO0005654@GO0006555@GO0006289@GO0006...
1303,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,GO0003674@GO0005739@GO0005933@GO0043332@GO0000...
1304,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,1,GO0019237@GO0003704@GO0005634@GO0006369@GO0030...
1305,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GO0015230@GO0005739@GO0015883
