# 1. Carga Dataset

import os
import requests

# Directory of the raw data files
_data_root = './data/covertype'
# Path to the raw training data
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')

# Download the raw CSV only when it is not already cached locally.
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    # The stray backslash in `export=\download` (an invalid `\d` escape) was removed.
    # NOTE(review): `confirm={{VALUE}}` looks like an unfilled template placeholder;
    # it is kept unchanged here -- confirm whether Google Drive needs a real value.
    url = 'https://docs.google.com/uc?export=download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    # Write to a temporary name first so that an interrupted or failed download
    # never leaves a partial file that the `isfile` guard above would accept.
    _partial_filepath = _data_filepath + '.part'
    with requests.get(url, allow_redirects=True, stream=True) as r:
        # Fail loudly on HTTP errors instead of saving an error page as the CSV.
        r.raise_for_status()
        with open(_partial_filepath, 'wb') as raw_file:
            for chunk in r.iter_content(chunk_size=1 << 20):
                raw_file.write(chunk)
    os.replace(_partial_filepath, _data_filepath)


In [8]:
import pandas as pd 

# Read the raw training data from the same path the download cell writes to
# (`_data_filepath`), so the two cells cannot point at different files.
df_inicial = pd.read_csv(_data_filepath)
df_inicial

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2991,119,7,67,11,1015,233,234,133,1570,Commanche,C7202,1
1,2876,3,18,485,71,2495,192,202,144,1557,Commanche,C7757,1
2,3171,315,2,277,9,4374,213,237,162,1052,Rawah,C7745,0
3,3087,342,13,190,31,4774,193,221,166,752,Rawah,C7745,0
4,2835,158,10,212,41,3596,231,242,141,3280,Rawah,C4744,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116198,3150,220,16,285,47,2275,200,253,187,866,Commanche,C7756,1
116199,3125,47,13,234,2,2430,224,212,120,1426,Rawah,C7745,0
116200,3166,152,11,67,0,1275,234,240,136,2404,Rawah,C7202,0
116201,3154,285,14,738,46,6012,181,239,198,1320,Rawah,C7745,1


# 2. Selección de características

Se listan las columnas del dataset

In [9]:
# Display the column labels of the loaded frame.
df_inicial.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area', 'Soil_Type',
       'Cover_Type'],
      dtype='object')

Se divide el dataset en 3 dataframes diferentes, los cuales corresponderán a la información numérica, la información categórica y la variable objetivo

In [24]:
# Column groups used to split the raw frame.
_numeric_columns = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
_categoric_columns = ['Wilderness_Area', 'Soil_Type']

# Numeric predictors, categorical predictors and the label, each in its own object.
df_inicial_numeric = df_inicial[_numeric_columns]
df_inicial_categoric = df_inicial[_categoric_columns]
target = df_inicial['Cover_Type'].astype("category")

# Class distribution of the label.
target.value_counts()

1    56720
0    42307
2     7228
6     4045
5     3478
4     1892
3      533
Name: Cover_Type, dtype: int64

Con la ayuda de las funciones SelectKBest y f_classif de la librería scikit-learn se realizará la evaluación de cuáles son las principales variables numéricas para el modelo

In [25]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif


# Number of numeric features to keep
k = 7

# Score every numeric feature against the label with the ANOVA F-test
# (f_classif) and keep the k highest-scoring ones.
selector = SelectKBest(f_classif, k=k)
X_selected = selector.fit_transform(df_inicial_numeric, target)

# Per-feature scores computed during the fit
scores = selector.scores_

# Feature/score table, highest score first
feature_scores = pd.DataFrame(
    {'Feature': df_inicial_numeric.columns, 'Score': scores}
).sort_values('Score', ascending=False)

# Print the header line followed by the ranked table
print("Puntajes de características numéricas:", feature_scores, sep="\n")


Puntajes de características numéricas:
                              Feature         Score
0                           Elevation  31087.079657
5     Horizontal_Distance_To_Roadways   2050.314700
2                               Slope   1559.369722
9  Horizontal_Distance_To_Fire_Points   1452.737911
6                       Hillshade_9am    643.862634
7                      Hillshade_Noon    595.275348
3    Horizontal_Distance_To_Hydrology    488.221292
4      Vertical_Distance_To_Hydrology    241.029561
8                       Hillshade_3pm    190.497164
1                              Aspect     88.987497


Revisando los puntajes resultantes y viendo cuáles fueron las variables seleccionadas, se toma la decisión de eliminar las variables Vertical_Distance_To_Hydrology, Hillshade_3pm y Aspect, dado que obtuvieron los puntajes más bajos y son las que menos aportan al modelo

In [26]:
# Boolean mask over the numeric columns marking the features the selector kept.
_support_mask = selector.get_support()
selected_features = df_inicial_numeric.columns[_support_mask]
print("Variables numéricas seleccionadas:", selected_features.tolist())


Variables numéricas seleccionadas: ['Elevation', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Horizontal_Distance_To_Fire_Points']


In [28]:
# Remove the three numeric columns that were not selected.
# `errors="ignore"` keeps this cell idempotent: it rebinds `df_inicial_numeric`
# to the reduced frame, so without it a second run would raise KeyError on
# the columns that are already gone.
df_inicial_numeric = df_inicial_numeric.drop(
    columns=["Vertical_Distance_To_Hydrology", "Hillshade_3pm", "Aspect"],
    errors="ignore",
)

# Reassemble the selected numeric features, the categorical features and the
# label side by side into a single frame.
df_variables = pd.concat([df_inicial_numeric, df_inicial_categoric, target], axis=1)
df_variables


Unnamed: 0,Elevation,Slope,Horizontal_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2991,7,67,1015,233,234,1570,Commanche,C7202,1
1,2876,18,485,2495,192,202,1557,Commanche,C7757,1
2,3171,2,277,4374,213,237,1052,Rawah,C7745,0
3,3087,13,190,4774,193,221,752,Rawah,C7745,0
4,2835,10,212,3596,231,242,3280,Rawah,C4744,1
...,...,...,...,...,...,...,...,...,...,...
116198,3150,16,285,2275,200,253,866,Commanche,C7756,1
116199,3125,13,234,2430,224,212,1426,Rawah,C7745,0
116200,3166,11,67,1275,234,240,2404,Rawah,C7202,0
116201,3154,14,738,6012,181,239,1320,Rawah,C7745,1


# 3. Data Pipeline

## 3.1 Configurar el contexto interactivo

Se realiza la carga de las librerías necesarias para usar TFX

In [30]:
import tensorflow as tf

# Importar componentes de TFX
from tfx.components import CsvExampleGen, ExampleValidator, SchemaGen, StatisticsGen, Transform
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from google.protobuf.json_format import MessageToDict

import os
import pprint
# Shared pretty-printer used later to display parsed example records.
pp = pprint.PrettyPrinter()

2025-02-26 00:46:28.627251: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-26 00:46:28.656216: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-26 00:46:28.763085: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-26 00:46:28.900053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-26 00:46:29.007126: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

Se configura el contexto interactivo, adicionalmente se indica la ruta en donde se guardarán los artefactos del pipeline

In [31]:
# Directory under which the pipeline artifacts are stored.
_pipeline_root = './pipeline/'

# Interactive TFX context used to run the pipeline components from the notebook.
context = InteractiveContext(pipeline_root=_pipeline_root)



## 3.2. Generando Ejemplos

Primero debemos guardar nuestra información en un archivo CSV que pueda ser utilizado por TFX

In [32]:
# Folder (under the current working directory) and file that hold the CSV
# handed to TFX.
data_dir = os.path.join(os.getcwd(), "data_tfx")
csv_path = os.path.join(data_dir, "data.csv")

# Make sure the folder exists, then write the frame without its index so no
# extra column is added to the file.
os.makedirs(data_dir, exist_ok=True)
df_variables.to_csv(csv_path, index=False)

Se realiza la transformación de la información al formato requerido por TFX, adicionalmente se ejecuta el componente dentro del pipeline interactivo

In [34]:
# Build the CSV ingestion component over the files in `data_dir` and execute
# it in the interactive context; the recorded output shows the default
# train/eval split (hash buckets 2:1).
example_gen = CsvExampleGen(input_base=data_dir)
context.run(example_gen)





0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f94a3dc3370.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f94a3dc3ee0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']/workspace/data_tfx['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:5194055,xor_checksum:1740531119,sum_checksum:1740531119"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f94a3dc3ee0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f94a3dc3ee0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/workspace/data_tfx['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:5194055,xor_checksum:1740531119,sum_checksum:1740531119"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f94a3dc3ee0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,./pipeline/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/workspace/data_tfx
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:5194055,xor_checksum:1740531119,sum_checksum:1740531119"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f94a3dc3ee0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: ./pipeline/CsvExampleGen/examples/1) at 0x7f952a8af040.type<class 'tfx.types.standard_artifacts.Examples'>.uri./pipeline/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,./pipeline/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


Se obtiene el artefacto, en el cual se verifican los nombres de los conjuntos en los que se ha dividido la data y, adicionalmente, la ruta donde se están guardando los ejemplos generados por el pipeline

In [35]:
# First artifact published on the component's 'examples' output channel.
_examples_channel = example_gen.outputs['examples']
artifact = _examples_channel.get()[0]

# Report the split names and where the artifact is stored.
print(f'split names: {artifact.split_names}')
print(f'artifact uri: {artifact.uri}')

split names: ["train", "eval"]
artifact uri: ./pipeline/CsvExampleGen/examples/1


Se verifica la ruta donde se tienen los datos de entrenamiento y el nombre del archivo con la data generada

In [36]:
# Directory of the training split inside the examples artifact.
train_uri = os.path.join(artifact.uri, 'Split-train')

# List the files generated for that split (IPython shell escape).
!ls {train_uri}

data_tfrecord-00000-of-00001.gz


Se cargan los ejemplos de entrenamiento en un dataset

In [37]:
# Full path of every entry in the training-split directory
# (the compressed TFRecord files written by ExampleGen).
_train_entries = os.listdir(train_uri)
tfrecord_filenames = [os.path.join(train_uri, entry) for entry in _train_entries]

# Dataset that reads those GZIP-compressed TFRecord files.
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

Se define una función que permite visualizar los n primeros registros del dataset seleccionado, posteriormente se imprimen los 3 primeros datos

In [41]:
# Helper to preview individual examples of a TFRecord dataset
def get_records(dataset, num_records):
    '''Return the first records of a dataset as Python dictionaries.

    Args:
        dataset (TFRecordDataset): dataset saved by ExampleGen
        num_records (int): number of records to preview

    Returns:
        list: one dictionary per record, in dataset order, obtained by parsing
            the serialized bytes as a `tf.train.Example` and converting the
            resulting protocol buffer message with `MessageToDict`.
    '''
    return [
        # serialized bytes -> tf.train.Example message -> plain dictionary
        MessageToDict(tf.train.Example.FromString(tfrecord.numpy()))
        for tfrecord in dataset.take(num_records)
    ]


# Preview the first 3 records of the training dataset.
sample_records = get_records(dataset, 3)
pp.pprint(sample_records)    

[{'features': {'feature': {'Cover_Type': {'int64List': {'value': ['1']}},
                           'Elevation': {'int64List': {'value': ['2876']}},
                           'Hillshade_9am': {'int64List': {'value': ['192']}},
                           'Hillshade_Noon': {'int64List': {'value': ['202']}},
                           'Horizontal_Distance_To_Fire_Points': {'int64List': {'value': ['1557']}},
                           'Horizontal_Distance_To_Hydrology': {'int64List': {'value': ['485']}},
                           'Horizontal_Distance_To_Roadways': {'int64List': {'value': ['2495']}},
                           'Slope': {'int64List': {'value': ['18']}},
                           'Soil_Type': {'bytesList': {'value': ['Qzc3NTc=']}},
                           'Wilderness_Area': {'bytesList': {'value': ['Q29tbWFuY2hl']}}}}},
 {'features': {'feature': {'Cover_Type': {'int64List': {'value': ['0']}},
                           'Elevation': {'int64List': {'value': ['3171']}},
 

2025-02-26 01:10:09.346458: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## 3.3. Estadísticas

## 3.4. Inferir el esquema

## 3.5. Curando el esquema

## 3.6. Entornos de esquema

## 3.7. Genere nuevas estadísticas usando el esquema actualizado

## 3.8. Comprobar anomalías

## 3.9. Ingeniería de características

## 3.10. Función de preprocesamiento

## 3.11. Transformar