In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import pywt
from joblib import Parallel, delayed
import gc

In [2]:
df_3 = pd.read_parquet('test_3.parquet', engine='pyarrow')
df_3.head()

Unnamed: 0,fecha_hora,Ro_1a,Ro_2a,Ro_3a,Ro_4a
0,2004-03-04 09:27:46,0.034,0.264,0.039,-0.046
1,2004-03-04 09:27:46,0.103,0.083,-0.061,-0.012
2,2004-03-04 09:27:46,0.095,-0.039,-0.007,0.039
3,2004-03-04 09:27:46,0.0,0.11,0.022,-0.002
4,2004-03-04 09:27:46,0.005,0.154,-0.127,-0.02


In [3]:
df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129515520 entries, 0 to 129515519
Data columns (total 5 columns):
 #   Column      Dtype         
---  ------      -----         
 0   fecha_hora  datetime64[ns]
 1   Ro_1a       float64       
 2   Ro_2a       float64       
 3   Ro_3a       float64       
 4   Ro_4a       float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 4.8 GB


In [4]:
# Convertir columnas a float antes del procesamiento
cols = ['Ro_1a', 'Ro_2a',  'Ro_3a',  'Ro_4a']

In [None]:
# Definir funciones de métricas

# Calcula la media de los datos por archivo
def calculo_media(data):
    return np.mean(np.abs(data))

# Calcula Root Mean Square
def calculo_rms(data):
    return np.sqrt(np.mean(data ** 2))  # RMS correcto

# Calcula Crest Factor (Detecta golpes e impactos inusuales)
def calculo_fc(data):
    rms = calculo_rms(data)
    return np.max(np.abs(data)) / rms if rms != 0 else np.nan  # Crest Factor correcto

# Calcula Kurtosis (picos anormales indica fallos bruscos)
def calculo_kurtosis(data):
    return stats.kurtosis(data, fisher=True, bias=False)


# Función para procesar cada grupo de fecha_hora
def proceso_1(date, group):
    row = {'fecha_hora': date}
    for ro in cols:
        ro_data = group[ro].values  # Convertir a numpy para eficiencia
        row.update({
            f'{ro}_Media': calculo_media(ro_data),
            f'{ro}_FC': calculo_fc(ro_data),  # Ahora usa RMS internamente
            f'{ro}_K': calculo_kurtosis(ro_data),
        })
    return row

In [None]:
# Definir el tamaño del chunk
chunk_size = 40000000  # 40 millones

# Dividir los datos en bloques de tamaño chunk_size
chunks = [df_3[i:i + chunk_size] for i in range(0, len(df_3), chunk_size)]

# Definir el número de núcleos a usar
num_cores = -1  # Usa todos los núcleos disponibles
results = []

for chunk in chunks:
    # Procesar cada bloque de datos de manera paralela
    chunk_results = Parallel(n_jobs=num_cores)(
        delayed(proceso_1)(date, group) for date, group in chunk.groupby('fecha_hora')
    )
    results.extend(chunk_results)

    # Liberar memoria después de procesar cada bloque
    del chunk
    gc.collect()

# Convertir los resultados a un DataFrame final
df_test3 = pd.DataFrame(results)

In [10]:
df_test3.tail()

Unnamed: 0,fecha_hora,Ro_1a_Media,Ro_1a_FC,Ro_1a_K,Ro_2a_Media,Ro_2a_FC,Ro_2a_K,Ro_3a_Media,Ro_3a_FC,Ro_3a_K,Ro_4a_Media,Ro_4a_FC,Ro_4a_K
6322,2004-04-18 02:02:55,0.122481,4.793745,0.284781,0.188264,3.779288,-0.10244,0.34276,10.093598,16.748551,0.195163,3.849553,0.022865
6323,2004-04-18 02:12:55,0.110247,4.446217,0.035702,0.166032,4.2996,-0.039499,0.331774,8.579299,5.70981,0.205829,3.619096,-0.257389
6324,2004-04-18 02:22:55,0.115788,5.122955,0.55345,0.201606,3.446939,-0.230277,0.4043,8.4739,12.664939,0.222182,3.944782,0.030796
6325,2004-04-18 02:32:55,0.11991,6.039545,1.094308,0.199326,5.023323,0.65883,0.543762,6.589654,7.06192,0.229032,3.434047,-0.485728
6326,2004-04-18 02:42:55,0.001533,2.615004,1.3005,0.002441,2.509546,0.33098,0.003663,1.765154,-1.82658,0.001428,2.702101,1.01732


In [9]:
df_test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6327 entries, 0 to 6326
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   fecha_hora   6327 non-null   datetime64[ns]
 1   Ro_1a_Media  6327 non-null   float64       
 2   Ro_1a_FC     6327 non-null   float64       
 3   Ro_1a_K      6327 non-null   float64       
 4   Ro_2a_Media  6327 non-null   float64       
 5   Ro_2a_FC     6327 non-null   float64       
 6   Ro_2a_K      6327 non-null   float64       
 7   Ro_3a_Media  6327 non-null   float64       
 8   Ro_3a_FC     6327 non-null   float64       
 9   Ro_3a_K      6327 non-null   float64       
 10  Ro_4a_Media  6327 non-null   float64       
 11  Ro_4a_FC     6327 non-null   float64       
 12  Ro_4a_K      6327 non-null   float64       
dtypes: datetime64[ns](1), float64(12)
memory usage: 642.7 KB


In [11]:
df_test3.to_csv('test_3.csv', index=False)