In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import pywt
from joblib import Parallel, delayed

In [None]:
# Load test_1.parquet into a DataFrame (pyarrow engine) and preview its first rows.
df_1 = pd.read_parquet('test_1.parquet', engine='pyarrow')
df_1.head()

In [None]:
# Print the frame summary (columns, dtypes, memory usage) for df_1.
df_1.info()

In [None]:
# Columns of df_1 that the metrics are computed for; proceso_1 reads this
# module-level list at call time.
cols = [
    'Ro_1a', 'Ro_1b',
    'Ro_2a', 'Ro_2b',
    'Ro_3a', 'Ro_3b',
    'Ro_4a', 'Ro_4b',
]
# Cast these columns to float before any metric is computed.
df_1[cols] = df_1.loc[:, cols].astype(float)

# Definir funciones de métricas

# Mean absolute value of the samples
def calculo_media(data):
    """Return the mean of the absolute values of ``data``.

    Note that, despite the name, this is not the plain arithmetic mean:
    every sample is rectified with ``abs`` before averaging.
    """
    magnitudes = np.absolute(data)
    return np.mean(magnitudes)

# Root Mean Square of the samples
def calculo_rms(data):
    """Return the root mean square (RMS) of ``data``.

    The input is converted to a floating-point array before squaring.
    Squaring in the original dtype would silently wrap around for
    fixed-width integer arrays (e.g. int16 samples) and would fail outright
    for plain Python sequences.

    Parameters
    ----------
    data : array-like of numbers
        Samples to summarise. All elements are averaged together.

    Returns
    -------
    float
        ``sqrt(mean(data ** 2))``. NaN values in ``data`` propagate to the
        result.
    """
    samples = np.asarray(data, dtype=float)
    return np.sqrt(np.mean(np.square(samples)))

# Crest Factor: peak absolute amplitude divided by the RMS
# (used here to flag unusual knocks and impacts)
def calculo_fc(data):
    """Return the crest factor of ``data``: ``max(|data|) / RMS(data)``.

    The RMS comes from ``calculo_rms``. When the RMS is exactly zero the
    ratio is undefined and ``np.nan`` is returned instead of dividing.
    """
    rms_value = calculo_rms(data)
    if rms_value == 0:
        return np.nan
    peak = np.max(np.abs(data))
    return peak / rms_value

# Kurtosis (abnormal peaks are taken as a sign of abrupt faults)
def calculo_kurtosis(data):
    """Return the bias-corrected excess (Fisher) kurtosis of ``data``.

    Delegates to ``scipy.stats.kurtosis`` with ``fisher=True`` and
    ``bias=False``.
    """
    excess_kurtosis = stats.kurtosis(data, bias=False, fisher=True)
    return excess_kurtosis


# Build the feature row for a single fecha_hora group
def proceso_1(date, group):
    """Compute the summary metrics of one group and return them as a dict.

    Parameters
    ----------
    date
        Group key; stored unchanged under ``'fecha_hora'``.
    group
        Column-indexable object (the callers pass one groupby sub-frame);
        ``group[name].values`` is read for every name in ``cols``.

    Returns
    -------
    dict
        ``'fecha_hora'`` plus, for every column ``c`` in the module-level
        ``cols`` list, the keys ``c_Media``, ``c_FC`` and ``c_K``.

    Notes
    -----
    ``cols`` is looked up as a global when the function runs, so the columns
    processed are whatever ``cols`` is bound to at that moment.
    """
    features = {'fecha_hora': date}
    for channel in cols:
        samples = group[channel].values  # plain numpy array for the metric functions
        features[f'{channel}_Media'] = calculo_media(samples)
        features[f'{channel}_FC'] = calculo_fc(samples)  # crest factor; computes the RMS internally
        features[f'{channel}_K'] = calculo_kurtosis(samples)
    return features

In [None]:
# Compute one feature row per fecha_hora group, running the groups in parallel
num_cores = -1  # -1 = use every available core
results = Parallel(n_jobs=num_cores)(
    delayed(proceso_1)(fecha, grupo)
    for fecha, grupo in df_1.groupby('fecha_hora')
)

# Collect the per-group dicts into a DataFrame (one row per group)
df_test1 = pd.DataFrame(results)

In [None]:
# Preview the first rows of the per-group feature table for test 1.
df_test1.head()

In [None]:
# Summary statistics of every feature column for test 1.
df_test1.describe()

In [None]:
# Write the test-1 feature table to test_1.csv, leaving out the index column.
df_test1.to_csv('test_1.csv', index=False)

In [3]:
# Load test_2.parquet into a DataFrame (pyarrow engine) and preview its first rows.
df_2 = pd.read_parquet('test_2.parquet', engine='pyarrow')
df_2.head()

Unnamed: 0,fecha_hora,Ro_1a,Ro_2a,Ro_3a,Ro_4a
0,2004-02-12 10:32:39,-0.049,-0.071,-0.132,-0.01
1,2004-02-12 10:32:39,-0.042,-0.073,-0.007,-0.105
2,2004-02-12 10:32:39,0.015,0.0,0.007,0.0
3,2004-02-12 10:32:39,-0.051,0.02,-0.002,0.1
4,2004-02-12 10:32:39,-0.107,0.01,0.127,0.054


In [4]:
# Print the frame summary (columns, dtypes, memory usage) for df_2.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20152320 entries, 0 to 20152319
Data columns (total 5 columns):
 #   Column      Dtype         
---  ------      -----         
 0   fecha_hora  datetime64[ns]
 1   Ro_1a       float64       
 2   Ro_2a       float64       
 3   Ro_3a       float64       
 4   Ro_4a       float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 768.8 MB


In [5]:
# Columns analysed for test 2. No cast is performed here: df_2.info() already
# reports these columns as float64. Rebinding the global `cols` is what makes
# proceso_1 iterate over this shorter list.
cols = ['Ro_1a', 'Ro_2a', 'Ro_3a', 'Ro_4a']

# Compute one feature row per fecha_hora group, running the groups in parallel
num_cores = -1  # -1 = use every available core
results = Parallel(n_jobs=num_cores)(
    delayed(proceso_1)(fecha, grupo)
    for fecha, grupo in df_2.groupby('fecha_hora')
)

# Collect the per-group dicts into a DataFrame (one row per group)
df_test2 = pd.DataFrame(results)

In [6]:
# Preview the first rows of the per-group feature table for test 2.
df_test2.head()

Unnamed: 0,fecha_hora,Ro_1a_Media,Ro_1a_FC,Ro_1a_K,Ro_2a_Media,Ro_2a_FC,Ro_2a_K,Ro_3a_Media,Ro_3a_FC,Ro_3a_K,Ro_4a_Media,Ro_4a_FC,Ro_4a_K
0,2004-02-12 10:32:39,0.058332,6.120331,0.629209,0.071832,5.640841,0.507217,0.083244,9.350652,3.214152,0.043065,4.879539,0.066268
1,2004-02-12 10:42:39,0.058997,5.147086,0.648742,0.074008,5.148852,0.253369,0.084439,6.847735,1.395884,0.04454,4.438435,0.107859
2,2004-02-12 10:52:39,0.060239,6.598472,0.513894,0.074223,5.367171,0.311158,0.083922,8.492546,2.640886,0.044442,6.305078,0.257592
3,2004-02-12 11:02:39,0.061453,7.723217,1.158529,0.073843,6.121762,0.235691,0.084462,9.180734,2.683727,0.045081,9.241749,0.80668
4,2004-02-12 11:12:39,0.061361,4.982524,0.603617,0.075606,5.09712,0.226657,0.082837,7.171719,1.579073,0.045118,4.29268,0.139281


In [7]:
# Summary statistics of every feature column for test 2.
df_test2.describe()

Unnamed: 0,fecha_hora,Ro_1a_Media,Ro_1a_FC,Ro_1a_K,Ro_2a_Media,Ro_2a_FC,Ro_2a_K,Ro_3a_Media,Ro_3a_FC,Ro_3a_K,Ro_4a_Media,Ro_4a_FC,Ro_4a_K
count,984,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0
mean,2004-02-15 20:27:39.000000384,0.080905,5.17387,0.827633,0.078532,4.868481,0.153373,0.081356,7.43793,1.330865,0.047822,4.627472,0.141984
min,2004-02-12 10:32:39,0.001168,2.377543,-1.609874,0.000767,1.614597,-1.871203,0.000716,1.772075,-1.935598,0.001699,2.321622,-0.341093
25%,2004-02-14 03:30:09,0.060764,4.816583,0.394137,0.074232,4.484052,0.112746,0.07683,6.581015,1.073379,0.043949,4.29859,0.086687
50%,2004-02-15 20:27:39,0.062005,5.097702,0.485199,0.075202,4.785188,0.158922,0.078197,7.398998,1.323468,0.044523,4.5407,0.128979
75%,2004-02-17 13:25:09,0.083222,5.440354,0.879196,0.077462,5.156038,0.206141,0.080597,8.248309,1.653605,0.04811,4.816952,0.175185
max,2004-02-19 06:22:39,0.453318,9.33327,14.113748,0.161011,7.929995,2.453631,0.151296,12.751776,3.702443,0.119042,9.241749,1.653418
std,,0.040171,0.576209,0.966023,0.011779,0.594937,0.145699,0.011596,1.278729,0.582875,0.009541,0.526146,0.118357


In [8]:
# Write the test-2 feature table to test_2.csv, leaving out the index column.
df_test2.to_csv('test_2.csv', index=False)