In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  #Importamos librerías para usar
import scipy as scp 
from scipy import stats
import seaborn as sns
a = pd.read_csv("FLIR_groups1and2.csv", sep=";" , header=2) #cargamos el dataframe
#Nos quedamos solamente con las columnas de interés para el proyecto
df = a[['Max1R13_1', 'Max1R13_2', 'Max1R13_3', 'Max1R13_4', 'aveOralM', 
        'Gender', 'Age', 'Ethnicity', 'T_atm', 'Humidity', 'Cosmetics']].copy()

#creamos la nueva columna con el valor promediado de Max1R13
df['Max1R13'] = df[['Max1R13_1', 'Max1R13_2', 'Max1R13_3', 'Max1R13_4']].mean(axis=1, skipna=True).astype(float)
#Eliminamos las columnas de Max1R13 que no son la promediada
df.drop(columns=['Max1R13_1', 'Max1R13_2', 'Max1R13_3', 'Max1R13_4'],inplace=True) 

print(df.isnull().sum()) #Corroboramos si hay datos nulos

#Veamos el comportamiento de los valores nulos de Cosmetics y como podemos tratarlos
print(f"\nMediana de cosmeticos: {a["Cosmetics"].median()}")

b=df['Cosmetics'].fillna(0.0)
print(f"Mediana de cosmeticos imputando con la mediana: {b.median()}")

#procedemos a imputar los valores nulos con la mediana

df["Cosmetics"] = df["Cosmetics"].fillna(df['Cosmetics'].median())
df["Cosmetics"] = df["Cosmetics"].map({0.0: "No", 1.0: "Sí"})
df.info()

aveOralM      0
Gender        0
Age           0
Ethnicity     0
T_atm         0
Humidity      0
Cosmetics    29
Max1R13       0
dtype: int64

Mediana de cosmeticos: 0.0
Mediana de cosmeticos imputando con la mediana: 0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   aveOralM   1020 non-null   float64
 1   Gender     1020 non-null   object 
 2   Age        1020 non-null   object 
 3   Ethnicity  1020 non-null   object 
 4   T_atm      1020 non-null   float64
 5   Humidity   1020 non-null   float64
 6   Cosmetics  1020 non-null   object 
 7   Max1R13    1020 non-null   float64
dtypes: float64(4), object(4)
memory usage: 63.9+ KB


In [31]:
# Seleccionar solo las columnas numéricas
a_num = df.select_dtypes(include=[np.number])

# Crear un DataFrame con las métricas estadísticas
metricas1 = pd.DataFrame({
    'media': a_num.mean(),
    'mediana': a_num.median(),
    
    'SD': a_num.std(),
    
    'varianza': a_num.var(),
    'IQR': a_num.apply(lambda x: np.percentile(x, 75) - np.percentile(x,50)- np.percentile(x, 25)),
    'CV': a_num.std() / a_num.mean(),
    'CVM': (a_num.median() - a_num.mean()) / a_num.std()
})

# Mostrar resultados
print(metricas1)


              media   mediana         SD    varianza       IQR        CV  \
aveOralM  37.028382  36.94000   0.509502    0.259593 -36.57750  0.013760   
T_atm     24.115392  24.00000   1.336338    1.785798 -22.70000  0.055414   
Humidity  28.723039  26.30000  13.071627  170.867427  -7.70000  0.455092   
Max1R13   35.596533  35.54875   0.574888    0.330496 -34.92375  0.016150   

               CVM  
aveOralM -0.173468  
T_atm    -0.086350  
Humidity -0.185366  
Max1R13  -0.083117  


In [32]:
from scipy.stats import median_abs_deviation
# Columnas numéricas
df_num = df.select_dtypes(include=[np.number])

# Columnas categóricas (excluimos las numéricas)
df_cat = df.select_dtypes(exclude=[np.number])

metricas_num = pd.DataFrame({
    'Media': df_num.mean(),
    'Mediana': df_num.median(),
    'Moda': df_num.mode().iloc[0],
    'SD': df_num.std(),
    'MAD': median_abs_deviation(df_num),
    'Varianza': df_num.var(),
    'IQR': df_num.quantile(0.75) - df_num.quantile(0.25),
    'CV %': (df_num.std() / df_num.median()),
    'CVM %': (median_abs_deviation(df_num) / df_num.median())
}).T

# Métricas para columnas categóricas: solo la Moda es significativa
indices = ["Media", "Mediana", "Moda", "SD", "MAD", "Varianza", "IQR", "CV %", "CVM %"]
metricas_cat = pd.DataFrame(index=indices, columns=df_cat.columns)
metricas_cat.loc["Moda"] = df_cat.apply(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)

# Unir ambos DataFrames (las columnas no se traslapan)
metricas_completas = pd.concat([metricas_num, metricas_cat], axis=1)

# Reordenar las columnas según el orden original de df
metricas_completas = metricas_completas[df.columns]

print(metricas_completas)

           aveOralM  Gender    Age Ethnicity      T_atm    Humidity Cosmetics  \
Media     37.028382     NaN    NaN       NaN  24.115392   28.723039       NaN   
Mediana   36.940000     NaN    NaN       NaN  24.000000   26.300000       NaN   
Moda      36.890000  Female  18-20     White  24.000000   30.000000        No   
SD         0.509502     NaN    NaN       NaN   1.336338   13.071627       NaN   
MAD        0.200000     NaN    NaN       NaN   0.600000    8.900000       NaN   
Varianza   0.259593     NaN    NaN       NaN   1.785798  170.867427       NaN   
IQR        0.362500     NaN    NaN       NaN   1.300000   18.600000       NaN   
CV %       0.013793     NaN    NaN       NaN   0.055681    0.497020       NaN   
CVM %      0.005414     NaN    NaN       NaN   0.025000    0.338403       NaN   

            Max1R13  
Media     35.596533  
Mediana   35.548750  
Moda      35.677500  
SD         0.574888  
MAD        0.310000  
Varianza   0.330496  
IQR        0.625000  
CV %       0.