In [None]:
# Mount Google Drive so the notebook can reach files stored there
from google.colab import drive
drive.mount('/content/drive')

# Cambiar el directorio de trabajo
import os
import sys

# Root folder of the TFM project inside the mounted Drive
TFM_PATH = '/content/drive/My Drive/TFM'

# Work from the project folder so relative paths resolve against it
os.chdir(TFM_PATH)
print(f"Current working directory changed to: {os.getcwd()}")

# Make modules stored in the project folder importable
if TFM_PATH in sys.path:
    print(f"'{TFM_PATH}' is already in Python system path.")
else:
    sys.path.append(TFM_PATH)
    print(f"'{TFM_PATH}' added to Python system path.")

In [2]:
# Carga de librerias
import extractData

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import numpy as np
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [4]:
# Configuration
# Data folder; the trailing slash is kept because file names are appended to it
data_path = f"{TFM_PATH}/data/"

# CSV file with the selected participants
participants_file = "selected_participants.csv"

In [43]:
# Load the participants table through the project helper module.
# NOTE(review): presumably reads `participants_file` from `data_path` into a
# pandas DataFrame - confirm against extractData.load_csv_to_dataframe.
participants_df = extractData.load_csv_to_dataframe(data_path,participants_file)


In [None]:
# Data analysis

def summary_histogram_base(df_data, group_label='participantes con hipertensión'):
    """Print and plot the number of participants per age group and sex.

    Args:
        df_data: DataFrame with 'Year of birth' (age-band labels) and
            'Sex/Gender' columns. It is not modified.
        group_label: Text appended to the plot title to identify the group
            being shown. The default keeps the title this function always used.
    """
    age_order = ['21-29 years', '30-39 years', '40-49 years', '50-59 years',
                 '60-69 years', '70-79 years', '80-89 years', '90-99 years']
    sex_order = ['Male', 'Female', 'No response']

    # Build the ordered categoricals on a new frame: the caller passes a
    # filtered slice of participants_df, and writing into it would mutate the
    # caller's data (and can trigger SettingWithCopyWarning).
    plot_data = df_data.assign(**{
        'Year of birth': pd.Categorical(df_data['Year of birth'], categories=age_order, ordered=True),
        'Sex/Gender': pd.Categorical(df_data['Sex/Gender'], categories=sex_order, ordered=True),
    })

    # observed=False keeps every age/sex combination in the table, even empty
    # ones; stated explicitly rather than relying on the library default.
    grouped_counts = (
        plot_data
        .groupby(['Year of birth', 'Sex/Gender'], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    print(grouped_counts)

    # Grouped bar chart with the same counts
    plt.figure(figsize=(10, 6))
    sns.countplot(data=plot_data, x='Year of birth', hue='Sex/Gender', palette='viridis')
    plt.title(f'Distribución de grupos de edad por género para {group_label}')
    plt.xlabel('Grupo de edad')
    plt.ylabel('# participantes')
    plt.legend(title='Sexo/Genero')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Cases (Hypertension == 1) and controls (Hypertension == 0); explicit copies
# so later cells own independent frames.
participants_cases_df = participants_df[participants_df['Hypertension'] == 1].copy()
summary_histogram_base(participants_cases_df)
participants_control_df = participants_df[participants_df['Hypertension'] == 0].copy()
summary_histogram_base(participants_control_df, group_label='participantes de control')


In [None]:
# Show how participants are distributed by the origin of their grandparents

def summary_histogram_ancestral_origin(df_data, title='Distribución de ancestros por pacientes'):
    """Plot the number of participants per grandparent region of origin.

    Args:
        df_data: DataFrame containing the four
            '<grandparent>: Country of origin' columns. It is not modified.
        title: Plot title; the default is the title this function always used.
    """
    selected_columns = [
        'Paternal grandmother: Country of origin',
        'Paternal grandfather: Country of origin',
        'Maternal grandmother: Country of origin',
        'Maternal grandfather: Country of origin'
    ]
    # Short names: the column name without the ': Country of origin' suffix
    short_names = {name: name.replace(': Country of origin', '') for name in selected_columns}

    # 1-2. Take the ancestry columns and rename them (new frame, no in-place edit)
    ancestral_origins_df = df_data[selected_columns].rename(columns=short_names)

    # 3. Long format: one row per (grandparent, origin) pair
    ancestral_origins_melted_df = ancestral_origins_df.melt(
        var_name='Origin Type',
        value_name='Country of Origin'
    )

    # 4. Fix the display order. Values that are not listed in country_order
    #    become NaN when converted to a Categorical with these categories.
    type_order = list(short_names.values())
    country_order = ['North America', 'Central America', 'South America',
                     'Europe', 'South Asia', 'Western Asia', 'East Asia',
                     'Oceania', 'Africa']

    ancestral_origins_melted_df['Origin Type'] = pd.Categorical(
        ancestral_origins_melted_df['Origin Type'], categories=type_order, ordered=True)
    ancestral_origins_melted_df['Country of Origin'] = pd.Categorical(
        ancestral_origins_melted_df['Country of Origin'], categories=country_order, ordered=True)

    # Grouped bar chart
    plt.figure(figsize=(15, 8))
    sns.countplot(data=ancestral_origins_melted_df, x='Country of Origin', hue='Origin Type', palette='viridis')
    plt.title(title)
    plt.xlabel('País de origen')
    plt.ylabel('Número de participantes')  # was the typo 'N9mero'
    plt.legend(title='Ancestría')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Distinct titles so the two otherwise identical-looking plots can be told apart
summary_histogram_ancestral_origin(participants_cases_df, title='Distribución de ancestros por pacientes (casos)')
summary_histogram_ancestral_origin(participants_control_df, title='Distribución de ancestros por pacientes (controles)')

In [None]:
def plot_ethnicity_distribution(df_data, column, title):
  """Draw a count plot of `column`, split by the 'Hypertension_label' column.

  Args:
      df_data: DataFrame containing `column` and 'Hypertension_label'.
      column: Name of the column shown on the x axis (also used as its label).
      title: Plot title.
  """
  fig, ax = plt.subplots(figsize=(10, 6))
  sns.countplot(data=df_data, x=column, hue='Hypertension_label', palette='viridis', ax=ax)
  ax.set_title(title)
  ax.set_xlabel(column)
  ax.set_ylabel('Número de Participantes')
  ax.legend(title='Hipertensión')
  plt.xticks(rotation=45, ha='right')
  plt.tight_layout()
  plt.show()

# Copy of the participants table with a readable label for the Hypertension flag
plot_df = participants_df.assign(
    Hypertension_label=participants_df['Hypertension'].map({0: 'Control', 1: 'Casos'})
)

plot_ethnicity_distribution(
    plot_df,
    'Race/ethnicity',
    'Distribución por etnia (Casos vs Controles)',
)

In [None]:
# Copy of the participants table with a readable label for the Hypertension flag
plot_df = participants_df.assign(
    Hypertension_label=participants_df['Hypertension'].map({0: 'Control', 1: 'Casos'})
)

plot_ethnicity_distribution(
    plot_df,
    'Blood_type',
    'Distribución por tipo sanguíneo (Casos vs Controles)',
)

In [None]:
def plot_frec_histogram(df_data, column, title, bins=10):
  """Plot a histogram with a KDE overlay for one numeric column.

  Args:
      df_data: DataFrame containing `column`.
      column: Name of the numeric column to plot (also used as x-axis label).
      title: Plot title.
      bins: Number of histogram bins (default 10, the value used so far).
  """
  # A histogram does not depend on row order, so the column is used as is
  plt.figure(figsize=(10, 6))
  sns.histplot(df_data[column], kde=True, bins=bins, color='lightblue')

  # Title and axis labels
  plt.title(title)
  plt.xlabel(column)
  plt.ylabel('Frecuencia')

  plt.show()


# Height distribution, first for cases and then for controls
for group_df, plot_title in (
    (participants_cases_df, 'Distribución de altura para los casos'),
    (participants_control_df, 'Distribución de altura para controles'),
):
    plot_frec_histogram(group_df, 'Height_cm', plot_title)

In [None]:

# Weight distribution, first for cases and then for controls
for group_df, plot_title in (
    (participants_cases_df, 'Distribución de peso para participantes con hipertensión'),
    (participants_control_df, 'Distribución de peso para participantes de control'),
):
    plot_frec_histogram(group_df, 'Weight_kg', plot_title)

In [None]:
def display_frequency_table(df_data, column_name, title):
  """Print `title` and display how often each value of `column_name` occurs.

  Missing values are counted as their own entry and rows are ordered from the
  most to the least frequent value.
  """
  print(f"\n{title}:")
  value_frequencies = df_data[column_name].value_counts(dropna=False)
  value_frequencies = value_frequencies.sort_values(ascending=False)
  display(value_frequencies.to_frame(name='Number of Participants'))

# Frequency tables for handedness, eye colour (right / left) and hair colour,
# each shown for cases first and controls second.
trait_columns = (
    ('Handedness', 'Lateralidad'),
    ('Right_eye_color', 'Color de Ojo Derecho'),
    ('Left_eye_color', 'Color de Ojo Izquierdo'),
    ('Hair_color', 'Color de Pelo'),
)
participant_groups = (
    (participants_cases_df, 'con Hipertensión'),
    (participants_control_df, 'de Control'),
)

for trait_column, trait_label in trait_columns:
    for group_df, group_suffix in participant_groups:
        display_frequency_table(
            group_df,
            trait_column,
            f'Frecuencia de {trait_label} para Participantes {group_suffix}',
        )

# Análisis Univariante

In [51]:
# Test de chi2 o Fisher para variables binarias

def chi2_or_fisher(df, x, y):
    """Test the association between two categorical columns.

    The contingency table of ``df[y]`` (rows) by ``df[x]`` (columns) is
    analysed with the chi-square test unless some cell is small, in which
    case Fisher's exact test is used.  A cell counts as small when either
    its observed count or its expected frequency is below 5 (the original
    code only looked at observed counts).

    Args:
        df: DataFrame holding both columns.
        x: Name of the explanatory column.
        y: Name of the outcome column.

    Returns:
        ``(test, p)`` where ``test`` is ``"Chi2"`` or ``"Fisher"``.  ``p`` is
        NaN when Fisher is required but the table is not 2x2, because
        ``scipy.stats.fisher_exact`` only supports 2x2 tables.
    """
    tbl = pd.crosstab(df[y], df[x])
    observed = tbl.values

    # chi2_contingency also returns the expected frequencies under independence
    _, p_chi2, _, expected = stats.chi2_contingency(observed)

    has_small_cell = bool((observed < 5).any() or (expected < 5).any())
    if has_small_cell:
        test = "Fisher"
        if tbl.shape == (2, 2):
            _, p = stats.fisher_exact(observed)
        else:
            p = np.nan  # no generalised Fisher test in scipy
    else:
        test = "Chi2"
        p = p_chi2
    return test, p

# Regresión logística univariante
def logistic_univariate(df, y, x):
    """Fit a univariate logistic regression of ``y`` on ``x``.

    Args:
        df: DataFrame holding both columns; rows with a missing value in
            either column are dropped before fitting.
        y: Name of the outcome column.
        x: Name of the predictor column.

    Returns:
        ``(OR, CI_low, CI_high, p)``: odds ratio, its 95% Wald confidence
        interval and the p-value of the predictor's coefficient.  Four NaNs
        are returned (with a warning) when the model cannot be fitted or the
        predictor expands to anything other than one coefficient.
    """
    import warnings  # local import: only needed to report failures

    failed = (np.nan, np.nan, np.nan, np.nan)
    try:
        # Drop missing values
        df2 = df[[y, x]].dropna()
        # Q("...") quotes the names so columns such as 'Sex/Gender' or
        # 'type 2' are read as variable names instead of formula operators.
        formula = f'Q("{y}") ~ Q("{x}")'
        model = smf.logit(formula, df2).fit(disp=False)

        # The coefficient label depends on how the column was encoded (for
        # example a two-level text column gets a '[T.level]' suffix), so it is
        # located as "the only non-intercept term" instead of by exact name.
        slope_terms = [term for term in model.params.index if term != "Intercept"]
        if len(slope_terms) != 1:
            warnings.warn(
                f"logistic_univariate: '{x}' produced {len(slope_terms)} coefficients; "
                "a single odds ratio cannot be reported"
            )
            return failed
        term = slope_terms[0]

        coef = model.params[term]
        se = model.bse[term]
        # Odds ratio and 95% Wald confidence interval
        OR = np.exp(coef)
        CI_low = np.exp(coef - 1.96*se)
        CI_high = np.exp(coef + 1.96*se)
        p = model.pvalues[term]
        return OR, CI_low, CI_high, p
    except Exception as exc:
        # Best effort: one failing variable must not stop a scan over many
        # columns, but the reason is reported instead of being discarded.
        warnings.warn(f"logistic_univariate: could not fit '{y} ~ {x}': {exc}")
        return failed

# t-test o Mann-Whitney para variables continuas
def mannwhitney_or_ttest(df, x, y):
    """Compare a continuous column between the two outcome groups.

    Welch's t-test is used when both groups pass the Shapiro-Wilk normality
    test (p > 0.05); otherwise the two-sided Mann-Whitney U test is used.
    Groups with fewer than 3 values are treated as non-normal.

    Args:
        df: DataFrame holding both columns.
        x: Name of the continuous column.
        y: Name of the outcome column; groups are ``y == 0`` and ``y == 1``.

    Returns:
        ``(test, p)`` with ``test`` equal to ``"t-test"`` or
        ``"Mann-Whitney"``.  ``p`` is NaN when one of the groups has no
        non-missing values.
    """
    # Values of x in each group, without missing entries
    d0 = df.loc[df[y] == 0, x].dropna()
    d1 = df.loc[df[y] == 1, x].dropna()

    # Nothing to compare if a group is empty; report NaN instead of letting
    # the test fail on the empty sample.
    if len(d0) == 0 or len(d1) == 0:
        return "Mann-Whitney", np.nan

    # Simple normality check (Shapiro-Wilk needs at least 3 values)
    p_norm0 = stats.shapiro(d0)[1] if len(d0) >= 3 else 0
    p_norm1 = stats.shapiro(d1)[1] if len(d1) >= 3 else 0

    if p_norm0 > 0.05 and p_norm1 > 0.05:
        test = "t-test"
        p = stats.ttest_ind(d0, d1, equal_var=False)[1]
    else:
        test = "Mann-Whitney"
        p = stats.mannwhitneyu(d0, d1, alternative="two-sided")[1]
    return test, p


In [None]:
# Name of the dependent variable
target = "Hypertension"
cols_to_ignore = ["Timestamp_survey","Height","Weight","Left_eye_color","Right_eye_color",
                  "Hair_color","Handedness"]
excluded_cols = []


def _is_identifier_column(name):
    """Return True when a column name looks like an identifier.

    'id' must be a separate token ('id', 'participant_id', 'ID code'), so
    names that merely contain those letters (for example 'Kidney' or
    'Thyroid') are no longer skipped. Any name containing 'participant'
    is still treated as an identifier.
    """
    import re  # local import: only this helper needs it

    lowered = name.lower()
    return "participant" in lowered or re.search(r"(?<![a-z])id(?![a-z])", lowered) is not None


def _variable_type(series, n_unique):
    """Classify a column from its number of distinct values and its dtype."""
    if n_unique == 2:
        return "binary"
    if series.dtype == "object":
        return "categorical"
    if n_unique <= 10:
        return "ordinal/category"
    return "continuous"


def _categorical_lrt_pvalue(df, outcome, column):
    """Global likelihood-ratio test of a categorical predictor in a logit model.

    The full model `outcome ~ C(column)` is compared with the intercept-only
    model on the rows where both values are present. The column is quoted with
    Q("...") so names with spaces or symbols are valid in the formula.
    Returns NaN when either model cannot be fitted.
    """
    try:
        complete = df[[outcome, column]].dropna()
        model_full = smf.logit(f'{outcome} ~ C(Q("{column}"))', complete).fit(disp=False)
        model_null = smf.logit(f"{outcome} ~ 1", complete).fit(disp=False)
        # Degrees of freedom = number of coefficients added by the predictor
        extra_params = len(model_full.params) - len(model_null.params)
        return stats.chi2.sf(2 * (model_full.llf - model_null.llf), extra_params)
    except Exception:
        return np.nan


# Row cleaning: keep only participants with an identified sex
has_known_sex = participants_df['Sex/Gender'].notna() & (participants_df['Sex/Gender'] != 'No response')
participants_df = participants_df[has_known_sex]

# ============================================
# Detect the type of each variable and test it
# ============================================

results = []

for col in participants_df.columns:
    # Skip the target, the ignored columns and identifier columns
    if col == target or col in cols_to_ignore or _is_identifier_column(col):
        continue

    series = participants_df[col]
    n_unique = series.nunique(dropna=True)

    # Constant or completely empty columns carry no information
    if n_unique <= 1:
        excluded_cols.append(col)
        continue

    var_type = _variable_type(series, n_unique)

    print(f"**** Columna a analizar ****** ----->  {col}")

    if var_type == "binary":
        test, p_test = chi2_or_fisher(participants_df, col, target)
        OR, CI_low, CI_high, p_log = logistic_univariate(participants_df, target, col)

    elif var_type == "categorical":
        test = "Chi2,LRT"
        try:
            tbl = pd.crosstab(participants_df[target], series)
            _, p_test, _, _ = stats.chi2_contingency(tbl)
        except Exception:
            test = "Chi2, Error"
            p_test = np.nan
        # No single odds ratio exists for a multi-level predictor; the
        # p_logistic slot holds the global LRT p-value instead.
        OR, CI_low, CI_high = np.nan, np.nan, np.nan
        p_log = _categorical_lrt_pvalue(participants_df, target, col)

    elif var_type == "continuous":
        test, p_test = mannwhitney_or_ttest(participants_df, col, target)
        OR, CI_low, CI_high, p_log = logistic_univariate(participants_df, target, col)

    else:  # ordinal / few levels
        test = "Regresión logística"
        try:
            # sort=True makes the integer codes follow the order of the values
            # (not their order of appearance); missing values get code -1 and
            # are turned back into NaN so they are dropped, not fitted as a level.
            codes, _ = pd.factorize(series, sort=True)
            ordinal_df = participants_df[[target]].copy()
            ordinal_df[col] = np.where(codes < 0, np.nan, codes)
            OR, CI_low, CI_high, p_log = logistic_univariate(ordinal_df, target, col)
            p_test = p_log  # the test is the regression itself
        except Exception:
            test = "Regresión logística, Error"
            p_test, OR, CI_low, CI_high, p_log = (np.nan,)*5

    results.append([col, var_type, test, p_test, OR, CI_low, CI_high, p_log])


columns = ["Variable", "Type", "test", "p_test", "OR", "CI_low", "CI_high", "p_logistic"]

# One row per analysed variable, ordered by the logistic / LRT p-value
results_df = pd.DataFrame(results, columns=columns).sort_values("p_logistic")

# Save to the data folder
results_df.to_csv(data_path + "univariate_results.csv", index=False)

print(excluded_cols)
results_df



# Análisis Multivariante

In [53]:
# Dependent variable
target = "Hypertension"

# Variables always kept in the model (clinical)
forced_vars = [
    "Year of birth",      # ordinal categorical
    "Sex/Gender",         # binary categorical (M/F)
]

# Candidate predictors
candidate_vars = [
    "IMC",
    "endocrine_conditions",
    "Circulatory_conditions",
    "blood_conditions",
    "High cholesterol (hypercholesterolemia)",
    "type 2",
    "Diabetes mellitus",
]

all_vars = [*forced_vars, *candidate_vars]

# Columns of interest, restricted to complete cases (no missing value in any of them)
model_df = participants_df.loc[:, [target, *all_vars]].dropna()

print("N casos completos:", model_df.shape[0])


N casos completos: 64


## Modelo logístico multivariante con statsmodels (interpretación detallada)

In [None]:
# ====================================
# CATEGORICAL ENCODING
# The statsmodels formula interface creates the dummy variables itself
# ====================================

# Q("col") quotes column names that contain spaces or other special characters
model_terms = [
    'C(Q("Year of birth"))',
    'C(Q("Sex/Gender"))',
    'IMC',
    'Q("endocrine_conditions")',
    'Q("Circulatory_conditions")',
    'Q("blood_conditions")',
    'Q("High cholesterol (hypercholesterolemia)")',
    'Q("type 2")',
    'Q("Diabetes mellitus")',
]
formula = 'Hypertension ~ ' + ' + '.join(model_terms)

logit_model = smf.logit(formula=formula, data=model_df).fit()
print(logit_model.summary())

# ==========================
# ODDS RATIOS AND INTERVALS
# ==========================

# Coefficients and their confidence bounds (columns 0 and 1 of conf_int()),
# exponentiated to the odds-ratio scale
params = logit_model.params
conf = logit_model.conf_int()
or_table = pd.DataFrame({
    "coef": params,
    "OR": np.exp(params),
    "CI_low": np.exp(conf[0]),
    "CI_high": np.exp(conf[1]),
    "p_value": logit_model.pvalues,
})

print("\nOdds Ratios (modelo multivariante):\n")
print(or_table)


## Modelo penalizado (L1, tipo LASSO) con sklearn

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# ====================================
# SPLIT X, y
# ====================================

y = model_df[target].values

# Variable types (adjust to your case)
cat_features = ["Year of birth", "Sex/Gender"]
num_features = ["IMC", "endocrine_conditions", "Circulatory_conditions",
                "blood_conditions", "High cholesterol (hypercholesterolemia)", "type 2", "Diabetes mellitus"]

X = model_df[cat_features + num_features]

# ====================================
# TRANSFORMERS: dummies + scaling
# ====================================

# handle_unknown="ignore": a cross-validation fold may contain a category that
# was absent from its training part; without this the encoder raises and the
# fold cannot be scored. Combining it with drop="first" needs scikit-learn >= 1.0;
# an unseen category is then encoded like the dropped reference level.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_features),
        ("num", StandardScaler(), num_features),
    ]
)

# ====================================
# L1-PENALISED LOGISTIC MODEL
# ====================================

# random_state fixes the solver's data shuffling so reruns give the same fit
log_reg_l1 = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=1000,
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", log_reg_l1)
])

# ====================================
# CROSS-VALIDATION (AUC)
# ====================================

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = cross_val_score(
    pipeline, X, y,
    cv=cv,
    scoring="roc_auc"
)

print("AUC media (5-fold CV):", auc_scores.mean())
print("Desviación estándar AUC:", auc_scores.std())

# ====================================
# FINAL FIT ON THE WHOLE DATA SET
# ====================================

pipeline.fit(X, y)

# Transformed design matrix and fitted coefficients of the final model
X_trans = pipeline.named_steps["preprocess"].transform(X)
coef = pipeline.named_steps["model"].coef_[0]

# Pair every coefficient with the name of its transformed feature so the
# zeroed-out (unselected) variables can be identified.
coef_names = pipeline.named_steps["preprocess"].get_feature_names_out()

print("Número de features tras one-hot y escalado:", X_trans.shape[1])
print("Coeficientes (modelo L1):")
print(pd.Series(coef, index=coef_names))

## Métricas adicionales: curva ROC (ejemplo sencillo)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predicted probability of the positive class from the final penalised model.
# X and y here are the same data the pipeline was fitted on in the previous
# cell, so this AUC is an in-sample figure.
y_pred_proba = pipeline.predict_proba(X)[:, 1]

# AUC
auc = roc_auc_score(y, y_pred_proba)
print("AUC (modelo final):", auc)

# ROC curve with the chance diagonal as reference
fpr, tpr, _ = roc_curve(y, y_pred_proba)
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set_xlabel("1 - Especificidad")
ax.set_ylabel("Sensibilidad")
ax.set_title("Curva ROC - Modelo multivariante")
plt.show()


# Análisis de componentes principales

In [None]:
# Keep only the numeric columns
df_num = participants_df.select_dtypes(include=['int64', 'float64']).copy()

# Drop identifier columns if present.
# NOTE(review): this is a substring match, so any column whose name merely
# contains "id" is dropped as well - confirm that is intended.
cols_to_drop = [c for c in df_num.columns if 'id' in c.lower() or 'participant' in c.lower()]
df_num = df_num.drop(columns=cols_to_drop, errors='ignore')

# Drop constant columns
df_num = df_num.loc[:, df_num.nunique() > 1]

cols_to_ignore = ["Timestamp_survey","Height","Weight","Left_eye_color","Right_eye_color",
                  "Hair_color","Handedness"]

df_num = df_num.drop(columns=cols_to_ignore, errors='ignore')

# Impute missing weight and height with their means, and missing IMC with the
# IMC computed from those two means
mean_weight_kg = participants_df['Weight_kg'].mean()
mean_height_cm = participants_df['Height_cm'].mean()

df_num['Weight_kg'] = df_num['Weight_kg'].fillna(mean_weight_kg)
df_num['Height_cm'] = df_num['Height_cm'].fillna(mean_height_cm)

calculated_imc = mean_weight_kg / (mean_height_cm / 100) ** 2

df_num['IMC'] = df_num['IMC'].fillna(calculated_imc)

# Every remaining missing value is imputed with 0
df_num = df_num.fillna(0)

# Standardise the variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_num)

# PCA: at most 20 components, never more than the data allows
# (PCA raises when n_components exceeds min(n_samples, n_features))
pca = PCA(n_components=min(20, *X_scaled.shape))
X_pca = pca.fit_transform(X_scaled)

# Explained variance
explained = pca.explained_variance_ratio_

for i, var in enumerate(explained, start=1):
    print(f"PC{i}: {var:.4f} varianza explicada")

print(f"Varianza acumulada: {np.cumsum(explained)}")

# Colour the scores by the Hypertension flag. A dedicated name is used so the
# `y` label vector of the classification cells is not overwritten.
# NOTE(review): 'Hypertension' is a column of df_num, so it is also one of the
# PCA inputs - confirm that is intended.
hypertension_labels = df_num["Hypertension"]

plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=hypertension_labels, cmap='coolwarm', alpha=0.7)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA coloreado por hipertensión")
plt.colorbar(label="Hypertension")
plt.show()

# Loadings: contribution of every variable to every component
loadings = pd.DataFrame(
    pca.components_.T,
    index=df_num.columns,
    columns=[f"PC{i}" for i in range(1, pca.n_components_ + 1)]
)

# Ten largest loadings of the first three components (fewer if fewer exist)
for component in loadings.columns[:3]:
    print(loadings[component].sort_values(ascending=False).head(10))


In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

def biplot(score, coeff, labels=None, y=None, n_vectors=10):
    """Draw a PCA biplot of the first two components.

    Args:
        score: 2-D array of PCA scores; columns 0 and 1 are plotted.
        coeff: 2-D array of loadings with one row per variable; columns 0 and
            1 give the arrow direction.
        labels: Optional sequence of variable names, in the same order as the
            rows of ``coeff``.
        y: Optional values used to colour the points (Series or array).
        n_vectors: Maximum number of loading arrows drawn (default 10, the
            value that used to be hard-coded). It is capped at the number of
            rows of ``coeff`` so short inputs no longer raise IndexError.
    """
    xs = score[:, 0]  # first principal component scores
    ys = score[:, 1]  # second principal component scores
    n = min(n_vectors, coeff.shape[0])

    # Scores
    plt.figure(figsize=(10, 8))
    if y is not None:
        y_values = y.values if isinstance(y, pd.Series) else y
        plt.scatter(xs, ys, c=y_values, cmap='viridis', alpha=0.7)
        plt.colorbar(label='Hypertension')
    else:
        plt.scatter(xs, ys, alpha=0.7)

    # Arrow scale: 70% of the largest absolute score on each axis, computed
    # once because it does not depend on the arrow being drawn
    scale_x = max(abs(xs)) * 0.7 if len(xs) > 0 else 1.0
    scale_y = max(abs(ys)) * 0.7 if len(ys) > 0 else 1.0

    # Loadings
    for i in range(n):
        tip_x = coeff[i, 0] * scale_x
        tip_y = coeff[i, 1] * scale_y
        plt.arrow(0, 0, tip_x, tip_y, color='r', alpha=0.7, head_width=0.05, head_length=0.05)
        if labels is not None and i < len(labels):
            # Text slightly beyond the arrow tip for readability
            plt.text(tip_x * 1.1, tip_y * 1.1, labels[i], color='g', ha='center', va='center')

    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("Biplot de PCA")
    plt.grid()
    plt.show()

# Biplot of the PCA currently held in the kernel: `pca`, `X_pca` and `df_num`
# are not defined in this cell and must come from a PCA cell executed before it.

# One label per column of df_num, in column order
feature_names = list(df_num.columns)

# Points are coloured by the 'Hypertension' column of df_num
y_biplot = df_num['Hypertension']

biplot(
    X_pca,
    pca.components_.T,
    labels=feature_names,
    y=y_biplot,
)


In [None]:
# Keep only the numeric columns
df_num = participants_df.select_dtypes(include=['int64', 'float64']).copy()

# Drop identifier columns if present.
# NOTE(review): this is a substring match, so any column whose name merely
# contains "id" is dropped as well - confirm that is intended.
cols_to_drop = [c for c in df_num.columns if 'id' in c.lower() or 'participant' in c.lower()]
df_num = df_num.drop(columns=cols_to_drop, errors='ignore')

# Drop constant columns
df_num = df_num.loc[:, df_num.nunique() > 1]

cols_to_ignore = ["Timestamp_survey","Height","Weight","Left_eye_color","Right_eye_color",
                  "Hair_color","Handedness"]

df_num = df_num.drop(columns=cols_to_ignore, errors='ignore')

# Drop the aggregated columns whose name ends with '_conditions'
columns_to_drop_conditions = [col for col in df_num.columns if col.endswith('_conditions')]
df_num = df_num.drop(columns=columns_to_drop_conditions, errors='ignore')

# Impute missing weight and height with their means, and missing IMC with the
# IMC computed from those two means
mean_weight_kg = participants_df['Weight_kg'].mean()
mean_height_cm = participants_df['Height_cm'].mean()

df_num['Weight_kg'] = df_num['Weight_kg'].fillna(mean_weight_kg)
df_num['Height_cm'] = df_num['Height_cm'].fillna(mean_height_cm)

calculated_imc = mean_weight_kg / (mean_height_cm / 100) ** 2

df_num['IMC'] = participants_df['IMC'].fillna(calculated_imc)

# Every remaining missing value is imputed with 0.
# Fix: this used to fill `participants_df` instead of `df_num`, which left the
# NaNs in the PCA input and overwrote the missing values of the source table.
df_num = df_num.fillna(0)

# Standardise the variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_num)

# PCA: at most 20 components, never more than the data allows
# (PCA raises when n_components exceeds min(n_samples, n_features))
pca = PCA(n_components=min(20, *X_scaled.shape))
X_pca = pca.fit_transform(X_scaled)

# Explained variance
explained = pca.explained_variance_ratio_

for i, var in enumerate(explained, start=1):
    print(f"PC{i}: {var:.4f} varianza explicada")

print(f"Varianza acumulada: {np.cumsum(explained)}")

# Colour the scores by the Hypertension flag. A dedicated name is used so the
# `y` label vector of the classification cells is not overwritten.
# NOTE(review): 'Hypertension' is a column of df_num, so it is also one of the
# PCA inputs - confirm that is intended.
hypertension_labels = df_num["Hypertension"]

plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=hypertension_labels, cmap='coolwarm', alpha=0.7)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA coloreado por hipertensión")
plt.colorbar(label="Hypertension")
plt.show()

# Loadings: contribution of every variable to every component
loadings = pd.DataFrame(
    pca.components_.T,
    index=df_num.columns,
    columns=[f"PC{i}" for i in range(1, pca.n_components_ + 1)]
)

# Biplot. The labels must list exactly the columns the PCA was fitted on, in
# the same order, because row i of the loadings belongs to column i of df_num.
# The previous label list dropped 'Hypertension', shifting later labels by one.
feature_names = df_num.columns
biplot(X_pca, pca.components_.T, labels=feature_names, y=hypertension_labels)

# Ten largest loadings of the first three components (fewer if fewer exist)
for component in loadings.columns[:3]:
    print(loadings[component].sort_values(ascending=False).head(10))


In [None]:
# Base columns plus every column of participants_df ending with '_conditions'
desired_cols = ['Height_cm', 'Weight_kg', 'Hypertension']
desired_cols += [col for col in participants_df.columns if col.endswith('_conditions')]

# Working frame with just those columns
df_selected = participants_df[desired_cols].copy()

# Impute missing weight and height with their means, and missing IMC with the
# IMC computed from those two means
mean_weight_kg = participants_df['Weight_kg'].mean()
mean_height_cm = participants_df['Height_cm'].mean()

df_selected['Weight_kg'] = df_selected['Weight_kg'].fillna(mean_weight_kg)
df_selected['Height_cm'] = df_selected['Height_cm'].fillna(mean_height_cm)

calculated_imc = mean_weight_kg / (mean_height_cm / 100) ** 2

df_selected['IMC'] = participants_df['IMC'].fillna(calculated_imc)

# Every remaining missing value is imputed with 0
df_selected = df_selected.fillna(0)

# Standardise the variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_selected)

# PCA: at most 10 components, never more than the data allows
# (PCA raises when n_components exceeds min(n_samples, n_features))
pca = PCA(n_components=min(10, *X_scaled.shape))
X_pca = pca.fit_transform(X_scaled)

# Explained variance
explained = pca.explained_variance_ratio_

for i, var in enumerate(explained, start=1):
    print(f"PC{i}: {var:.4f} varianza explicada")

print(f"Varianza acumulada: {np.cumsum(explained)}")

# Colour the scores by the Hypertension flag, taken from df_selected itself
# (it used to be re-indexed with `df_num.index`, a frame built in another cell).
# A dedicated name keeps the `y` of the classification cells intact.
# NOTE(review): 'Hypertension' is a column of df_selected, so it is also one of
# the PCA inputs - confirm that is intended.
hypertension_labels = df_selected["Hypertension"]

plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=hypertension_labels, cmap='coolwarm', alpha=0.7)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA coloreado por hipertensión")
plt.colorbar(label="Hypertension")
plt.show()

# Loadings: contribution of every variable to every component
loadings = pd.DataFrame(
    pca.components_.T,
    index=df_selected.columns,
    columns=[f"PC{i}" for i in range(1, pca.n_components_ + 1)]
)

# Biplot. The labels must list exactly the columns the PCA was fitted on, in
# the same order, because row i of the loadings belongs to column i of
# df_selected. The previous label list dropped 'Hypertension', shifting every
# later label by one.
feature_names = df_selected.columns
biplot(X_pca, pca.components_.T, labels=feature_names, y=hypertension_labels)

# Ten largest loadings of the first three components (fewer if fewer exist)
for component in loadings.columns[:3]:
    print(loadings[component].sort_values(ascending=False).head(10))
