In [None]:
# Mount Google Drive (Colab-only: requires the google.colab runtime)
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory
import os
import sys

# Define the TFM directory path
TFM_PATH = '/content/drive/My Drive/TFM'

# Change the current working directory to the TFM directory
os.chdir(TFM_PATH)
print(f"Current working directory changed to: {os.getcwd()}")

# Add the TFM directory to the Python system path
# (presumably so that project modules stored there, such as `extractData`,
# can be imported by the following cell — TODO confirm)
if TFM_PATH not in sys.path:
    sys.path.append(TFM_PATH)
    print(f"'{TFM_PATH}' added to Python system path.")
else:
    print(f"'{TFM_PATH}' is already in Python system path.")

In [None]:
# Importar Librerias
import extractData

import pandas as pd
import numpy as np
import requests

In [None]:
# Variable definitions
# Base folders; `data_path` is built from TFM_PATH defined in the first cell.
data_path = TFM_PATH + '/data/'
survey_path = 'surveys/'
participant_data_path = 'participant_genetic_data'

# Participant lists (cases / controls) and genetic-file metadata.
# NOTE(review): the code below says these were produced by a "CargaDatos" notebook — confirm.
participants_hypertension_file = "participantes_hipertension.csv"
participants_control_file = "participantes_control.csv"
metadata_file = "public_genetic_data.csv"

# Survey exports, one CSV per survey (file names carry the export timestamp).
participants_generalsurvey_file = "PGPParticipantSurvey-20181010220019.csv"
participants_basicPhenoSurvey_file = 'PGPBasicPhenotypesSurvey2015-20181010214636.csv'
participants_circulatory_file = "PGPTrait&DiseaseSurvey2012_CirculatorySystem-20181010220109.csv"
participants_cancer_file = "PGPTrait&DiseaseSurvey2012_Cancers-20181010220037.csv"
participants_endocrine_file = 'PGPTrait&DiseaseSurvey2012_Endocrine,Metabolic,Nutritional,AndImmunity-20181010220044.csv'
participants_blood_file ='PGPTrait&DiseaseSurvey2012_Blood-20181010220050.csv'
participants_nervous_file  = 'PGPTrait&DiseaseSurvey2012_NervousSystem-20181010220056.csv'
participants_senses_file = 'PGPTrait&DiseaseSurvey2012_VisionAndHearing-20181010220103.csv'
participants_respiratory_file = 'PGPTrait&DiseaseSurvey2012_RespiratorySystem-20181010220114.csv'
participants_digestive_file = 'PGPTrait&DiseaseSurvey2012_DigestiveSystem-20181010214607.csv'
participants_genitourinary_file = 'PGPTrait&DiseaseSurvey2012_GenitourinarySystems-20181010214612.csv'
participants_skin_file = 'PGPTrait&DiseaseSurvey2012_SkinAndSubcutaneousTissue-20181010214618.csv'
participants_muscle_file = 'PGPTrait&DiseaseSurvey2012_MusculoskeletalSystemAndConnectiveTissue-20181010214624.csv'
participants_congenital_file = 'PGPTrait&DiseaseSurvey2012_CongenitalTraitsAndAnomalies-20181010214629.csv'

# Column names shared by the survey files and the separator passed to the
# `extractData` condition helpers.
col_name_participant = "Participant"
col_name_conditions = "Have you ever been diagnosed with one of the following conditions?"
sep = ','


In [None]:
# Compute the age at survey time and bucket it into a labelled range
def calculate_and_group_age(row):
    """Return the age-range label for one survey row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Year of birth'`` and ``'Timestamp'`` (a datetime-like
        value exposing ``.year``).

    Returns
    -------
    str or float
        * ``np.nan`` when the year of birth is missing, or when the year is
          numeric but the timestamp is missing (the age cannot be computed).
        * The original value when it already looks like an age range (contains
          ``'years'`` or ``'-'``) or cannot be parsed as an integer year.
        * Otherwise one of ``'<21 years'``, ``'21-29 years'``, ``'30-39 years'``,
          ..., ``'90-99 years'``, ``'100+ years'``.
    """
    year_of_birth_val = row['Year of birth']
    survey_timestamp = row['Timestamp']

    if pd.isna(year_of_birth_val):
        return np.nan

    # Already an age range: return it untouched
    year_text = str(year_of_birth_val)
    if 'years' in year_text or '-' in year_text:
        return year_of_birth_val

    try:
        birth_year = int(year_of_birth_val)
    except ValueError:
        # Neither a numeric year nor an age range: keep the original value
        return year_of_birth_val

    # A missing timestamp (NaT) has `.year == nan`; every comparison below would
    # then be False and the row would be mislabelled as '100+ years'.
    if pd.isna(survey_timestamp):
        return np.nan

    age = survey_timestamp.year - birth_year

    if age < 21:
        return '<21 years'
    if age >= 100:
        return '100+ years'

    # The first bucket starts at 21 rather than 20; all the others are full decades.
    decade_start = (age // 10) * 10
    lower_bound = 21 if decade_start == 20 else decade_start
    return f'{lower_bound}-{decade_start + 9} years'


In [None]:
# Classify ancestry into broad regions according to the country of origin
_ANCESTRY_REGIONS = {
    'North America': ('United States', 'Canada', 'United States Minor Outlying Islands'),
    'Europe': (
        'Spain', 'Poland', 'Austria', 'Lithuania', 'Belarus', 'Aland Islands', 'Italy', 'Hungary',
        'United Kingdom', 'Germany', 'Latvia', 'France', 'Russian Federation', 'Czech Republic',
        'Ukraine', 'Estonia', 'Netherlands', 'Ireland', 'Denmark', 'Norway', 'Sweden', 'Bulgaria',
        'Serbia', 'Macedonia, The Former Yugoslav Republic Of', 'Portugal', 'Iceland', 'Belgium',
        'Switzerland', 'Greece', 'Finland', 'Slovenia', 'Slovakia', 'Isle of Man', 'Georgia',
        'Romania', 'Croatia',
    ),
    'South Asia': ('India', 'Sri Lanka', 'Pakistan', 'Afghanistan'),
    'Central America': ('Honduras', 'Mexico', 'Puerto Rico', 'Trinidad and Tobago'),
    'South America': ('Brazil', 'Peru', 'Colombia', 'Venezuela', 'Argentina'),
    'Oceania': ('Australia', 'New Zealand', 'British Indian Ocean Territory'),
    'Western Asia': ('Turkey', 'Armenia', 'Lebanon', 'Syrian Arab Republic', 'Israel'),
    'East Asia': ('China', 'Japan'),
    'Southeast Asia': ('Viet Nam', 'Philippines'),
    'Africa': ('Ethiopia', 'Morocco', 'Tanzania, United Republic of'),
}

# Inverted view of the table above: country name -> region label
_REGION_BY_COUNTRY = {
    country_name: region
    for region, country_names in _ANCESTRY_REGIONS.items()
    for country_name in country_names
}


def classify_ancestry_by_country(country):
    """Return the region label for ``country``.

    Missing values yield ``np.nan``. The value is converted to ``str`` and
    stripped before the lookup; a country that is not in the table is returned
    as that stripped string.
    """
    if pd.isna(country):
        return np.nan
    cleaned = str(country).strip()
    return _REGION_BY_COUNTRY.get(cleaned, cleaned)

In [None]:
def convert_height_to_cm(height_str):
    """Convert a height value to a number, treating feet/inches text as imperial.

    * Missing input -> ``np.nan``.
    * Text containing both ``'`` and ``"`` (e.g. ``5'10"``) is read as feet and
      inches and converted to centimetres, rounded to two decimals. Empty inches
      count as 0.
    * Anything else is passed to ``float``; no unit conversion is applied.
    * Unparseable text -> ``np.nan``.
    """
    if pd.isna(height_str):
        return np.nan

    # Work on a trimmed string whatever the incoming type is
    text = str(height_str).strip()
    is_feet_inches = "'" in text and '"' in text

    try:
        if not is_feet_inches:
            # Plain number (possibly already in cm): take it as is
            return float(text)

        pieces = text.split("'")
        whole_feet = int(pieces[0])
        inches_text = pieces[1].replace('"', '').strip()
        extra_inches = int(inches_text) if inches_text else 0
        return round((whole_feet * 12 + extra_inches) * 2.54, 2)
    except ValueError:
        # Any parsing failure is reported as a missing value
        return np.nan

In [None]:
def convert_lbs_to_kg(lbs_value):
    """Convert a weight in pounds to kilograms.

    Returns ``np.nan`` for missing input or for values ``float`` rejects with
    ``ValueError``.
    """
    kg_per_lb = 0.453592  # 1 pound = 0.453592 kilograms
    if not pd.isna(lbs_value):
        try:
            return float(lbs_value) * kg_per_lb
        except ValueError:
            pass
    return np.nan


In [None]:
def extractData_from_generalSurvey(data_path, data_file, generalsurvey_file,  col_name_id, selected_columns):
    """Build the per-participant dataset from the general survey.

    Parameters
    ----------
    data_path : str
        Folder passed to ``extractData.load_csv_to_dataframe`` for both files.
    data_file : str
        CSV with the participants to keep; must contain a ``participant_id`` column.
    generalsurvey_file : str
        General survey CSV (path relative to ``data_path``); must contain
        ``'Timestamp'``, ``col_name_id`` and every column in ``selected_columns``.
    col_name_id : str
        Name of the participant-id column in the survey.
    selected_columns : list[str]
        Columns kept in the result. Must include ``'Year of birth'``,
        ``'Timestamp'`` and the four grandparent country-of-origin columns.

    Returns
    -------
    (pandas.DataFrame, list)
        The most recent survey response per listed participant, restricted to
        ``selected_columns``, and the list of participant ids read from
        ``data_file``. In the frame, ``'Year of birth'`` is replaced by the
        output of ``calculate_and_group_age`` and the grandparent country
        columns by the output of ``classify_ancestry_by_country``.
    """
    # Participants selected upstream (NOTE(review): the original comment says this
    # file is produced by CargaDatos.ipynb — confirm)
    df_participants = extractData.load_csv_to_dataframe(data_path, data_file)
    participant_list = df_participants["participant_id"].unique().tolist()

    # General survey with every response of every participant
    df_survey = extractData.load_csv_to_dataframe(data_path, generalsurvey_file)
    df_survey['Timestamp'] = pd.to_datetime(df_survey['Timestamp'])

    # Keep listed participants only, newest response first, one row per participant.
    # The explicit copy makes the column assignments below operate on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df_data = (
        df_survey[df_survey[col_name_id].isin(participant_list)]
        .sort_values(by=[col_name_id, 'Timestamp'], ascending=[True, False])
        .drop_duplicates(subset=[col_name_id], keep='first')
        .loc[:, selected_columns]
        .copy()
    )

    # 'Year of birth' is overwritten with the age group at survey time
    df_data['Year of birth'] = df_data.apply(calculate_and_group_age, axis=1)

    # Group the grandparents' countries of origin into broad regions
    ancestry_country_columns = [
        'Maternal grandmother: Country of origin',
        'Maternal grandfather: Country of origin',
        'Paternal grandmother: Country of origin',
        'Paternal grandfather: Country of origin'
    ]

    for col in ancestry_country_columns:
        df_data[col] = df_data[col].apply(classify_ancestry_by_country)

    return df_data, participant_list

def extractSummary_dataframe(df_data,participant_list, col_name_id,selected_columns):
    """Print a summary of ``df_data``.

    Prints the number of rows, the value counts (NaN included) of every column
    in ``selected_columns`` other than the id and timestamp columns, and the
    ids in ``participant_list`` that do not appear in ``df_data[col_name_id]``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame to summarise; must contain ``col_name_id`` and the analysed columns.
    participant_list : list
        Participant ids expected to be present.
    col_name_id : str
        Name of the participant-id column.
    selected_columns : list[str]
        Candidate columns to analyse.

    Returns
    -------
    None
    """
    print(f"Número de participantes: {len(df_data[col_name_id])}")

    print("\n--- Análisis de valores únicos por columna ---")
    # The id column is taken from the parameter; 'Participant' stays excluded for
    # backward compatibility with callers that relied on the literal name.
    excluded_columns = {col_name_id, 'Participant', 'Timestamp'}
    columns_to_analyze = [col for col in selected_columns if col not in excluded_columns]

    for column in columns_to_analyze:
        print(f"\nColumna: {column}")
        print(f"Número de valores únicos: {df_data[column].nunique()}")
        print("Conteo de valores:")
        print(df_data[column].value_counts(dropna=False)) # dropna=False to include NaN counts
        print("-" * 30)

    # Identify participants in participant_list but not in df_data.
    # Uses the `col_name_id` argument (not a notebook-level global) so the
    # function works for any id column and on a fresh kernel.
    participants_in_df_data = set(df_data[col_name_id].unique())
    participants_not_in_df = [p for p in participant_list if p not in participants_in_df_data]

    print(f"\n--- Participantes en la lista inicial pero no en el DataFrame final ({len(participants_not_in_df)}) ---")
    if participants_not_in_df:
        for p in participants_not_in_df:
            print(p)
    else:
        print("Todos los participantes de la lista inicial están presentes en el DataFrame final.")


In [None]:
# Columns kept from the general participant survey.
# NOTE: `selected_columns` is reassigned later by the basic-phenotypes cell.
selected_columns = ['Participant', 'Timestamp','Year of birth','Severe disease or rare genetic trait',
                    'Sex/Gender', 'Race/ethnicity',
                    'Maternal grandmother: Country of origin','Maternal grandfather: Country of origin',
                    'Paternal grandmother: Country of origin', 'Paternal grandfather: Country of origin']

# Create the initial dataset for the cases (participants listed in the hypertension file)
df_data_cases, participant_list = extractData_from_generalSurvey(data_path,participants_hypertension_file,survey_path + participants_generalsurvey_file,
                                                                 col_name_participant,selected_columns)

# Optional summary, currently disabled:
# extractSummary_dataframe(df_data_cases,participant_list,col_name_participant,selected_columns)

In [None]:
# Create the initial control dataset (participants listed in the control file),
# using the same survey and columns as the cases
df_data_control, participant_control_list = extractData_from_generalSurvey(data_path,participants_control_file,survey_path + participants_generalsurvey_file,
                                                                            col_name_participant,selected_columns)

# Optional summary, currently disabled:
# extractSummary_dataframe(df_data_control,participant_control_list,col_name_participant,selected_columns)


In [None]:
def balance_control_group(df_cases, df_control, grouped_cases_distribution):
    """Sample controls so that they follow the cases' age-group/sex distribution.

    Parameters
    ----------
    df_cases : pandas.DataFrame
        Accepted for interface symmetry; not read by this function.
    df_control : pandas.DataFrame
        Candidate controls with ``'Year of birth'`` and ``'Sex/Gender'`` columns.
    grouped_cases_distribution : pandas.DataFrame
        Case counts indexed by ``'Year of birth'`` with one column per
        ``'Sex/Gender'`` value.

    Returns
    -------
    pandas.DataFrame
        For each demographic cell with a non-zero case count: a random sample
        (``random_state=42``) of that many matching controls, or all matching
        controls when fewer are available. The index is reset. When nothing is
        selected, an empty frame with ``df_control``'s columns is returned.
    """
    picked_frames = []
    targets = grouped_cases_distribution.stack()

    for (age_group, gender), wanted in targets.items():
        # Demographic cells without cases need no controls
        if wanted == 0:
            continue

        same_age = df_control['Year of birth'] == age_group
        same_gender = df_control['Sex/Gender'] == gender
        pool = df_control[same_age & same_gender]
        available = len(pool)

        if available >= wanted:
            # Enough candidates: draw exactly as many as there are cases
            picked_frames.append(pool.sample(n=wanted, random_state=42))
        elif available > 0:
            # Too few candidates: keep every one of them
            picked_frames.append(pool)
        # No candidates at all: this cell contributes nothing

    if not picked_frames:
        return pd.DataFrame(columns=df_control.columns)
    return pd.concat(picked_frames, ignore_index=True)


In [None]:
# Distribution of the cases by age group ('Year of birth' holds the age range
# produced by calculate_and_group_age) and sex/gender
grouped_data_cases = df_data_cases.groupby(['Year of birth','Sex/Gender']).size().unstack(fill_value=0)
print(grouped_data_cases)

# Same distribution for the full control set
grouped_data_control = df_data_control.groupby(['Year of birth','Sex/Gender']).size().unstack(fill_value=0)
print(grouped_data_control)

# Extract from the control dataset a subset balanced against the cases' distribution
balanced_df_control = balance_control_group(df_data_cases, df_data_control, grouped_data_cases)


In [None]:
# Distribution of the balanced control group by age group and sex/gender
grouped_balanced_control = balanced_df_control.groupby(['Year of birth','Sex/Gender']).size().unstack(fill_value=0)

# NOTE: the list is rebuilt from the balanced frame itself, so the
# "missing participants" section of the summary below will always be empty.
participant_control_list = balanced_df_control["Participant"].unique().tolist()
extractSummary_dataframe(balanced_df_control,participant_control_list,col_name_participant,selected_columns)

print("Demographic distribution of the balanced control group:")
print(grouped_balanced_control)


In [None]:
# Extract the information from the remaining datasets

# Basic phenotypes survey
df_data = extractData.load_csv_to_dataframe(data_path, survey_path + participants_basicPhenoSurvey_file)

# Select the columns of interest.
# NOTE: this reassigns `selected_columns`, previously the general-survey column list.
selected_columns = ['Participant', 'Timestamp','1.1 — Blood Type','1.2 — Height', '1.3 — Weight',
                    '2.3 — Left Eye Color - Text Description', '2.4 — Right Eye Color - Text Description',
                    '3.1 — What is your natural hair color currently, when without artificial color or dye?',
                    '1.4 — Handedness']

df_data = df_data.loc[:, selected_columns]

# Rename the columns to shorter names
df_data = df_data.rename(columns={
    '1.1 — Blood Type': 'Blood_type',
    '1.2 — Height': 'Height',
    '1.3 — Weight': 'Weight',
    '2.3 — Left Eye Color - Text Description': 'Left_eye_color',
    '2.4 — Right Eye Color - Text Description': 'Right_eye_color',
    '3.1 — What is your natural hair color currently, when without artificial color or dye?': 'Hair_color',
    '1.4 — Handedness': 'Handedness'
})

# Where the right-eye answer is "same" (any casing), copy the left-eye colour
df_data.loc[df_data['Right_eye_color'].str.lower() == 'same', 'Right_eye_color'] = df_data['Left_eye_color']

# Convert height (feet/inches text) to cm and weight (lbs) to kg.
# Each conversion reads a single column, so Series.apply is used instead of a
# row-wise DataFrame.apply(axis=1).
df_data['Height_cm'] = df_data['Height'].apply(convert_height_to_cm)
df_data['Weight_kg'] = df_data['Weight'].apply(convert_lbs_to_kg)

# Body mass index (IMC) = kg / m^2. Non-positive heights are masked to NaN so
# they yield a missing IMC instead of an infinite one.
height_m = df_data['Height_cm'] / 100
df_data['IMC'] = df_data['Weight_kg'] / (height_m.where(height_m > 0) ** 2)

# Display original and converted values side by side for a visual check
df_data.loc[:,['Height','Height_cm',"Weight","Weight_kg"]]

In [None]:
### Attach the basic-phenotype data to the cases dataset and to the balanced control dataset
# NOTE: this cell reassigns df_data_cases / balanced_df_control from themselves,
# so it is meant to be run once per kernel session.

# Make sure 'Timestamp' is a datetime before sorting on it
df_data['Timestamp'] = pd.to_datetime(df_data['Timestamp'])

# Newest response first within each participant ...
df_data_sorted = df_data.sort_values(by=['Participant', 'Timestamp'], ascending=[True, False])

# ... then keep only that newest response
df_data_unique_recent = df_data_sorted.drop_duplicates(subset=['Participant'], keep='first')

# Left merges keep every participant of the left-hand frame; participants without
# a phenotype response get NaN in the phenotype columns. The phenotype timestamp
# ('Timestamp_pheno' after suffixing) is dropped right after each merge.
df_data_cases = (
    df_data_cases
    .merge(df_data_unique_recent, on='Participant', how='left', suffixes=('_survey', '_pheno'))
    .drop(columns=['Timestamp_pheno'])
)

balanced_df_control = (
    balanced_df_control
    .merge(df_data_unique_recent, on='Participant', how='left', suffixes=('_survey', '_pheno'))
    .drop(columns=['Timestamp_pheno'])
)



In [None]:
from os.path import join
def join_datasets(df_data_cases, balanced_df_control, folder_name, file_name, col_id, col_name_conditions, sep, summary_column):
    """Add one trait/disease survey to the cases and control frames.

    Loads ``file_name`` from ``folder_name``, asks ``extractData`` for the list
    of conditions found in ``col_name_conditions`` and for the per-participant
    conditions frame, adds ``summary_column`` as the row-wise sum of the
    condition columns, and left-merges the result onto both input frames on
    ``'Participant'``.

    For ``summary_column == 'Circulatory_conditions'`` the ``'Hypertension'``
    column is subtracted from the sum (hypertension is what defines the cases).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, int)
        Merged cases frame, merged control frame, number of conditions found.

    NOTE(review): the layout of the frame returned by
    ``extractData.create_conditions_df`` is not visible here; the code assumes
    one numeric column per condition plus a ``'Participant'`` column — confirm.
    """
    # Load the survey export
    survey_df = extractData.load_csv_to_dataframe(folder_name, file_name)

    # Every condition mentioned in the survey
    all_cv_conditions = extractData.extract_conditions(survey_df, col_name_conditions, sep)

    print (f" Hay {len(all_cv_conditions)} para {summary_column}")
    print(all_cv_conditions)
    n = len(all_cv_conditions)

    # Per-participant conditions frame
    conditions_frame = extractData.create_conditions_df(survey_df, all_cv_conditions, col_id, col_name_conditions, sep)

    # Total number of conditions reported by each participant
    condition_total = conditions_frame[all_cv_conditions].sum(axis=1)

    # Open question from the author: should haemorrhoids be excluded as well?
    # Hypertension is removed from the circulatory total
    if summary_column == 'Circulatory_conditions':
        condition_total = condition_total - conditions_frame['Hypertension']

    conditions_frame[summary_column] = condition_total

    # Attach the new columns to both the cases and the control datasets
    df_cases = pd.merge(df_data_cases, conditions_frame, on='Participant', how='left', suffixes=('_cases', '_condition'))
    df_control = pd.merge(balanced_df_control, conditions_frame, on='Participant', how='left', suffixes=('_control', '_condition'))

    return df_cases, df_control, n


In [None]:
def summarize_conditions(df_data):
    """Count participants with and without conditions for every ``*_conditions`` column.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame whose columns ending in ``'_conditions'`` hold per-participant
        condition counts.

    Returns
    -------
    pandas.DataFrame
        One row per ``*_conditions`` column of ``df_data`` with columns
        ``condition_type``, ``has_condition`` (values > 0) and ``no_condition``
        (values == 0 plus missing values, which are treated as "no condition").
    """
    columns = ['condition_type', 'has_condition', 'no_condition']

    # Inspect the frame that was passed in, not a notebook-level global
    condition_columns = [col for col in df_data.columns if col.endswith('_conditions')]

    # Collect plain records and build the frame once instead of concatenating per row
    records = []
    for col in condition_columns:
        values = df_data[col]
        records.append({
            "condition_type": col,
            "has_condition": (values > 0).sum(),
            # NaN (participant absent from that survey) counts as no condition
            "no_condition": (values == 0).sum() + values.isna().sum(),
        })

    return pd.DataFrame(records, columns=columns)


In [None]:
# TODO: run the previous cells first (or initialise the dataframes) before this one.
# NOTE: this cell reassigns df_data_cases / balanced_df_control from themselves,
# so re-running it without re-running the previous cells merges the surveys twice.
columns = ['condition_type', 'N']

# The surveys use two different wordings for the conditions question
col_name_conditions_one = 'Have you ever been diagnosed with one of the following conditions?'
col_name_conditions_any = 'Have you ever been diagnosed with any of the following conditions?'

# One entry per trait/disease survey, processed in this order:
# (survey file, conditions question column, summary column to create, report label)
condition_surveys = [
    # Circulatory system
    (participants_circulatory_file, col_name_conditions_one, 'Circulatory_conditions', "Sistema circulatorio"),
    # Cancers
    (participants_cancer_file, col_name_conditions_one, 'cancer_conditions', "Cáncer"),
    # Endocrine system
    (participants_endocrine_file, col_name_conditions_any, 'endocrine_conditions', "Sistema endocrino"),
    # Blood
    (participants_blood_file, col_name_conditions_any, 'blood_conditions', "Sistema sanguineo"),
    # Nervous system
    (participants_nervous_file, col_name_conditions_one, 'nervous_conditions', "Sistema nervioso"),
    # Vision and hearing
    (participants_senses_file, col_name_conditions_one, 'senses_conditions', "Sistema Visual y auditivo"),
    # Respiratory system
    (participants_respiratory_file, col_name_conditions_any, 'respiratory_conditions', "Sistema respiratorio"),
    # Digestive system
    (participants_digestive_file, col_name_conditions_any, 'digestive_conditions', "Sistema digestivo"),
    # Genitourinary systems
    (participants_genitourinary_file, col_name_conditions_any, 'genitourinary_conditions', "Sistema genitourinario"),
    # Skin and subcutaneous tissue
    (participants_skin_file, col_name_conditions_any, 'skin_conditions', "Piel"),
    # Musculoskeletal system and connective tissue
    (participants_muscle_file, col_name_conditions_any, 'muscular_conditions', "Sistema musculo esquelético"),
    # Congenital traits and anomalies (uses the "any of" wording)
    (participants_congenital_file, col_name_conditions_any, 'congenital_conditions', "Rasgos congénitos"),
]

# Join every survey onto both frames and record how many conditions each one contains
condition_counts = []
for survey_file, question_column, summary_column, report_label in condition_surveys:
    (df_data_cases, balanced_df_control, n_conditions) = join_datasets(
        df_data_cases, balanced_df_control, data_path + survey_path, survey_file,
        col_name_participant, question_column, sep, summary_column)
    condition_counts.append({"condition_type": report_label, "N": n_conditions})

# Build the report frame once instead of concatenating row by row
conditions_df = pd.DataFrame(condition_counts, columns=columns)


print(f"Number of rows in df_data_cases: {df_data_cases.shape[0]}")
print(f"Number of columns in df_data_cases: {df_data_cases.shape[1]}")

print(f"Number of rows in balanced_df_control: {balanced_df_control.shape[0]}")
print(f"Number of columns in balanced_df_control: {balanced_df_control.shape[1]}")

print(conditions_df)

df_sum_condition_cases = summarize_conditions(df_data_cases)
df_sum_condition_control = summarize_conditions(balanced_df_control)

print(df_sum_condition_cases)
print(df_sum_condition_control)


# Stack cases and controls and persist the selection.
# NOTE(review): no column marks whether a row is a case or a control; after the
# concat the two groups can only be told apart via the original participant lists.
df_participants = pd.concat([df_data_cases, balanced_df_control], axis=0)

df_participants.to_csv(data_path +'selected_participants.csv', sep=',', index=False)

In [None]:
import subprocess
import os

def _is_bam_file(name):
    """Return True when ``name`` is a string whose last dot-separated part is ``BAM`` (any case)."""
    if not isinstance(name, str):
        return False
    return name.split('.')[-1].strip().upper() == 'BAM'


def search_original_file(participant_list, metadata_df, max_file_size, download):
    """Resolve the original file URL(s) for every metadata record of each participant.

    Parameters
    ----------
    participant_list : iterable of str
        Participant ids to process.
    metadata_df : pandas.DataFrame
        Must contain ``participant_id``, ``file_size``, ``file_size_unit``,
        ``file_size_value`` and ``download_url``.
    max_file_size : number
        Records whose unit is ``'GB'`` and whose value exceeds this limit are
        recorded but not resolved.
    download : bool
        When True, a folder per participant is created under
        ``data_path + participant_data_path`` (notebook-level globals) and every
        resolved file except BAM files is downloaded into it.

    Returns
    -------
    pandas.DataFrame
        One row per resolved URL: a copy of the metadata record plus
        ``original_url`` and ``filename``. Records that are too large or have no
        download URL get an explanatory text in ``original_url``. Records whose
        resolution raises are logged and produce no further rows.

    NOTE(review): ``extractData.get_list_genetic_data`` is assumed to return a
    list of URL strings — confirm against that module.
    """
    data_records = []

    def add_record(record, original_url, filename):
        # Store an independent copy: the same metadata row can yield several
        # entries, and appending the row object itself would make them all alias
        # one Series (every entry ending up with the last URL).
        entry = record.copy()
        entry['original_url'] = original_url
        entry['filename'] = filename
        data_records.append(entry)

    # Walk through the participants of the list
    for participant_id in participant_list:
        participant_dir = None
        if download:
            participant_dir = os.path.join(data_path + participant_data_path, participant_id)
            os.makedirs(participant_dir, exist_ok=True)
            print(f"Ensured directory exists: {participant_dir}")

        print(f"##### Participant: {participant_id} #####")

        # Metadata records that belong to this participant
        metadata_df_participant = metadata_df[metadata_df['participant_id'] == participant_id]

        # Resolve the original URL of each record, unless the file is larger
        # than `max_file_size` GB
        for index, record in metadata_df_participant.iterrows():
            print(f"Tamaño del fichero {record['file_size']}")
            if record['file_size_unit']=='GB' and record['file_size_value'] > max_file_size:
                add_record(record, "Sin verificar - Tamaño excesivo", '')
                continue

            print(f"Download url: {record['download_url']}")
            if pd.isna(record['download_url']):
                add_record(record, "Sin archivo para descargar", '')
                continue

            try:
                final_url, filename, soup = extractData.get_html_parser(record['download_url'])
                print(f"Final url: {final_url}")
                print(f"Filename: {filename}")

                # Without a parsed page there is no list of files to inspect
                download_file = extractData.get_list_genetic_data(soup, final_url) if soup else []

                if len(download_file) == 0:
                    # The URL points directly at the file
                    add_record(record, final_url, filename)
                    if download and not _is_bam_file(filename):
                        print(f"Descargando... {final_url}")
                        extractData.get_download_file(final_url, participant_dir, filename)
                else:
                    # The URL points at a page listing several files
                    for url_file in download_file:
                        add_record(record, url_file, '')
                        # Download only when requested; BAM files are skipped,
                        # judged by the extension of each listed URL.
                        if download and not _is_bam_file(url_file):
                            print(f"Descargando... {url_file}")
                            extractData.get_download_file(url_file, participant_dir, None)

            except requests.exceptions.HTTPError as errh:
                print(f"HTTP Error for {record['participant_id']}: {errh}")
            except requests.exceptions.ConnectionError as errc:
                print(f"Error Connecting for {record['participant_id']}: {errc}")
            except requests.exceptions.Timeout as errt:
                print(f"Timeout Error for {record['participant_id']}: {errt}")
            except requests.exceptions.RequestException as err:
                print(f"An unexpected error occurred for {record['participant_id']}: {err}")
            except Exception as e:
                # Best effort: one failing record must not stop the whole scan
                print(f"An unexpected error occurred searching URL: {e}")

    return pd.DataFrame(data_records)

# Variable initialisation
# Size limit passed to search_original_file; it is compared against
# 'file_size_value' only for records whose 'file_size_unit' is 'GB'
max_file_size = 8
# NOTE: overwrites the `participant_list` created earlier with the cases + controls ids
participant_list = df_participants['Participant'].to_list()
# Load the metadata file (NOTE(review): the original comment says it is generated by CargaDatos — confirm)
metadata_df = extractData.load_csv_to_dataframe(data_path, metadata_file)

# Sample participant ids kept for manual testing:
# ['hu034DB1', 'hu05FD49', 'hu0878AF', 'hu094BE5', 'hu0D1FA1']
#participant_list = ['hu05FD49']
# download=False: only resolve and record the URLs
selected_participants_metadata_df = search_original_file(participant_list, metadata_df, max_file_size,False)

print(selected_participants_metadata_df.shape)

selected_participants_metadata_df.to_csv(data_path +'metadatos_geneticos_participantes.csv', sep=',', index=False)

