In [None]:
# Mount Google Drive (Colab only: requires the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory
import os
import sys

# Define the TFM directory path
# NOTE(review): this path uses 'My Drive' while the data paths defined later in the
# notebook use 'MyDrive' — confirm both resolve to the same folder in the target runtime.
TFM_PATH = '/content/drive/My Drive/TFM'

# Change the current working directory to the TFM directory
os.chdir(TFM_PATH)
print(f"Current working directory changed to: {os.getcwd()}")

# Add the TFM directory to the Python system path so that modules stored there can be imported
if TFM_PATH not in sys.path:
    sys.path.append(TFM_PATH)
    print(f"'{TFM_PATH}' added to Python system path.")
else:
    print(f"'{TFM_PATH}' is already in Python system path.")

In [None]:
# Install the third-party libraries used for the HTTP requests and the HTML parsing

!pip install requests
!pip install beautifulsoup4

In [None]:
# Carga librerias
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import subprocess
import re
import extractData


In [None]:
# Drop columns from a dataset
def delete_columns_df(df, cols_to_delete):
  """Return ``df`` without the columns listed in ``cols_to_delete``.

  Column names that are not present in ``df`` are skipped silently
  (``errors='ignore'``). The frame passed in is left untouched; a new
  frame is returned.
  """
  return df.drop(labels=cols_to_delete, axis='columns', errors='ignore')

In [None]:
# Collects the download URLs listed on a participant's profile page.
# The parsing is tightly coupled to the layout of that page.
def _profile_cell_text(cells, idx):
    """Return the stripped text of ``cells[idx]``, or '' when that column is unavailable."""
    if 0 <= idx < len(cells):
        return cells[idx].text.strip()
    return ''


def get_url_download_from_profile(participant_id, soup, web_base, data_list=None):
    """Extract the rows of the 'Uploaded data' table of a participant profile.

    Args:
        participant_id: Identifier stored in the ``participant_id`` column of the result.
        soup: Parsed profile page (BeautifulSoup-like object exposing ``find``).
        web_base: Site base URL, prepended to hrefs that start with '/'.
        data_list: Optional collection of data-type names. When empty or ``None``
            every row is returned; otherwise only rows whose 'Data type' cell
            (stripped) equals one of the names (also stripped) are returned.

    Returns:
        pandas.DataFrame with the columns ``participant_id``, ``date``,
        ``data_type`` and ``download_url``. It is empty when the expected page
        structure is not found; a diagnostic message is printed in that case.
        Rows without a download link keep ``download_url == ''``; a missing
        'Date' or 'Data type' column yields '' for that field.
    """
    columns = ['participant_id', 'date', 'data_type', 'download_url']
    records = []
    # Normalise the requested types once: the cell text is compared stripped, so
    # stray whitespace in the requested names must not prevent a match.
    wanted_types = {name.strip() for name in (data_list or [])}

    # 1. Locate the 'Uploaded data' heading (tolerating surrounding whitespace)
    uploaded_data_header = soup.find('h3', string=re.compile(r'^\s*Uploaded data\s*$'))
    if uploaded_data_header is None:
        print(f"'Uploaded data' header not found for {participant_id}.")
        return pd.DataFrame(records, columns=columns)

    # 2. After the <h3> heading, look for the <div> with class profile-data
    profile_data_div = uploaded_data_header.find_next_sibling('div', class_='profile-data')
    if profile_data_div is None:
        print(f"'profile-data' div not found after 'Uploaded data' header for {participant_id}.")
        return pd.DataFrame(records, columns=columns)

    # 3. Look for the table inside that div
    table = profile_data_div.find('table')
    if table is None:
        print(f"No table found in 'profile-data' div for {participant_id}.")
        return pd.DataFrame(records, columns=columns)

    # 4. Map header titles to column positions (the last occurrence of a title wins)
    thead = table.find('thead')
    header_cells = thead.find_all('th') if thead is not None else []
    positions = {th.get_text(strip=True): i for i, th in enumerate(header_cells)}
    download_col_idx = positions.get('Download', -1)
    date_col_idx = positions.get('Date', -1)
    data_type_col_idx = positions.get('Data type', -1)

    if download_col_idx == -1:
        print(f"'Download' column not found in table for {participant_id}.")
        return pd.DataFrame(records, columns=columns)

    # 5. Walk the rows; rows without enough <td> cells (e.g. the header row) are skipped
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= download_col_idx:
            continue

        data_type = _profile_cell_text(cells, data_type_col_idx)
        if wanted_types and data_type not in wanted_types:
            continue

        # 6. Read the href of the <a> tag inside the 'Download' cell
        full_url = ''
        download_link_tag = cells[download_col_idx].find('a')
        if download_link_tag is not None and download_link_tag.has_attr('href'):
            relative_href = download_link_tag['href']
            # 7. Site-relative hrefs are turned into absolute URLs
            if relative_href.startswith('/'):
                full_url = web_base.rstrip('/') + relative_href
            else:
                full_url = relative_href

        records.append({
            'participant_id': participant_id,
            'date': _profile_cell_text(cells, date_col_idx),
            'data_type': data_type,
            'download_url': full_url,
        })

    # Build the frame once instead of concatenating row by row
    return pd.DataFrame(records, columns=columns)


In [None]:
# Given a list of participants, collects the metadata of the data listed on their
# profiles and, when requested, downloads the files into the given directory
def _download_participant_files(download_urls, participant_dir):
    """Download everything reachable from ``download_urls`` into ``participant_dir``.

    Each URL is parsed with ``extractData.get_html_parser``. When
    ``extractData.get_list_genetic_data`` returns files for the parsed page they
    are downloaded one by one; otherwise the resolved URL itself is downloaded.
    """
    for url in download_urls:
        if not url:
            continue

        # Parse the page behind the download link
        orig_url, filename, page_soup = extractData.get_html_parser(url)

        # List of files to download (empty when the page could not be parsed)
        download_file = extractData.get_list_genetic_data(page_soup, orig_url) if page_soup else []

        if len(download_file) == 0:
            # No file list found: try to download the resolved URL itself
            print(f"Descargando... {orig_url}")
            extractData.get_download_file(orig_url, participant_dir, filename)
        else:
            # Download the listed files one by one
            for url_file in download_file:
                print(f"Descargando... {url_file}")
                extractData.get_download_file(url_file, participant_dir, None)


def get_data_participants(participant_list, web_base, web_profile, genetic_path, data_type_list=None, download=False):
    """Collect (and optionally download) the data listed on each participant profile.

    Args:
        participant_list: Iterable of participant ids.
        web_base: Site base URL, forwarded to ``get_url_download_from_profile``.
        web_profile: URL prefix to which the participant id is appended.
        genetic_path: Directory under which one sub-directory per participant is
            created when ``download`` is true.
        data_type_list: Optional data-type names used to filter the rows;
            empty or ``None`` keeps every row.
        download: When true, the files behind the collected URLs are downloaded.

    Returns:
        pandas.DataFrame with the columns ``participant_id``, ``date``,
        ``data_type`` and ``download_url``, one row per collected table row.
        ``requests`` errors are printed and the affected participant is skipped.
    """
    columns = ['participant_id', 'date', 'data_type', 'download_url']
    data_type_list = list(data_type_list) if data_type_list else []
    # Frames are gathered here and concatenated once at the end
    frames = []

    for participant_id in participant_list:
        print(f"\nProcessing participant: {participant_id}")

        participant_dir = None
        if download:
            participant_dir = os.path.join(genetic_path, participant_id)
            os.makedirs(participant_dir, exist_ok=True)
            print(f"Ensured directory exists: {participant_dir}")

        profile_url = web_profile + participant_id
        print(f"Fetching profile from: {profile_url}")

        try:
            final_url, filename, soup = extractData.get_html_parser(profile_url)

            # No parsed content: the profile page was not found, move on to the next one
            if not soup:
                print(f"No soup found for {participant_id}.")
                continue

            # Collect the download URLs listed on the participant profile
            download_df = get_url_download_from_profile(participant_id, soup, web_base, data_type_list)
            if not download_df.empty:
                frames.append(download_df)
            download_urls = download_df['download_url'].tolist()

            if not download_urls:
                print(f"No download links found for {participant_id}.")
                continue

            print(f"Found {len(download_urls)} download links for {participant_id}.")
            if download:
                _download_participant_files(download_urls, participant_dir)

        except requests.exceptions.HTTPError as errh:
            print(f"HTTP Error for {participant_id}: {errh}")
        except requests.exceptions.ConnectionError as errc:
            print(f"Error Connecting for {participant_id}: {errc}")
        except requests.exceptions.Timeout as errt:
            print(f"Timeout Error for {participant_id}: {errt}")
        except requests.exceptions.RequestException as err:
            print(f"An unexpected error occurred for {participant_id}: {err}")

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


In [None]:
# Configuration
# Data folders and survey file names
folder_drive = '/content/drive/MyDrive/TFM/data/'
survey_data ="surveys/"
genetic_path = folder_drive + 'genetic_data/'
file_circulatoryDisease= 'PGPTrait&DiseaseSurvey2012_CirculatorySystem-20181010220109.csv'
file_general = 'PGPParticipantSurvey-20181010220019.csv'
file_basic_phenotypes = 'PGPBasicPhenotypesSurvey2015-20181010214636.csv'

# Survey columns used by the notebook
cols_to_delete =['Do not touch!']
col_name_genetic = 'Have you uploaded genetic data to your PGP participant profile?'
col_name_healthRecord = 'Have you uploaded health record data using our Google Health or Microsoft Healthvault interfaces?'
column_name = 'Have you ever been diagnosed with one of the following conditions?'

# PGP site URLs
pgp_web_base = 'https://my.pgp-hms.org'
pgp_web_profile = pgp_web_base + '/profile/'

# Data-type names as they are compared against the (whitespace-stripped) 'Data type'
# cell of the profile table. The names must not carry leading/trailing whitespace,
# otherwise they can never match.
genome_list = ['biometric data - CSV or similar','23andMe','Complete Genomics','Counsyl','DeCode','Family Tree DNA','Gencove',
                  'Illumina','Knome','Navigenics','Pathway genomics','Veritas Genetics','AncestryDNA','Ancestry DNA','Ancestry','MyHeritage',
               'Geno 2.0 Ancestry Report']
complete_genome_list =['Complete Genomics','Veritas Genetics',
                       'Nebula','Sequencing.com','Full Genomes','whole genome','Nebula Genomics VCF file',
                       'Dante Labs full genome VCF Indel','genetic data - HiFi Reads','dante labs 2021 bloodkit whole genome','nebula genomics bam file',
                       'nebula genomics data vcf','VCF from Dante Labs vs GRCh37 (gz)','genetic data - Dante Labs','from health nucleus',
                       'Helix','Dante Labs WGS','Helix Exome','Dante Labs VCF File','.vcf.gz','genetic data - FGC Chromium 60x','genetic data - CeGaT',
                       'genetic data - FullDNA','genetic data-Full Genomes 60x chromium','Genetic data -- Full Genomes',
                       'Zip of 4x full genome analysis from Full Genome Corp','Full Genomes Corporation, Inc. (Novogene)']
# There are still more data types than the ones listed here
health_records_list =['health records - CCR XML','health records - PDF or text','image','Microbiome','Color Health']
col_id = 'Participant'
sep=','

In [None]:
# Load the survey with the cardiovascular conditions
folder = folder_drive + survey_data
df_data = extractData.load_csv_to_dataframe(folder, file_circulatoryDisease)

# Extract every condition found in the survey column
all_cv_conditions = extractData.extract_conditions(df_data, column_name, sep)

# Build the participants dataset from the survey and the extracted conditions
df_participants = extractData.create_conditions_df(
    df_data, all_cv_conditions, col_id, column_name, sep
)

# Load the general survey, left-join it on the participant id and drop the unwanted columns
df_general = extractData.load_csv_to_dataframe(folder, file_general)
df_participants = (
    df_participants
    .merge(df_general, on='Participant', how='left')
    .pipe(delete_columns_df, cols_to_delete)
)

# Load the basic phenotypes survey, left-join it as well and drop the unwanted columns again
df_basicphenotypes = extractData.load_csv_to_dataframe(folder, file_basic_phenotypes)
df_participants = (
    df_participants
    .merge(df_basicphenotypes, on='Participant', how='left')
    .pipe(delete_columns_df, cols_to_delete)
)


In [None]:
# Filter masks
# Participants with hypertension, participants who report uploaded genetic data and
# participants who report uploaded health records.
condition_hypertension = df_participants['Hypertension'].eq(1)
condition_genetic_data = df_participants[col_name_genetic].isin(['Yes','Yes, I have uploaded genetic data'])
condition_health_record = df_participants[col_name_healthRecord].eq('Yes')

# Only the hypertension and genetic-data masks are applied here;
# the health-record mask is computed but not part of this filter.
filtered_participants_df = df_participants.loc[condition_hypertension & condition_genetic_data]

# Distinct participant ids, in order of first appearance
participants_list = filtered_participants_df['Participant'].drop_duplicates().tolist()

# Total number of participants that satisfy the filter
total_participants_count = len(participants_list)

# Results
print(f"Nº total de participantes que cumplen los criterios: {total_participants_count}")



In [None]:
# Create the directory that stores the genetic data
if not os.path.exists(genetic_path):
    os.makedirs(genetic_path)
    print(f"Directory '{genetic_path}' created successfully.")
else:
    print(f"Directory '{genetic_path}' already exists.")

# Walk through the participants with hypertension that report genetic data and collect
# the metadata of everything listed on their profiles (download=False: nothing is downloaded)
print(f"Starting data download for {len(participants_list)} participants.")
data_desc_df = get_data_participants(participants_list,pgp_web_base,pgp_web_profile,genetic_path,data_type_list=[],download = False)

# The count is computed outside the f-string: reusing the same quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
participants_with_data_count = len(data_desc_df['participant_id'].unique())
print(f"Nª de participantes con hipertension y que indica en los registros que tienen datos genómicos {participants_with_data_count}")
data_desc_df.to_csv(folder_drive +'participantes_hipertension_geneticos.csv', sep=',', index=False)

In [None]:
## Test cell
# Scratch cell that exercises the crawler for a single participant.
# NOTE: it calls get_data_participants with download=True and rebinds `data_desc_df`.
# participant_list =['hu654B61']
participant_list = ['huE31062']

# print(get_real_url('https://my.pgp-hms.org/user_file/download/96'))


# Parse one download URL and show the resolved URL and file name returned by the helper
original_url = 'https://my.pgp-hms.org/user_file/download/96'
final_url, filename, soup = extractData.get_html_parser(original_url)
print(final_url)
print(filename)

data_desc_df = get_data_participants(participant_list, pgp_web_base, pgp_web_profile, genetic_path, data_type_list=[], download=True)

In [None]:
# Take the ids of the participants with hypertension,
# use the crawler to check whether they have genetic data
# and record the type of genetic data they have

# Mask of the participants with hypertension
condition_hypertension = df_participants['Hypertension'] == 1

# Apply the mask
filtered_participants_hypertension_df = df_participants[condition_hypertension]

# Distinct participant ids
participants_hypertension_list = filtered_participants_hypertension_df['Participant'].unique().tolist()

# Total number of participants
total_participants_count = len(participants_hypertension_list)
hypertension_folder = folder_drive + 'hypertension_data/'

# Collect only the rows whose data type is in complete_genome_list (download=False: metadata only)
participant_info_genomic_data = get_data_participants(participants_hypertension_list,pgp_web_base,pgp_web_profile,hypertension_folder,complete_genome_list,download=False)

# The id list is computed before printing: reusing the same quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
participant_hyper_list = participant_info_genomic_data['participant_id'].unique().tolist()

print(f"Nª de participantes con hipertension y datos genómicos {len(participant_hyper_list)}")

participant_info_genomic_data.to_csv(folder_drive +'participantes_hipertension.csv', sep=',', index=False)

In [None]:
# Keep only the rows of the participants with hypertension that have "complete genetic records".
# NOTE: this reads `participant_info_genomic_data`, a name that the control-group cell rebinds,
# so the cell must run right after the hypertension crawl.
participant_hyper_list = participant_info_genomic_data['participant_id'].unique().tolist()

# Build the mask from the frame being filtered: a mask taken from df_participants has a
# different index and only works through implicit reindexing (pandas emits a UserWarning).
participants_hyper_gen_df = filtered_participants_hypertension_df[
    filtered_participants_hypertension_df['Participant'].isin(participant_hyper_list)
]

print(len(participants_hyper_gen_df))
print(len(participants_hyper_gen_df['Participant'].unique()))

print(len(participant_hyper_list))
print(participant_hyper_list)

print(participants_hyper_gen_df.columns)
# print(participants_hyper_gen_df.head())


In [None]:
# Take the ids of the participants WITHOUT hypertension,
# use the crawler to check whether they have genetic data
# and record the type of genetic data they have

# Mask of the participants WITHOUT hypertension
condition = df_participants['Hypertension'] == 0

# Apply the mask
filtered_participants_control_df = df_participants[condition]

# Distinct participant ids
participants_control_list = filtered_participants_control_df['Participant'].unique().tolist()

# Total number of participants
total_participants_count = len(participants_control_list)

hypertension_folder = folder_drive + 'hypertension_data/'
print(f"Nª de participantes SIN hipertension: {total_participants_count}")

# NOTE: this rebinds `participant_info_genomic_data`, which held the hypertension results before
participant_info_genomic_data = get_data_participants(participants_control_list,pgp_web_base,pgp_web_profile,hypertension_folder,complete_genome_list,download=False)

# The id list is computed before printing: reusing the same quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
participant_control_list = participant_info_genomic_data['participant_id'].unique().tolist()

print(f"Nª de participantes SIN hipertension y con datos genómicos {len(participant_control_list)}")

participant_info_genomic_data.to_csv(folder_drive +'participantes_control.csv', sep=',', index=False)

In [None]:
# Keep only the rows of the participants WITHOUT hypertension that have "complete genetic records".
# NOTE: this reads `participant_info_genomic_data` as left by the control-group crawl.
participant_control_list = participant_info_genomic_data['participant_id'].unique().tolist()

# Build the mask from the frame being filtered: a mask taken from df_participants has a
# different index and only works through implicit reindexing (pandas emits a UserWarning).
participants_control_gen_df = filtered_participants_control_df[
    filtered_participants_control_df['Participant'].isin(participant_control_list)
]

print(len(participants_control_gen_df))
print(len(participants_control_gen_df['Participant'].unique()))

print(len(participant_control_list))
print(participant_control_list)

print(participants_control_gen_df.columns)
# print(participants_hyper_gen_df.head())


In [None]:
# Select the ids of the participants with hypertension
condition_hypertension = df_participants['Hypertension'].eq(1)
filtered_participants_hypertension_df = df_participants.loc[condition_hypertension]
participants_hypertension_list = (
    filtered_participants_hypertension_df['Participant']
    .drop_duplicates()
    .tolist()
)

print(f"Number of participants in `participants_hypertension_list`: {len(participants_hypertension_list)}")
print(f"First 5 participants: {participants_hypertension_list[:5]}")

Number of participants in `participants_hypertension_list`: 414
First 5 participants: ['hu68A2D7', 'hu46125C', 'hu1EE386', 'huE31062', 'hu7123C1']


In [None]:
# URL of the PGP page that is scraped for the public genetic data table
public_genetic_data_url = 'https://my.pgp-hms.org/public_genetic_data'
print(f"Public genetic data URL set to: {public_genetic_data_url}")

In [None]:
# Hace un crawler de la página 'https://my.pgp-hms.org/public_genetic_data'
# recogiendo los metadatos de todos los ficheros cargados por los usuarios
def parse_file_size(size_str):
    """Split a size string such as '12.5 MB' into ``(value, unit)``.

    The number must be at the start of the string; it may be followed by
    whitespace and a unit made of an optional K/M/G/T prefix plus 'B'.

    Returns:
        ``(float, str)`` on success, ``(None, None)`` when ``size_str`` is
        missing (``pd.isna``) or does not start with that pattern.
    """
    if pd.isna(size_str):
        return None, None

    parsed = re.match(r'(\d+\.?\d*)\s*([KMGT]?B)', str(size_str))
    if parsed is None:
        return None, None

    number, unit = parsed.groups()
    return float(number), unit.strip()


def _parse_public_genetic_row(cols, col_indices, pgp_web_base):
    """Turn the <td> cells of one row of the public genetic data table into a record dict."""
    record = {}

    # The participant cell may read '<id>, <source>'; split on the first comma only
    participant_text = cols[col_indices['participant_id']].get_text(strip=True)
    participant_id, comma, source = participant_text.partition(',')
    record['participant_id'] = participant_id.strip()
    record['source'] = source.strip() if comma else None

    record['publication_date'] = cols[col_indices['publication_date']].get_text(strip=True)
    record['data_type'] = cols[col_indices['data_type']].get_text(strip=True)
    record['file_name'] = cols[col_indices['file_name']].get_text(strip=True)

    # The file size is the parenthesised text inside the 'Download' cell
    download_cell = cols[col_indices['download_url']]
    file_size_match = re.search(r'\((.*?)\)', download_cell.get_text(separator=' ', strip=True))
    if file_size_match:
        record['file_size'] = file_size_match.group(1).strip()
        record['file_size_value'], record['file_size_unit'] = parse_file_size(record['file_size'])
    else:
        record['file_size'] = None
        record['file_size_value'] = 0
        record['file_size_unit'] = ''

    # Site-relative hrefs are turned into absolute URLs; no link leaves ''
    full_download_url = ''
    download_link = download_cell.find('a')
    if download_link is not None and download_link.has_attr('href'):
        relative_href = download_link['href']
        if relative_href.startswith('/'):
            full_download_url = pgp_web_base.rstrip('/') + relative_href
        else:
            full_download_url = relative_href
    record['download_url'] = full_download_url

    return record


def scrape_public_genetic_data(public_genetic_data_url, pgp_web_base):
    """Scrape the table of the public genetic data page into a DataFrame.

    Args:
        public_genetic_data_url: URL of the page holding the table.
        pgp_web_base: Site base URL, prepended to hrefs that start with '/'.

    Returns:
        pandas.DataFrame with one row per table row and the columns
        ``participant_id``, ``source``, ``publication_date``, ``data_type``,
        ``file_name``, ``file_size``, ``file_size_value``, ``file_size_unit``
        and ``download_url``. An empty DataFrame is returned (and a message is
        printed) when the page, the table, its header or an expected column is
        missing. Errors raised while scraping are printed, and the rows
        collected up to that point are returned.
    """
    # Map the table headers to the DataFrame column names
    header_mapping_to_target = {
        'Participant': 'participant_id',
        'Published': 'publication_date',
        'Data type': 'data_type',
        'Name': 'file_name',
        'Download': 'download_url'
    }

    data_records = []
    try:
        orig_url, filename, soup = extractData.get_html_parser(public_genetic_data_url)
        if not soup:
            print(f"No content retrieved from {public_genetic_data_url}.")
            return pd.DataFrame()

        table = soup.find('table')
        if table is None:
            print("Could not find table on the page.")
            return pd.DataFrame()

        thead = table.find('thead')
        if thead is None:
            print("Could not find table header on the page.")
            return pd.DataFrame()

        headers = [th.get_text(strip=True) for th in thead.find_all('th')]
        print(f"Found table headers: {headers}")

        # Resolve the position of every expected column; all of them are required
        col_indices = {}
        for actual_header, target_col_name in header_mapping_to_target.items():
            if actual_header not in headers:
                print(f"Missing expected header in page: '{actual_header}'. Cannot proceed with scraping.")
                return pd.DataFrame()
            col_indices[target_col_name] = headers.index(actual_header)

        # Rows with fewer cells than the right-most expected column (the header row,
        # placeholder rows) are skipped so that one short row cannot abort the scrape.
        min_cells = max(col_indices.values()) + 1
        for row in table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) < min_cells:
                continue
            data_records.append(_parse_public_genetic_row(cols, col_indices, pgp_web_base))

    except requests.exceptions.RequestException as e:
        print(f"Error fetching or parsing URL: {e}")
    except Exception as e:
        # Deliberate best effort: report the problem and return what was collected
        print(f"An unexpected error occurred during scraping: {e}")

    public_genetic_df = pd.DataFrame(data_records)
    return public_genetic_df

# Scrape https://my.pgp-hms.org/public_genetic_data and load its table into a DataFrame
public_genetic_df = scrape_public_genetic_data(public_genetic_data_url, pgp_web_base)

print("Scraped Public Genetic Data:")
print(public_genetic_df.head())
print(f"Total records scraped: {len(public_genetic_df)}")

# Save the scraped records to a CSV file in the data folder
public_genetic_df.to_csv(folder_drive +'public_genetic_data.csv', sep=',', index=False)