In [1]:
# import necessary packages
from geofetch import Geofetcher
import pysradb.sraweb
import pandas as pd
import os
import urllib.request
import sys
from IPython.display import display
from ipywidgets import interact, Dropdown, Button, Text
import traitlets
from traitlets import dlink
pd.set_option('display.max_rows', None)

  from tqdm.autonotebook import tqdm


In [2]:
# Makes the folder (inside the current working directory) that downloads go to.
# exist_ok=True keeps re-running this cell harmless, while genuine failures
# (e.g. a permission error) are raised instead of being silently swallowed.
curr_path = os.getcwd()
download_path = os.path.join(curr_path, 'downloaded_files')
os.makedirs(download_path, exist_ok=True)

#### GSE gathering functions

In [3]:
def find_sub_gse(super_gse):
    """
    Using a super series accession code, gets the sub series accession codes via webscrape

    Requests the GEO accession page for ``super_gse`` and collects the text of
    every link whose text starts with "GSE", leaving out the super series
    itself. Because of the crawl delay at the end, the call does not return
    until about 5 seconds after the page was received.

    input: super_gse (str)
    output: sub_gse (str list) - upper-cased accession codes
    """

    import re  # the stdlib module is sufficient for this simple prefix pattern
    import time

    import requests
    from bs4 import BeautifulSoup

    # Normalise the input so the self-exclusion below also works when the
    # caller passes a lower-case or padded accession code.
    super_gse = super_gse.strip().upper()

    # web scraping
    base_link = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='
    response = requests.get(base_link + super_gse)
    start = time.time()
    soup = BeautifulSoup(response.text, features='lxml')

    # anchors that carry an href and whose text starts with "GSE"
    pattern = re.compile(r'^GSE')
    links = soup.find_all('a', href=True, string=pattern)

    # finds sub gse accession codes
    sub_gse = []
    for link in links:
        accession = link.text.strip().upper()
        if accession != super_gse:
            sub_gse.append(accession)

    # to accommodate robots.txt, must wait 5 seconds before web scraping again
    elapsed = time.time() - start
    time.sleep(max(0.0, 5 - elapsed))

    return sub_gse

In [11]:
def get_series_files(gse):
    """
    uses geofetch to get supplemental data files using a GSE accession #

    input: gse (str)
    output: df of supp data file info, with columns renamed to match the SRA
            metadata table (organism_name, title, GSM, description) plus a
            study_alias column holding ``gse``. An empty DataFrame is returned
            when the fetched projects contain no '<gse>_series' entry.
    """
    # get supplemental data
    geof = Geofetcher(processed=True, data_source='all', just_metadata=True, discard_soft = True)
    proj = geof.get_projects(gse)

    # check if supplemental data exists
    gse_series = gse + '_series'
    if gse_series not in proj.keys():
        return pd.DataFrame()

    # clean columns, names for consistency when concatenating df's
    keep = ['file','sample_name', 'series_sample_organism', 'series_sample_id', 'file_url', 'series_overall_design','series_type']
    renames = {'series_sample_organism': 'organism_name',
               'sample_name':'title',
               'series_sample_id':'GSM',
               'series_overall_design':'description'}

    # Built as one chain that returns new frames at each step: adding the column
    # via assign() avoids writing into a slice of sample_table, which raised
    # pandas' SettingWithCopyWarning.
    return (
        proj[gse_series].sample_table
        .loc[:, keep]
        .assign(study_alias=gse)
        .rename(columns=renames)
        .reset_index(drop=True)
    )

In [12]:
def get_descriptions(gse, where_save):
    """
    Gets a compilation of metadata for selection

    Looks up the SRA run metadata for ``gse`` (or, when ``gse`` is a super
    series, for each sub series found by ``find_sub_gse``), appends the
    supplemental series files reported by ``get_series_files`` and writes the
    combined table to ``<where_save>/<GSE>_metadata.csv``.

    input: gse (str), where_save (str)
    output: df & downloads .csv. Returns None (after printing a message) when
            the input is not a valid accession or no sub series were found.
    """
    ### input validation ###
    # The type check has to come first: calling .strip() on a non-string would
    # raise before the message below could be shown.
    if not isinstance(gse, str):
        print("Incorrect formatting! Input must be a string starting with GSE followed by numbers")
        return
    gse = gse.strip().upper()
    # Checked before the prefix test, otherwise this branch is unreachable.
    if gse == '':
        print("please enter input")
        return
    if gse[:3] != 'GSE' or not gse[3:].isdigit():
        print("Incorrect formatting! Input must be a string starting with GSE followed by numbers")
        return
    ### end of validation ###
    print('Downloading {}...'.format(gse))

    def clean_df(df):
        # keep only the columns shown to the user and align their names with
        # the supplemental-file table so the two can be concatenated
        cols_keep = ['run_accession', 'experiment_title', 'study_alias', 'organism_name', 'experiment_alias', 'experiment_desc','total_spots', 'total_size', 'run_total_spots','run_total_bases']
        return df[cols_keep].rename(columns={'run_accession':'file',
                                    'experiment_title':'title',
                                     'experiment_desc': 'description',
                                    'experiment_alias' : 'GSM'})

    # gets relevant information
    geo_helper = pysradb.sraweb.SRAweb()
    gsm_srx = geo_helper.gse_to_gsm(gse)

    # SRP codes already looked up, so a plain series is not queried twice
    known_srp = {}
    try:
        known_srp[gse] = geo_helper.gse_to_srp(gse)['study_accession'][0] # gets the srp code
    except ValueError:
        # catches error that happens if gse is a super_gse, performs webscraping
        sub_gse_lst = find_sub_gse(gse)
    else:
        sub_gse_lst = [gse]
    # Any other exception propagates unchanged; previously the processing lived
    # in a `finally:` block and masked it with a NameError on sub_gse_lst.

    if not sub_gse_lst:
        print('No sub series found for {}; nothing to save'.format(gse))
        return

    sra_frames = []
    series_frames = []
    for sub_gse in sub_gse_lst:
        # for sra data
        srp = known_srp.get(sub_gse)
        if srp is None:
            srp = geo_helper.gse_to_srp(sub_gse)['study_accession'][0] # gets the srp code
        df = geo_helper.sra_metadata(srp) # contains meta info of data for gse
        # NOTE(review): gsm_srx was fetched for the top-level accession, not for
        # sub_gse - confirm this merge matches rows as intended for super series.
        sra_frames.append(pd.merge(gsm_srx, df))
        # for supplemental data
        sub_series = get_series_files(sub_gse)
        if sub_series.shape != (0, 0):
            series_frames.append(sub_series)

    # concatenate once instead of growing a frame inside the loop
    output = clean_df(pd.concat(sra_frames, axis=0, ignore_index=True))
    if series_frames:
        # concat srr (fastq) and supp series data frames
        output = pd.concat([output] + series_frames, axis=0, ignore_index=True)

    # turn output into csv, download it
    filename = os.path.join(where_save, gse + '_metadata.csv')
    output.to_csv(filename)
    print('\nA .csv version of this dataframe has been saved at {}'.format(filename))
    return output

In [13]:
def download_files(csv, path):
    """
    Downloads files that partial csv contains (all rows)

    Rows whose 'file' value contains 'SRR' are fetched by running the external
    `prefetch` and `fasterq-dump` commands; every other row is treated as a
    supplemental file and downloaded from its 'file_url'. Everything is saved
    under <path>/downloaded_files, which is created if needed.

    input: csv path and download path
    output: none
    """
    import subprocess

    def run_tool(args):
        # Echo the command, then run it from an argument list (no shell), so
        # paths containing spaces or shell metacharacters are passed intact.
        print(' '.join(args))
        try:
            status = subprocess.run(args, check=False).returncode
        except FileNotFoundError:
            print('Command not found: {}'.format(args[0]))
            return
        if status != 0:
            print('{} exited with status {}'.format(args[0], status))

    df = pd.read_csv(csv)

    # split btw fastq and supplemental files
    # (astype(str) keeps a missing 'file' value from crashing the substring test)
    file_ids = df['file'].astype(str)
    is_srr = file_ids.str.contains('SRR', regex=False).astype(bool)
    srr_to_download = file_ids[is_srr].to_list()
    supp_rows = df[~is_srr]

    # makes folder for files to download
    download_path = os.path.join(path, 'downloaded_files')
    os.makedirs(download_path, exist_ok=True)

    ### downloading supplemental files ###
    print('Downloading supplemental files...\n')
    for _, row in supp_rows.iterrows():
        file_url = row.get('file_url')
        if not isinstance(file_url, str) or not file_url.strip():
            print('Skipping {}: no file_url available'.format(row['file']))
            continue
        # force https regardless of the scheme stored in the csv
        download_link = 'https://' + file_url.split("//")[-1]
        file_name = os.path.join(download_path, download_link.split("/")[-1])
        urllib.request.urlretrieve(download_link, file_name)

    ### downloading fastq files ###
    print('Downloading FASTQ files... this make take a bit...\n')
    for srr in srr_to_download:
        # prefetch srr files for quicker download
        run_tool(['prefetch', srr, '-O', download_path])
        # download fastq
        # Separate names are used here on purpose: the `path` argument used to
        # be overwritten at this point, which corrupted download_path for every
        # run after the first.
        sra_dir = os.path.join(download_path, srr)
        sra_file = os.path.join(sra_dir, srr + '.sra')
        run_tool(['fasterq-dump', '--outdir', sra_dir, sra_file])

    print('\nComplete\n')

### GUI

In [14]:
def print_pretty_table(csv_path):
    """
    Loads the csv at csv_path and shows it with IPython's display
    input: csv_path (str)
    output: none
    """
    display(pd.read_csv(csv_path))

In [15]:
def get_maximum_k(directory):
    """
    Finds the largest k among entries named Partial_<k>_... in a directory
    input: directory (str) - folder to scan
    output: int - the largest k found, or 0 when there is none
    """
    k_values = []
    for name in os.listdir(directory):
        # only entries starting with Partial_ are of interest
        if not name.startswith("Partial_"):
            continue
        try:
            k_values.append(int(name.split("_")[1]))
        except ValueError:
            # e.g. "Partial_notes.txt": not one of the numbered files, so it
            # is skipped rather than aborting the whole scan
            continue

    return max(k_values, default=0)

In [16]:
def browse_and_select_rows(csv_path, column_name):
    """
    modified original method for binder use

    Shows a dropdown holding the unique values of ``column_name`` in the csv at
    ``csv_path``. Picking a value displays the matching rows and adds them to a
    running selection. The "Save and Download" button writes that selection to
    ``Partial_<k>_<csv name>.csv`` inside the module-level ``download_path``
    folder, hands the new file to ``download_files`` and resets the selection.

    input: csv_path (str), column_name (str)
    output: none (the widgets are displayed as a side effect)
    """
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_path)
    df = df.rename(columns={'Unnamed: 0':'index'})

    # Dropdown of the column's values; the leading '' is the blank placeholder
    options_list = [''] + df[column_name].unique().tolist()
    dropdown = Dropdown(options=options_list)

    # Partial csv files and downloads go to the notebook's download folder
    save_directory = download_path

    # Create a button widget for saving and downloading
    button = Button(description='Save and Download')

    # Rows picked so far (shared with the two callbacks below)
    selected_rows_df = pd.DataFrame()

    def display_filtered_rows(selected_item):
        """Show the rows matching selected_item and add them to the selection."""
        nonlocal selected_rows_df

        # Filter the DataFrame based on the selected item
        filtered_df = df[df[column_name] == selected_item]

        # Add the rows to the selection. Picking the same item again must not
        # add its rows a second time, otherwise the saved csv (and therefore
        # the download list) contains repeated files.
        selected_rows_df = (
            pd.concat([selected_rows_df, filtered_df], ignore_index=True)
            .drop_duplicates()
            .reset_index(drop=True)
        )

        # Display the rows for the current choice
        display(filtered_df)

        # Print the growing list of selected rows (long values are shortened)
        print("Selected Rows:")
        for i, value in enumerate(selected_rows_df[column_name]):
            text = str(value)
            row_entry = text[:60] + '...' if len(text) > 60 else text
            print(f"Row {i+1}: {row_entry}")
        print()  # Print an empty line for separation

    def save_and_download(_):
        """Button callback: save the selection as a Partial csv and download it."""
        nonlocal selected_rows_df

        # Check if any rows are selected
        if selected_rows_df.empty:
            print("No rows are selected.")
            return

        # Name of the original csv without its extension
        file_part = os.path.splitext(os.path.basename(csv_path))[0]

        # Highest k already used by a Partial_k file in the save directory
        k = get_maximum_k(save_directory)

        # Build the path of the next Partial_k file
        partial_k_filename = f"Partial_{k+1}_{file_part}.csv"
        save_path = os.path.join(save_directory, partial_k_filename)

        # Write the selected rows to a new CSV file
        selected_rows_df.to_csv(save_path, index=False)
        print(f"Saved CSV file: {save_path}")

        # download the files in partial csv
        download_files(save_path, save_directory)

        # Clear the selected rows DataFrame for the next iteration
        selected_rows_df = pd.DataFrame()

    # Connect the dropdown widget to the display function
    interact(display_filtered_rows, selected_item=dropdown)

    # Connect the button widget to the save and download function
    button.on_click(save_and_download)

    # Display the button widget
    display(button)

In [17]:
# makes selection easier for user end on binder
def preview_data_visual(gse):
    '''
    Runs get_descriptions for the given accession, using the module-level
    download_path as the save location. The returned table is discarded.

    input: gse (str) - GSE accession number
    output: none
    '''
    get_descriptions(gse, where_save=download_path)

In [18]:
def display_csv(csv_path):
    """
    Reads a csv into a dataframe for display on binder, using the file's
    first column as the index.

    input: csv_path (str) - path of csv to turn into a dataframe
    output: pandas DataFrame
    """
    frame = pd.read_csv(csv_path, index_col=0)
    return frame