In [41]:
# import necessary packages

!pip install traitlets

import os
import subprocess
import sys
import urllib.request

import pandas as pd
import pysradb.sraweb
import traitlets
from geofetch import Geofetcher
from IPython.display import display
from ipywidgets import interact, Dropdown, Button, Text
#from IPython.utils import traitlets
from traitlets import dlink

[0m

### Function 1 : get metadata

In [42]:
def get_series_files(gse):
    """
    Looks up the series-level supplemental files of a GSE code with geofetch
    input: gse (str)
    output: df of supp data file info (columns file, title, organism_name,
            GSM, file_url, description, series_type); an empty df when
            geofetch returns no '<gse>_series' project
    """
    fetcher = Geofetcher(processed=True, data_source='all', just_metadata=True, discard_soft = True)
    projects = fetcher.get_projects(gse)

    series_key = gse + '_series'
    if series_key not in projects.keys():
        # no supplemental series data for this accession
        return pd.DataFrame()

    # columns to keep, and the names they take so this table lines up with
    # the SRA metadata table when the two are concatenated
    wanted_columns = ['file','sample_name', 'series_sample_organism', 'series_sample_id', 'file_url', 'series_overall_design','series_type']
    new_names = {
        'series_sample_organism': 'organism_name',
        'sample_name': 'title',
        'series_sample_id': 'GSM',
        'series_overall_design': 'description',
    }

    series_table = projects[series_key].sample_table[wanted_columns]
    renamed_table = series_table.rename(columns=new_names)
    return renamed_table.reset_index(drop=True)

In [43]:
def get_descriptions(gse, where_save):
    """
    Gets a compilation of metadata for selection
    input: gse (str) - GEO series code, must start with 'GSE'
           where_save (str) - directory the .csv is written to (created if missing)
    output: df of SRA run metadata, followed by the series supplemental file
            rows when there are any; also saved as <gse>_metadata.csv in where_save.
            When gse fails validation an error message (str) is returned instead.
    """
    ### input validation ###
    if not isinstance(gse, str) or not gse.startswith('GSE'):
        return "input must be a string starting with GSE followed by numbers"
    ### end of validation ###

    # gets relevant information
    geo_helper = pysradb.sraweb.SRAweb()
    gsm_srx = geo_helper.gse_to_gsm(gse)
    # NOTE(review): assumes the lookup returns at least one row with a
    # 'study_accession' column - confirm for series without SRA data
    srp = geo_helper.gse_to_srp(gse)['study_accession'][0] # gets the srp code
    df = geo_helper.sra_metadata(srp) # contains meta info of data for gse
    output = pd.merge(gsm_srx, df)

    # column and name cleaning for consistency for easier concat of df's
    cols_keep = ['run_accession', 'experiment_title', 'organism_name', 'experiment_alias', 'experiment_desc','total_spots', 'total_size', 'run_total_spots','run_total_bases']
    output = output[cols_keep].rename(columns={'run_accession':'file',
                                               'experiment_title':'title',
                                               'experiment_desc': 'description',
                                               'experiment_alias' : 'GSM'})
    series_df = get_series_files(gse)
    if not series_df.empty:
        # concat srr (fastq) and supp series data frames
        output = pd.concat([output, series_df], axis=0).reset_index(drop=True)

    # turn output into csv, download it
    if where_save:
        # make sure the target directory exists so to_csv does not fail
        os.makedirs(where_save, exist_ok=True)
    csv_name = gse + '_metadata.csv'
    filename = os.path.join(where_save, csv_name)
    # the index is written on purpose: it is read back as the 'Unnamed: 0' column
    output.to_csv(filename)
    print('\nA .csv version of this dataframe has been saved as {} at {}'.format(csv_name, where_save))

    return output

### Function 2: download data from metadata

In [44]:
def _run_sra_tool(args):
    """
    Prints and runs one SRA Toolkit command without going through a shell
    input: args (list of str) - executable followed by its arguments
    output: exit code of the command (127 when the executable is not found)
    """
    print(' '.join(args))
    try:
        returncode = subprocess.run(args).returncode
    except FileNotFoundError:
        print('{} was not found; is the SRA Toolkit installed and on PATH?'.format(args[0]))
        return 127
    if returncode != 0:
        print('{} exited with status {}'.format(args[0], returncode))
    return returncode


def download_files(csv, path):
    """
    downloads files that partial csv contains (all rows)
    input: csv path and download path

    Rows whose 'file' value contains 'SRR' are fetched with prefetch and
    converted with fasterq-dump; every other row is downloaded from its
    'file_url'. Everything is written under <path>/downloaded_files.
    """
    df = pd.read_csv(csv)

    # split btw fastq and supplemental files
    # (astype(str) keeps a missing 'file' value from breaking the substring test)
    files = df['file'].astype(str)
    is_srr = files.str.contains('SRR', regex=False).astype(bool)
    srr_to_download = files[is_srr].to_list()
    supp_rows = df[~is_srr]

    # makes folder for files to download
    download_path = os.path.join(path, 'downloaded_files')
    os.makedirs(download_path, exist_ok=True)

    ### downloading supplemental files ###
    print('Downloading supplemental files...\n')
    for _, row in supp_rows.iterrows():
        file_url = row['file_url']
        if pd.isna(file_url):
            print('Skipping {}: no file_url to download from'.format(row['file']))
            continue
        # swap whatever scheme the url has for https
        download_link = 'https://' + file_url.split("//")[-1]
        file_name = os.path.join(download_path, download_link.split("/")[-1])
        urllib.request.urlretrieve(download_link, file_name)

    ### downloading fastq files ###
    print('Downloading FASTQ files... this make take a bit...\n')
    for srr in srr_to_download:
        # download_path is computed once above and `path` is never reassigned,
        # so every run lands in the same downloaded_files folder
        srr_dir = os.path.join(download_path, srr)
        # prefetch srr files for quicker download
        if _run_sra_tool(['prefetch', srr, '-O', download_path]) != 0:
            # nothing was fetched, so there is no .sra file to convert
            continue
        # download fastq
        sra_file = os.path.join(srr_dir, srr + '.sra')
        _run_sra_tool(['fasterq-dump', '--outdir', srr_dir, sra_file])

    print('\nComplete\n')

### GUI

In [45]:
def print_pretty_table(csv_file):
    """
    Shows the contents of a csv file as a table in the notebook
    input: csv_file (str) - path of the csv to read
    """
    # load the csv with pandas and hand it straight to IPython's display
    display(pd.read_csv(csv_file))

In [46]:
def get_maximum_k(directory):
    """
    Finds the highest k among the Partial_<k>_... files in a directory
    input: directory (str)
    output: largest k found (int), or 0 when no file carries a usable k
    """
    k_values = []
    for name in os.listdir(directory):
        # only files starting with Partial_ take part in the numbering
        if not name.startswith("Partial_"):
            continue
        try:
            k_values.append(int(name.split("_")[1]))
        except ValueError:
            # e.g. 'Partial_notes.txt': not one of the numbered files, skip it
            continue

    return max(k_values, default=0)

In [47]:
def browse_and_select_rows(csv_file, column_name):
    """
    Interactive row picker for a metadata csv
    input: csv_file (str) - path of the csv to browse
           column_name (str) - column whose unique values fill the dropdown
                               ('Unnamed: 0' is available as 'index')

    Every value picked in the dropdown adds its matching rows to a running
    selection. 'Save and Download' writes the selection to
    Partial_<k>_<csv name>.csv in the save path and passes that file to
    download_files, then empties the selection.
    """
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)
    df = df.rename(columns={'Unnamed: 0':'index'})

    # Dropdown of the column's unique values, starting on a blank entry
    options_list = [''] + df[column_name].unique().tolist()
    dropdown = Dropdown(options=options_list)

    # Text box for the save path and the button that triggers the download
    save_path_input = Text(value='./', description='Save Path:')
    button = Button(description='Save and Download')

    # Keep the text box's placeholder in step with whatever has been typed
    traitlets.dlink((save_path_input, 'value'), (save_path_input, 'placeholder'))

    # Rows picked so far
    selected_rows_df = pd.DataFrame()

    # Function to display filtered rows based on the selected item
    def display_filtered_rows(selected_item):
        nonlocal selected_rows_df

        # Filter the DataFrame based on the selected item
        filtered_df = df[df[column_name] == selected_item]

        if not filtered_df.empty:
            # Add the rows to the selection; picking the same item twice must
            # not queue its files for download twice
            selected_rows_df = pd.concat(
                [selected_rows_df, filtered_df], ignore_index=True
            ).drop_duplicates(ignore_index=True)

        # Display the rows matching the current pick
        display(filtered_df)

        # Print the growing list of selected rows, long values cut at 60 chars
        print("Selected Rows:")
        for i in range(len(selected_rows_df)):
            entry = str(selected_rows_df.loc[i, column_name])
            row_entry = entry[:60] + '...' if len(entry) > 60 else entry
            print(f"Row {i+1}: {row_entry}")
        print()  # Print an empty line for separation

    # Function to handle the save and download button click event
    def save_and_download(_):
        nonlocal selected_rows_df

        # Check if any rows are selected
        if selected_rows_df.empty:
            print("No rows are selected.")
            return

        # Directory typed into the text box; blank means the current directory
        save_directory = save_path_input.value or '.'
        os.makedirs(save_directory, exist_ok=True)

        # Name of the original CSV without directory or extension
        file_part = os.path.splitext(os.path.basename(csv_file))[0]

        # Find the next available k for the Partial_k file name in the directory
        k = get_maximum_k(save_directory)
        partial_k_filename = f"Partial_{k+1}_{file_part}.csv"
        save_path = os.path.join(save_directory, partial_k_filename)

        # Write the selected rows to a new CSV file
        selected_rows_df.to_csv(save_path, index=False)
        print(f"Saved CSV file: {save_path}")

        # download the files in partial csv
        download_files(save_path, save_directory)

        # Clear the selected rows DataFrame for the next iteration
        selected_rows_df = pd.DataFrame()

    # Connect the dropdown widget to the display function
    interact(display_filtered_rows, selected_item=dropdown)

    # Connect the button widget to the save and download function
    button.on_click(save_and_download)

    # Display the input text widget and button widget
    display(save_path_input, button)

### Testing
This test uses the GSE code of the Glass SALL1 data.

In [29]:
# this example uses SALL1 data
sal1_gse = 'GSE226090'
# builds the metadata table and writes GSE226090_metadata.csv into the given
# directory; the trailing ';' keeps the returned value out of the cell output
output = get_descriptions(sal1_gse, '/home/pking/outputs/');

Metadata folder: /home/pking/Desktop/download_geo/project_name
Trying GSE226090 (not a file) as accession...
Trying GSE226090 (not a file) as accession...
Skipped 0 accessions. Starting now.
[38;5;200mProcessing accession 1 of 1: 'GSE226090'[0m
Total number of processed SERIES files found is: 20
Expanding metadata list...
Expanding metadata list...


Finished processing 1 accession(s)
Cleaning soft files ...
No files found. No data to save. File /home/pking/Desktop/download_geo/project_name/GSE226090_samples/GSE226090_samples.csv won't be created
Unifying and saving of metadata... 


A .csv version of this dataframe has been saved as /home/pking/outputs/GSE226090_metadata.csv at /home/pking/outputs/GSE226090_metadata.csv


Usage Example

In [50]:
# preview the first rows of the metadata table
output.head()

Unnamed: 0,file,title,organism_name,GSM,description,total_spots,total_size,run_total_spots,run_total_bases,file_url,series_type
0,SRR23613876,"GSM7063497: PU1+ nuclei, H3K27ac Input ChIPseq...",Mus musculus,GSM7063497,"GSM7063497: PU1+ nuclei, H3K27ac Input ChIPseq...",10680471,656843723,10680471,2157455142,,
1,SRR23613877,"GSM7063496: PU1+ nuclei, H3K27ac Input ChIPseq...",Mus musculus,GSM7063496,"GSM7063496: PU1+ nuclei, H3K27ac Input ChIPseq...",9623829,594129205,9623829,1944013458,,
2,SRR23613880,"GSM7063493: Olig2+ nuclei, H3K27ac Input ChIPs...",Mus musculus,GSM7063493,"GSM7063493: Olig2+ nuclei, H3K27ac Input ChIPs...",7819623,481494974,7819623,1579563846,,
3,SRR23613882,"GSM7063492: Olig2+ nuclei, H3K27ac Input ChIPs...",Mus musculus,GSM7063492,"GSM7063492: Olig2+ nuclei, H3K27ac Input ChIPs...",10676675,661452028,10676675,2156688350,,
4,SRR23613885,"GSM7063489: NeuN+ nuclei, H3K27ac Input ChIPse...",Mus musculus,GSM7063489,"GSM7063489: NeuN+ nuclei, H3K27ac Input ChIPse...",8012260,499551998,8012260,1618476520,,


In [49]:
# open the row picker, choosing rows by the 'index' column (the row numbers saved in the csv)
# NOTE(review): this reads the csv from /home/pking/get_geo/, while the testing cell
# saved it to /home/pking/outputs/ - confirm which path is intended
browse_and_select_rows('/home/pking/get_geo/GSE226090_metadata.csv', 'index')

interactive(children=(Dropdown(description='selected_item', options=('', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,…

Text(value='./', description='Save Path:', placeholder='./')

Button(description='Save and Download', style=ButtonStyle())

Saved CSV file: /home/pking/outputs/Partial_3_GSE226090_metadata.csv
Downloading supplemental files...

Downloading FASTQ files... this make take a bit...

prefetch SRR23613876 -O /home/pking/outputs/downloaded_files



2023-07-10T23:18:13 prefetch.3.0.5: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.
2023-07-10T23:18:13 prefetch.3.0.5: 1) Downloading 'SRR23613876'...
2023-07-10T23:18:13 prefetch.3.0.5: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.
2023-07-10T23:18:13 prefetch.3.0.5:  Downloading via HTTPS...
2023-07-10T23:19:03 prefetch.3.0.5:  HTTPS download succeed
2023-07-10T23:19:04 prefetch.3.0.5:  'SRR23613876' is valid
2023-07-10T23:19:04 prefetch.3.0.5: 1) 'SRR23613876' was downloaded successfully
2023-07-10T23:19:04 prefetch.3.0.5: 'SRR23613876' has 0 unresolved dependencies


fasterq-dump --outdir /home/pking/outputs/downloaded_files/SRR23613876 /home/pking/outputs/downloaded_files/SRR23613876/SRR23613876.sra

Complete



spots read      : 10,680,471
reads read      : 21,360,942
reads written   : 21,360,942
