In [1]:
# import necessary packages
from geofetch import Geofetcher
import pysradb.sraweb
import pandas as pd
import os
import urllib.request

  from tqdm.autonotebook import tqdm


### Function 1 : get metadata

In [21]:
def get_series_files(gse):
    """
    uses geofetch to get supplemental data files using a GSE code
    input: gse (str)
    output: df of supp data file info
    """
    # get supplemental data
    geof = Geofetcher(processed=True, data_source='all', just_metadata=True, discard_soft = True)
    proj = geof.get_projects(gse)
    
    # check if supplemental data exists
    gse_series = gse + '_series'
    if gse_series in proj.keys():
        df = proj[gse_series].sample_table
        # clean columns, names for consistency when concatenating df's
        keep = ['file','sample_name', 'series_sample_organism', 'series_sample_id', 'file_url', 'series_overall_design','series_type']
        df= df[keep]
        return df.rename(columns={'series_sample_organism': 'organism_name',
                                 'sample_name':'title',
                                 'series_sample_id':'GSM',
                                 'series_overall_design':'description'}).reset_index(drop=True)
    else:
        return pd.DataFrame()

In [22]:
def get_descriptions(gse):
    """
    Gets a compliation of metadata for selection
    input: gse (str)
    output: df & downloads .csv
    """
    ### input validation ###
    if (not type(gse) is str) or (gse[:3] != 'GSE'):
        return "input must be a string starting with GSE followed by numbers"
    ### end of validation ###
    
    # gets relevant information
    geo_helper = pysradb.sraweb.SRAweb()
    gsm_srx = geo_helper.gse_to_gsm(gse)
    srp = geo_helper.gse_to_srp(gse)['study_accession'][0] # gets the srp code
    df = geo_helper.sra_metadata(srp) # contains meta info of data for gse
    output = pd.merge(gsm_srx, df)
    
    # column and name cleaning for consistency for easier concat of df's
    cols_keep = ['run_accession', 'experiment_title', 'organism_name', 'experiment_alias', 'experiment_desc','total_spots', 'total_size', 'run_total_spots','run_total_bases']
    output =  output[cols_keep].rename(columns={'run_accession':'file',
                                'experiment_title':'title', 
                                 'experiment_desc': 'description',
                                'experiment_alias' : 'GSM'})
    series_df = get_series_files(gse)
    if series_df.shape != (0,0):
        # concat srr (fastq) and supp series data frames
        output = pd.concat([output, series_df], axis=0).reset_index(drop=True)
    
    # turn output into csv, download it
    where_saved = os.getcwd()
    filename = gse + '_metadata.csv'
    output.to_csv(filename)
    print('A .csv version of this dataframe has been saved as {} at {}'.format(filename, where_saved))

    return output
    

### Function 2: download data from metadata

To do: optimize fasterq-dump speed

In [23]:
def download_files(gse, idx_list):
    """
    downloads file using the gse from above function, and a list of the indexes wish to download
    inputs: idx_list (list of ints that correspond to index) and gse code (str)
    outputs: downloads files, no output
    """
    
    ### input validation ###
    # check data types
    if not type(idx_list) is list:
        return "Cannot run: Inputs must be lists of ints"
    if not all(type(x) is int for x in idx_list):
        return "Cannot run: Inputs must be lists of ints"
    # check gse input
    if (not type(gse) is str) or (gse[:3] != 'GSE'):
        return "input must be a string starting with GSE followed by numbers"
    csv_fp = os.path.join(os.getcwd(), gse + '_metadata.csv')
    if not os.path.exists(csv_fp):
        return "Cannot run: csv path for this gse does not exist"
    df = pd.read_csv(csv_fp, index_col=0)
    # check index list input
    if not all(x in df.index.to_list() for x in idx_list):
        return "Cannot run: Invalid index"
    ### end of validation ###
    
    
    # split btw fastq and supplemental files
    lst = df['file'].iloc[idx_list]
    srr_to_download = lst[lst.apply(lambda x: 'SRR' in x)].to_list()
    supp_to_download = lst[~lst.apply(lambda x: 'SRR' in x)].index.to_list()
    
    # makes folder for files to download
    download_path = os.path.join(os.getcwd(), 'downloaded_files')
    try: 
        os.mkdir(download_path)
    except:
        pass
    
    ### downloading supplemental files ###
    print('Downloading supplemental files...')
    for supp_idx in supp_to_download:
        download_link = 'https://' + df.iloc[supp_idx]['file_url'].split("//")[-1]
        target_directory = os.getcwd()
        file_name = os.path.join('downloaded_files', download_link.split("/")[-1])
        urllib.request.urlretrieve(download_link, file_name)

    ### downloading fastq files ###
    print('Downloading FASTQ files... this make take a bit...')
    for srr in srr_to_download:
        # prefetch srr files for quicker download
        parent_path = os.getcwd()
        download_path = os.path.join(parent_path, 'downloaded_files')
        pf_cmd = 'prefetch {} -O {}'.format(srr, download_path)
        print(pf_cmd)
        os.system(pf_cmd)
        # download fastq
        path = os.path.join(download_path,os.path.join(srr, srr+'.sra'))
        fastq_cmd = 'fasterq-dump --outdir {} {}'.format(os.path.join(download_path, srr), path)
            # increase --threads to speed up performance?
        print(fastq_cmd)
        os.system(fastq_cmd)

    print('Complete')

In [24]:
from IPython.display import display

def print_pretty_table(csv_file):
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)
    
    # Display the DataFrame using IPython display
    display(df)


#### Testing functions using Glass SALL1 data

In [25]:
# this example uses SALL1 data
sal1_gse = 'GSE226090'
output = get_descriptions(sal1_gse);


Metadata folder: /home/pking/get_geo/project_name
Trying GSE226090 (not a file) as accession...
Trying GSE226090 (not a file) as accession...
Skipped 0 accessions. Starting now.
[38;5;200mProcessing accession 1 of 1: 'GSE226090'[0m
Total number of processed SERIES files found is: 20
Expanding metadata list...
Expanding metadata list...


Finished processing 1 accession(s)
Cleaning soft files ...
No files found. No data to save. File /home/pking/get_geo/project_name/GSE226090_samples/GSE226090_samples.csv won't be created
Unifying and saving of metadata... 


A .csv version of this dataframe has been saved as GSE226090_metadata.csv at /home/pking/get_geo


In [26]:
output.head()

Unnamed: 0,file,title,organism_name,GSM,description,total_spots,total_size,run_total_spots,run_total_bases,file_url,series_type
0,SRR23613876,"GSM7063497: PU1+ nuclei, H3K27ac Input ChIPseq...",Mus musculus,GSM7063497,"GSM7063497: PU1+ nuclei, H3K27ac Input ChIPseq...",10680471,656843723,10680471,2157455142,,
1,SRR23613877,"GSM7063496: PU1+ nuclei, H3K27ac Input ChIPseq...",Mus musculus,GSM7063496,"GSM7063496: PU1+ nuclei, H3K27ac Input ChIPseq...",9623829,594129205,9623829,1944013458,,
2,SRR23613880,"GSM7063493: Olig2+ nuclei, H3K27ac Input ChIPs...",Mus musculus,GSM7063493,"GSM7063493: Olig2+ nuclei, H3K27ac Input ChIPs...",7819623,481494974,7819623,1579563846,,
3,SRR23613882,"GSM7063492: Olig2+ nuclei, H3K27ac Input ChIPs...",Mus musculus,GSM7063492,"GSM7063492: Olig2+ nuclei, H3K27ac Input ChIPs...",10676675,661452028,10676675,2156688350,,
4,SRR23613885,"GSM7063489: NeuN+ nuclei, H3K27ac Input ChIPse...",Mus musculus,GSM7063489,"GSM7063489: NeuN+ nuclei, H3K27ac Input ChIPse...",8012260,499551998,8012260,1618476520,,


In [27]:
output.tail()

Unnamed: 0,file,title,organism_name,GSM,description,total_spots,total_size,run_total_spots,run_total_bases,file_url,series_type
76,GSE226090_WT_P300_idr_peak.peak.txt.gz,gse226090_wt_p300_idr_peak_peak_txt_gz,Mus musculus,"GSM7063466, GSM7063467, GSM7063468, GSM7063469...","Poly-A RNA-seq, ATAC-seq, and H3K27ac-ChIP-seq...",,,,,ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE226nn...,Genome binding/occupancy profiling by high thr...
77,GSE226090_WT_Pu1_H3K27ac_rep1.txt.gz,gse226090_wt_pu1_h3k27ac_rep1_txt_gz,Mus musculus,"GSM7063466, GSM7063467, GSM7063468, GSM7063469...","Poly-A RNA-seq, ATAC-seq, and H3K27ac-ChIP-seq...",,,,,ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE226nn...,Genome binding/occupancy profiling by high thr...
78,GSE226090_WT_Pu1_H3K27ac_rep2.txt.gz,gse226090_wt_pu1_h3k27ac_rep2_txt_gz,Mus musculus,"GSM7063466, GSM7063467, GSM7063468, GSM7063469...","Poly-A RNA-seq, ATAC-seq, and H3K27ac-ChIP-seq...",,,,,ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE226nn...,Genome binding/occupancy profiling by high thr...
79,GSE226090_WT_SALL1_idr_peaks.txt.gz,gse226090_wt_sall1_idr_peaks_txt_gz,Mus musculus,"GSM7063466, GSM7063467, GSM7063468, GSM7063469...","Poly-A RNA-seq, ATAC-seq, and H3K27ac-ChIP-seq...",,,,,ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE226nn...,Genome binding/occupancy profiling by high thr...
80,GSE226090_WT_SMAD4_idr_peaks.txt.gz,gse226090_wt_smad4_idr_peaks_txt_gz,Mus musculus,"GSM7063466, GSM7063467, GSM7063468, GSM7063469...","Poly-A RNA-seq, ATAC-seq, and H3K27ac-ChIP-seq...",,,,,ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE226nn...,Genome binding/occupancy profiling by high thr...


In [63]:
!pip install traitlets
import pandas as pd
import os
import sys
from IPython.display import display
from ipywidgets import interact, Dropdown, Button, Text
#from IPython.utils import traitlets
import traitlets
from traitlets import dlink


[0m

In [64]:

def browse_and_select_rows(csv_file, column_name):
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)
    df = df.rename(columns={'Unnamed: 0':'index'})

    # Create a dropdown widget to select items from the column
    options_list = [''] +df[column_name].unique().tolist()
    dropdown = Dropdown(options= options_list)
    
    # Create an input text widget for the save path
    save_path_input = Text(value='./', description='Save Path:')
    
    # Create a button widget for saving and downloading
    button = Button(description='Save and Download')
    
    # Enable tab-completion for the save path input
    traitlets.dlink((save_path_input, 'value'), (save_path_input, 'placeholder'))
    
    # Create an empty DataFrame to store selected rows
    selected_rows_df = pd.DataFrame()
    
    # Function to display filtered rows based on the selected item
    def display_filtered_rows(selected_item):
        nonlocal selected_rows_df
        
        # Filter the DataFrame based on the selected item
        filtered_df = df[df[column_name] == selected_item]
        
        # Append the filtered rows to the selected rows DataFrame
      
        selected_rows_df = pd.concat([selected_rows_df, filtered_df], ignore_index=True)
        
        
        # Display the selected rows
        display(filtered_df)
        
        # Print the growing list of selected rows
        print("Selected Rows:")
        for i in range(len(selected_rows_df)):
            row_entry = str(selected_rows_df.loc[i, column_name])[:60] + '...' if len(str(selected_rows_df.loc[i, column_name])) > 60 else str(selected_rows_df.loc[i, column_name])
            print(f"Row {i+1}: {row_entry}")
        print()  # Print an empty line for separation
        
    # Function to handle the save and download button click event
    def save_and_download(_):
        nonlocal selected_rows_df
        
        # Check if any rows are selected
        if selected_rows_df.empty:
            print("No rows are selected.")
            return
        
        # Get the save path directory from the input text widget
        save_directory = save_path_input.value
        
        # Get the directory and file part of the original CSV file path
        directory = os.path.dirname(csv_file)
        file_name = os.path.basename(csv_file)
        file_part = os.path.splitext(file_name)[0]
        file_extension = os.path.splitext(file_name)[1]
        
        # Find the next available k for the Partial_k file name in the directory
        
        # Find the next available k for the Partial_k file name in the directory
        k = get_maximum_k(save_directory)
        # Create the Partial_k file name
        partial_k_filename = f"Partial_{k+1}_{file_part}.csv"
        

        # Combine the save directory, partial_k file name, and save path to form the final save path
        save_path = os.path.join(save_directory, partial_k_filename)
        
        # Write the selected rows to a new CSV file
        selected_rows_df.to_csv(save_path, index=False)
        
        # Print the save path and exit the program
        print(f"Saved CSV file: {save_path}")
        
        # Clear the selected rows DataFrame for the next iteration
        selected_rows_df = pd.DataFrame()
    
    # Connect the dropdown widget to the display function
    interact(display_filtered_rows, selected_item=dropdown)
    
    # Connect the button widget to the save and download function
    button.on_click(save_and_download)
    
    # Display the input text widget and button widget
    display(save_path_input, button) 

In [65]:
# usage example

In [16]:
browse_and_select_rows('/home/pking/get_geo/GSE226090_metadata.csv', 'title')

interactive(children=(Dropdown(description='selected_item', options=('', 'GSM7063497: PU1+ nuclei, H3K27ac Inp…

Text(value='./', description='Save Path:', placeholder='./')

Button(description='Save and Download', style=ButtonStyle())

FileNotFoundError: [Errno 2] No such file or directory: './listsl/'

Saved CSV file: ./lists/Partial_26_GSE226090_metadata.csv


In [67]:
browse_and_select_rows('/home/pking/get_geo/GSE226090_metadata.csv', 'index')

interactive(children=(Dropdown(description='selected_item', options=('', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,…

Text(value='./', description='Save Path:', placeholder='./')

Button(description='Save and Download', style=ButtonStyle())