In [1]:
import os
import shutil


In [2]:
def find_results_csv(root_folder, prefix='2'):
    """
    Collects every CSV file named 'results_*.csv' that lives in, or anywhere
    below, a folder whose name starts with the given prefix.

    Each matching file is reported exactly once, even when a prefix-matching
    folder is nested inside another prefix-matching folder.

    Args:
        root_folder (str): The root directory path to start searching from
        prefix (str): Prefix for folder names to search in (default: '2')

    Returns:
        list: List of full file paths for matching CSV files. An empty list is
            returned when nothing matches or when the search raises.
    """
    csv_files = []

    try:
        for root, dirs, _files in os.walk(root_folder):
            if not os.path.basename(root).startswith(prefix):
                continue

            # Once we find a folder with prefix, search it and all its subfolders
            for subroot, _subdirs, subfiles in os.walk(root):
                for file in subfiles:
                    if file.startswith('results_') and file.endswith('.csv'):
                        csv_files.append(os.path.join(subroot, file))

            # The whole subtree has just been harvested. Emptying `dirs` in place
            # stops the (top-down) outer walk from descending into it again, which
            # would otherwise append the same files a second time whenever a
            # nested folder also starts with the prefix.
            dirs[:] = []
        return csv_files

    except Exception as e:
        # Best-effort search: report the problem and hand back an empty result.
        print(f"Error occurred while searching: {e}")
        return []

def extract_pd_values(filepath_list):
    """
    Extracts PD values (like PD4841a) from file paths by taking the file name
    and removing 'results_', '.csv' and '+' from it.

    The file name is obtained with os.path.basename so that paths built with
    os.path.join are handled with the platform's own separator rather than a
    hard-coded '/'.

    Args:
        filepath_list (list): List of file paths

    Returns:
        list: List of extracted PD values, in the same order as the input
    """
    pd_values = []
    for path in filepath_list:
        filename = os.path.basename(path)
        # Local is deliberately not called `pd`, the alias this notebook uses
        # for pandas in a later cell.
        sample_id = filename.replace('results_', '').replace('.csv', '').replace('+', '')
        pd_values.append(sample_id)

    return pd_values


def move_csvs_to_target(filepath_list, target_folder, overwrite=True):
    """
    Moves CSV files from source paths into a single target folder.

    Every file keeps its base name, so two sources with the same file name map
    to the same destination. With the default ``overwrite=True`` the later file
    replaces the earlier one (the historical behaviour); pass
    ``overwrite=False`` to leave an existing destination untouched and keep the
    colliding source where it is.

    Failures on individual files are printed and do not stop the remaining
    moves.

    Args:
        filepath_list (list): List of source CSV file paths
        target_folder (str): Destination folder path (created if missing)
        overwrite (bool): Whether an existing destination file may be replaced
            (default: True)
    """
    # Create target folder if it doesn't exist
    os.makedirs(target_folder, exist_ok=True)

    for source_path in filepath_list:
        # Get just the filename from the path
        filename = os.path.basename(source_path)
        # Create destination path
        dest_path = os.path.join(target_folder, filename)

        # Guard against silently clobbering a file that is already in place.
        if not overwrite and os.path.exists(dest_path):
            print(f"Skipped {source_path}: {dest_path} already exists")
            continue

        try:
            shutil.move(source_path, dest_path)
            print(f"Moved {filename} to {target_folder}")
        except Exception as e:
            # Best-effort batch: report and carry on with the next file.
            print(f"Error moving {source_path}: {e}")

# Example usage:
# move_csvs_to_target(filepath_list, "path/to/target/folder")


In [3]:
# Locate the result CSVs stored beneath folders whose name starts with '2'
res_2 = find_results_csv(root_folder='downloads/all/results', prefix='2')
# Derive the identifier embedded in each of those file names
downloads_2 = extract_pd_values(filepath_list=res_2)

# Relocate the collected files into one flat folder
move_csvs_to_target(filepath_list=res_2, target_folder='downloads/2')

Moved results_PD4116a+.csv to downloads/2
Moved results_PD14465a+.csv to downloads/2
Moved results_PD13623a+.csv to downloads/2
Moved results_PD11396a+.csv to downloads/2
Moved results_PD18269a+.csv to downloads/2
Moved results_PD11348a+.csv to downloads/2
Moved results_PD13298a+.csv to downloads/2
Moved results_PD3890a+.csv to downloads/2
Moved results_PD11372a+.csv to downloads/2
Moved results_PD18730a+.csv to downloads/2
Moved results_PD4194a+.csv to downloads/2
Moved results_PD4006a+.csv to downloads/2
Moved results_PD11366a+.csv to downloads/2
Moved results_PD14450a+.csv to downloads/2
Moved results_PD11385a+.csv to downloads/2
Moved results_PD4103a+.csv to downloads/2
Moved results_PD11761a+.csv to downloads/2
Moved results_PD4115a+.csv to downloads/2
Moved results_PD13755a+.csv to downloads/2
Moved results_PD11336a+.csv to downloads/2
Moved results_PD14432a+.csv to downloads/2
Moved results_PD3904a+.csv to downloads/2
Moved results_PD11757a+.csv to downloads/2
Moved results_PD13

In [19]:
from pathlib import Path

def get_vcf_basenames(folder_path, pattern='*.csv'):
    """
    Finds all files matching ``pattern`` in a folder and its subfolders and
    returns their basenames with the final extension removed.

    Despite the function's name, the default pattern selects CSV files: that is
    what the implementation has always globbed for. Pass ``pattern='*.vcf'`` to
    collect VCF files instead.

    Args:
        folder_path (str): Folder to search recursively
        pattern (str): Glob pattern for the file names (default: '*.csv')

    Returns:
        list: Sorted list of basenames without their last extension
    """
    folder = Path(folder_path)
    # rglob also yields directories whose name matches the pattern; keep files only.
    return sorted(f.stem for f in folder.rglob(pattern) if f.is_file())


# Sorted, extension-less names of the CSV files found under downloads/2
basenames_2 = get_vcf_basenames(folder_path='downloads/2')

In [3]:
import pandas as pd

# Load the table stored in the parquet file results_PD4076a+.parquet
df = pd.read_parquet("results/2/results_PD4076a+.parquet")

# Kept as the cell's last expression so the notebook renders the first rows
df.head()

Unnamed: 0,id,wt_prediction,mut_prediction,pred_difference,vcf_id,mirna_accession,gene_id,is_intron,mutation_context,mutsig_key,is_gene_upregulated
0,PD4076a_X_100480810_G_A_MIMAT0000067,0.659048,0.416123,-0.243,PD4076a,MIMAT0000067,ENSG00000102385,True,A[C>T]C,PD4076a_A[C>T]C,True
1,PD4076a_X_100480810_G_A_MIMAT0000082,0.825714,0.47565,-0.35,PD4076a,MIMAT0000082,ENSG00000102385,True,A[C>T]C,PD4076a_A[C>T]C,True
2,PD4076a_X_100480810_G_A_MIMAT0000083,0.854318,0.499602,-0.355,PD4076a,MIMAT0000083,ENSG00000102385,True,A[C>T]C,PD4076a_A[C>T]C,True
3,PD4076a_X_100480810_G_A_MIMAT0000091,0.717076,0.304212,-0.413,PD4076a,MIMAT0000091,ENSG00000102385,True,A[C>T]C,PD4076a_A[C>T]C,True
4,PD4076a_X_100480810_G_A_MIMAT0000094,0.68038,0.444619,-0.236,PD4076a,MIMAT0000094,ENSG00000102385,True,A[C>T]C,PD4076a_A[C>T]C,True
