In [1]:
from pathlib import Path
import pandas as pd

def merge_multisite(dataset):
    paths = Path(f"data/{dataset}").glob("*/*.tsv")
    df_dataset = pd.DataFrame()
    for f in paths:
        task = f.parts[-1].split("_")[0].split("-")[-1]
        df = pd.read_csv(f, sep="\t", index_col=0, converters={'participant_id': str})
        df["dataset"] = dataset
        df["site"] = f.parts[-2]
        df["task"] = task
        df_dataset = pd.concat([df_dataset, df])
    df_dataset.to_csv(f"data/{dataset}_qcreport.tsv", sep="\t")
    return df_dataset


def merge_multisubject(dataset):
    paths = Path(f"data/{dataset}").glob("*/*/task-*_report.tsv")
    df_dataset = pd.DataFrame()
    for f in paths:
        task = f.parts[-1].split("_")[0].split("-")[-1]
        df = pd.read_csv(f, sep="\t", index_col=0, converters={'participant_id': str})
        df["dataset"] = dataset
        df["task"] = task
        df_dataset = pd.concat([df_dataset, df])
    df_dataset.to_csv(f"data/{dataset}_qcreport.tsv", sep="\t")
    return df_dataset


def merge_multitask(dataset):
    paths = Path(f"data/{dataset}").glob("task-*_report.tsv")
    df_dataset = pd.DataFrame()
    for f in paths:
        task = f.parts[-1].split("_")[0].split("-")[-1]
        df = pd.read_csv(f, sep="\t", index_col=0, converters={'participant_id': str})
        df["dataset"] = dataset
        df["task"] = task
        df_dataset = pd.concat([df_dataset, df])
    df_dataset.to_csv(f"data/{dataset}_qcreport.tsv", sep="\t")
    return df_dataset

# abide1 = merge_multisite("abide1")
# abide2 = merge_multisite("abide2")
# adhd200 = merge_multisite("adhd200")
# cimaq = merge_multisubject("cimaq")
# oasis3 = merge_multisubject("oasis3")
# adni = merge_multisubject("adni")
# srpbs = merge_multisubject("srpbs")
# ds000030 = merge_multitask("ds000030")
# hcpep = merge_multisubject("hcpep") 
# compassnd = merge_multisubject("compassnd")

In [2]:
import shutil

#shutil.copy("data/cobre/task-rest_report.tsv", "data/cobre_qcreport.tsv")
#shutil.copy("data/ukbb/task-rest_scrubbing-0.5_report.tsv", "data/ukbb_qcreport.tsv")

In [10]:
import seaborn as sns

data = Path("data").glob("*.tsv")

df = pd.DataFrame()
for f in data:
    tmp = pd.read_csv(f, sep="\t", index_col=0, converters={'participant_id': str})
    if "ukbb" in f.name:
        tmp["dataset"] = "ukbb"
        tmp = tmp.drop(columns=['Unnamed: 0'])
        tmp.set_index('identifier', inplace=True) # handled slightly differently due to extra unnamed column
    if "cobre" in f.name:
        tmp["dataset"] = "cobre"
    df = pd.concat([df, tmp])
rest_df = df[df["task"].str.contains("rest")]

In [11]:
rest_df["dataset"].unique()

array(['abide1', 'adni', 'oasis3', 'cobre', 'cimaq', 'adhd200', 'hcpep',
       'srpbs', 'ds000030', 'compassnd', 'ukbb', 'abide2'], dtype=object)

In [12]:
rest_df[["dataset", "pass_func_qc", "proportion_kept", "mean_fd_raw"]].groupby("dataset").mean()

Unnamed: 0_level_0,pass_func_qc,proportion_kept,mean_fd_raw
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
abide1,0.945374,0.937643,0.199775
abide2,0.930831,0.926014,0.231814
adhd200,0.908692,0.912412,0.332399
adni,0.701646,0.899972,0.247926
cimaq,0.85974,0.932282,0.228227
cobre,0.824324,0.798953,0.360495
compassnd,0.875635,0.931864,0.245436
ds000030,0.967871,0.939766,0.187552
hcpep,0.983075,0.974867,0.175951
oasis3,0.74954,0.874184,0.296304


In [13]:
rest_df.loc[rest_df['dataset'] == 'adni', 'ses'] = rest_df.loc[rest_df['dataset'] == 'adni', 'ses'].astype(str).str.replace(r'\.0$', '') # at some point adni sessions became floats

  rest_df.loc[rest_df['dataset'] == 'adni', 'ses'] = rest_df.loc[rest_df['dataset'] == 'adni', 'ses'].astype(str).str.replace(r'\.0$', '') # at some point adni sessions became floats


In [14]:
rest_df['ses'] = rest_df['ses'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rest_df['ses'] = rest_df['ses'].astype(str)


In [15]:
rest_df = rest_df[rest_df['task'] == 'rest'] # because OASIS3 restMB4 scans were processed but we don't want those

In [16]:
rest_df.to_csv('rest_df.tsv', sep="\t") 