In [103]:
import pandas as pd
import json

from cmu import cmu_bldg_path
from utils import *

In [55]:
def pmid2pmcid(pid):
    url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids={pid}&format=json"
    response = requests.get(url)
    if response.status_code == 200:
        response = response.json()
        if 'records' in response.keys() and  'pmcid' in response['records'][0].keys():
            return response['records'][0]['pmcid']
        else:
            return None
    else:
        return None

In [85]:
def get_raw_data_identifiers(pmcid):
    europmc_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/PMC/{pmcid}/datalinks?format=json"
    response = requests.get(europmc_url)
    if response.status_code != 200:
        return None
    response = response.json()
    return process_epmc_response(response)

def process_epmc_response(response):
    raw = list()
    if 'dataLinkList' in response.keys():
        category = response['dataLinkList']['Category']    
        for c in category:
            nl = c['NameLong'] if 'NameLong' in c.keys() else ''
            raw.append({c['Name']: [nl, c['Section'][0]['Linklist']['Link'][0]['Target']['Identifier']['ID']]})
    return raw
    
def get_pmids_from_cm():
    url = "https://www.cancermodels.org/api/model_metadata?select=model_id,data_source,provider_name,pubmed_ids"
    df = pd.read_json(url)
    df = df[~df['pubmed_ids'].isna()]
    df['pubmed_ids'] = df['pubmed_ids'].str.split(',')
    df = df.explode('pubmed_ids')
    df['pubmed_ids'] = df['pubmed_ids'].str.strip()
    df = df.drop_duplicates('pubmed_ids')
    df = df[df['pubmed_ids'] != ""]
    df['pmid'] = [pmid.replace('PMID: ', '').replace('PMID:', '') for pmid in df['pubmed_ids']]
    df = df.drop_duplicates('pmid')
    return df

def get_raw_data_ids_for_cm_publications():
    df = get_pmids_from_cm()
    fetched_df = pd.read_csv('raw_identifiers copy.tsv', sep='\t')
    df = df[[False if int(pid) in fetched_df.pmid.unique() else True for pid in df['pmid']]]
    publication_and_raw_data_identifier = list()
    for _, row in df.iterrows():
        pmcid = pmid2pmcid(row['pmid'])
        if pmcid is not None:
            raw = get_raw_data_identifiers(pmcid)
        else:
            raw = None
        publication_and_raw_data_identifier.append([row['data_source'], row['provider_name'], row['pubmed_ids'], row['pmid'], raw])
        publication_and_raw_data_identifier_df = pd.DataFrame(publication_and_raw_data_identifier, columns=['provider','provider_full_name','pubmed_id','pmid','raw'])
        publication_and_raw_data_identifier_df.to_csv('raw_identifiers.tsv', index=False, sep='\t')
    pd.concat([fetched_df, publication_and_raw_data_identifier_df]).to_csv('raw_identifiers.tsv', index=False, sep='\t')
    return pd.concat([fetched_df, publication_and_raw_data_identifier_df])


In [86]:
df  = get_raw_data_ids_for_cm_publications()

In [109]:
raw_identifiers =  pd.read_csv('raw_identifiers copy 2.tsv', sep='\t')

In [113]:
raw_identifiers = df[~df['raw'].isna()]
raw_identifiers = raw_identifiers[[True if r!= '[]' else False for r in raw_identifiers['raw']]]

In [119]:
def string_to_list(value):
    # Replace single quotes with double quotes to make it JSON-compatible
    if isinstance(value, list):
        return value
    value = value.replace("'", '"').replace('Burkitt"s', "Burkitt's")
    # Convert JSON string to Python list
    return json.loads(value)

# Apply the conversion to the column
raw_identifiers['Parsed_Info'] = raw_identifiers['raw'].apply(string_to_list)

In [187]:
raw_identifiers

Unnamed: 0,provider,provider_full_name,pubmed_id,pmid,raw,Parsed_Info
2,CMP,Cell Model Passports,PMID: 26011428,26011428,[{'BioStudies: supplemental material and suppo...,[{'BioStudies: supplemental material and suppo...
4,CMP,Cell Model Passports,PMID: 26589293,26589293,"[{'European Genome-Phenome Archive': ['', 'EGA...","[{'European Genome-Phenome Archive': ['', 'EGA..."
5,CMP,Cell Model Passports,PMID: 22460905,22460905,"[{'GEO': ['GEO - Gene Expression Omnibus', 'GS...","[{'GEO': ['GEO - Gene Expression Omnibus', 'GS..."
7,CMP,Cell Model Passports,PMID: 31068700,31068700,"[{'BioProject': ['BioProject', 'PRJNA523380']}...","[{'BioProject': ['BioProject', 'PRJNA523380']}..."
9,CMP,Cell Model Passports,PMID: 30894373,30894373,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com..."
...,...,...,...,...,...,...
1793,CMP,Cell Model Passports,PMID: 26719794,26719794,[{'IGSR | samples - 1000 Genomes': ['IGSR: The...,[{'IGSR | samples - 1000 Genomes': ['IGSR: The...
1797,CMP,Cell Model Passports,PMID: 12671075,12671075,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com..."
1809,CMP,Cell Model Passports,PMID: 2474525,2474525,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com..."
1811,CMP,Cell Model Passports,PMID: 32847597,32847597,[{'RefSeq': ['RefSeq - NCBI Reference Sequence...,[{'RefSeq': ['RefSeq - NCBI Reference Sequence...


In [136]:
raw_url_resources = set()
raw_url_resources_dict = dict()
for _, row in raw_identifiers.iterrows():
    for raw in row['Parsed_Info']:
        raw_url_resources.add(list(raw.keys())[0])
        if list(raw.keys())[0] not in raw_url_resources_dict.keys():
            raw_url_resources_dict[list(raw.keys())[0]] = list(raw.values())[0][0]   

In [181]:
updated_dict = {
    'BioStudies: supplemental material and supporting data': 'BioStudies',
    'Cellosaurus': 'Cellosaurus',
    'Genes & Proteins': 'G&P',
    'European Genome-Phenome Archive': 'EGA',
    'Altmetric': 'Altmetric',
    'GEO': 'GEO',
    'Faculty Opinions': 'F1000',
    'Functional Genomics Experiments': 'FGE', # https://www.ebi.ac.uk/biostudies/arrayexpress/studies/
    'Nucleotide Sequences': 'ENA',
    'Ximbio': 'Ximbio',
    'BioProject': 'BioProject',
    'Clinical Trials': 'CT', # https://clinicaltrials.gov/study/
    'dbGaP': 'dbGaP',
    'ProteomeXchange': 'ProteomeX',
    'Data Citations': 'DataCite',
    'HAL Open Archive': 'HAL',
    'COSMIC': 'COSMIC',
    'ENCODE: Encyclopedia of DNA Elements': 'ENCODE',
    'GenomeRNAi': 'RNAi',
    'GlyGen glycoinformatics resource': 'GlyGen',
    'Diseases': 'Diseases',
    'Rat Genome Database': 'RGD',
    'GCA': 'GCA',
    'Wikipedia': 'Wiki',
    'Proteomics Data': 'PRIDE',
    'IGSR | samples - 1000 Genomes': 'IGSR', # https://www.internationalgenome.org/data-portal/sample/
    'RefSeq': 'RefSeq',
    'HPA': 'HPA',
    'SNPs': 'SNPs',
    'Ensembl': 'Ensembl',
    'FlyBase': 'FlyBase',
    'iPTMnet': 'iPTMnet',
    'Linköping University Digital Archive': 'LiU',
    'Quick GO': 'QuickGO',
    'Related Immune Epitope Information - Immune Epitope Database and Analysis Resource': 'IEDB', # http://www.iedb.org/pmId/
    'RRID': 'RRID',
    'Mouse Genome Informatics (MGI)': 'MGI',
    'Publons': 'Publons',
    'Protein Structures': 'PDBe',
    'SIGNOR': 'SIGNOR',
    'Dryad Data Platform': 'Dryad',
    'WikiPathways': 'WikiPathways',
    'Chemicals': 'ChEBI',
    'Protein Families': 'InterPro',
    'MetaboLights': 'MetaboLights',
    'Protein Interactions': 'IntAct',
    'GOA Project': 'GOA',
    'Kudos': 'Kudos',
    'ZFIN': 'ZFIN',
    'Pfam': 'Pfam'
}

In [182]:
def process_identifier_values(value):
    value = (value.replace('https://www.altmetric.com/details/', '') # Altmetric
             .replace('http://www.ebi.ac.uk/biostudies/studies/', '') # Biostudies
             .replace('?xr=true', '').replace('https://www.cellosaurus.org/', '') # Cellosaurus
             .replace('http://encodeproject.org/publications/', '') # ENCODE
             .replace('https://rgd.mcw.edu/rgdweb/report/reference/main.html?id=', '') # RGD
             .replace('https://www.glygen.org/publication/MED/', '') # GlyGen
             .replace('http://www.iedb.org/pmId/', '') # IEDB
             
             )
    return value

skip_resources = ['F1000', 'Wiki', 'FlyBase', 'HAL', 'RNAi', 'Diseases', 'GCA', 'Ensembl', 'iPTMnet', 'LiU', 'QuickGO', 'Publons', 'PDBe', 'SIGNOR', 'Pfam', 'ZFIN', 'Kudos', 'WikiPathways', 'ChEBI', 'InterPro', 'MetaboLights', 'IntAct','GOA']
processed_identifiers = pd.DataFrame()
for _, row in raw_identifiers.iterrows():
    identifier_string = ''
    for raw in row['Parsed_Info']:
        key = list(raw.keys())[0]
        resource_id = updated_dict[key]
        if resource_id in skip_resources :
            continue
        value = list(raw.values())[0][1]
        row[resource_id] = value
        value = process_identifier_values(value)
        identifier_string = f"{identifier_string}, {resource_id}: {value}"
    identifier_string = identifier_string[2:]
    row['identifiers'] = identifier_string
    processed_identifiers = pd.concat([processed_identifiers, pd.DataFrame(row).transpose()], ignore_index=True)
processed_identifiers

Unnamed: 0,provider,provider_full_name,pubmed_id,pmid,raw,Parsed_Info,BioStudies,Cellosaurus,G&P,identifiers,...,RGD,PRIDE,IGSR,RefSeq,HPA,SNPs,IEDB,RRID,MGI,Dryad
0,CMP,Cell Model Passports,PMID: 26011428,26011428,[{'BioStudies: supplemental material and suppo...,[{'BioStudies: supplemental material and suppo...,http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=26011428,Q96TC6,"BioStudies: S-EPMC4556387, Cellosaurus: search...",...,,,,,,,,,,
1,CMP,Cell Model Passports,PMID: 26589293,26589293,"[{'European Genome-Phenome Archive': ['', 'EGA...","[{'European Genome-Phenome Archive': ['', 'EGA...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=26589293,,"EGA: EGAD00001000725, Altmetric: 4788089, BioS...",...,,,,,,,,,,
2,CMP,Cell Model Passports,PMID: 22460905,22460905,"[{'GEO': ['GEO - Gene Expression Omnibus', 'GS...","[{'GEO': ['GEO - Gene Expression Omnibus', 'GS...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=22460905,,"GEO: GSE36139, Altmetric: 670961, BioStudies: ...",...,,,,,,,,,,
3,CMP,Cell Model Passports,PMID: 31068700,31068700,"[{'BioProject': ['BioProject', 'PRJNA523380']}...","[{'BioProject': ['BioProject', 'PRJNA523380']}...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=31068700,,"BioProject: PRJNA523380, EGA: EGAD00001001039,...",...,,,,,,,,,,
4,CMP,Cell Model Passports,PMID: 30894373,30894373,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=30894373,,"Altmetric: 57484396, BioStudies: S-EPMC6445675...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1070,CMP,Cell Model Passports,PMID: 26719794,26719794,[{'IGSR | samples - 1000 Genomes': ['IGSR: The...,[{'IGSR | samples - 1000 Genomes': ['IGSR: The...,http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/CVCL_0291,,"IGSR: GM12878, DataCite: 10.5524/100165, Altme...",...,,,GM12878,,,,,,,
1071,CMP,Cell Model Passports,PMID: 12671075,12671075,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com...",,https://www.cellosaurus.org/CVCL_0291,,"Altmetric: 101954918, Cellosaurus: CVCL_0291",...,,,,,,,,,,
1072,CMP,Cell Model Passports,PMID: 2474525,2474525,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com...",,https://www.cellosaurus.org/CVCL_3082,,"Altmetric: 41935927, Cellosaurus: CVCL_3082",...,,,,,,,,,,
1073,CMP,Cell Model Passports,PMID: 32847597,32847597,[{'RefSeq': ['RefSeq - NCBI Reference Sequence...,[{'RefSeq': ['RefSeq - NCBI Reference Sequence...,http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=32847597,Q3UH37,"RefSeq: NM_014293.4, Altmetric: 89085732, BioS...",...,,,,NM_014293.4,,,,,http://www.informatics.jax.org/reference/32847597,


In [186]:
processed_identifiers[['provider', 'provider_full_name', 'pubmed_id', 'pmid', 'identifiers',
                       'BioStudies', 'Cellosaurus', 'G&P', 'EGA',
                       'Altmetric', 'GEO', 'FGE', 'ENA', 'Ximbio', 'BioProject', 'CT',
                       'dbGaP', 'ProteomeX', 'DataCite', 'COSMIC', 'ENCODE', 'GlyGen', 'RGD',
                       'PRIDE', 'IGSR', 'RefSeq', 'HPA', 'SNPs', 'IEDB', 'RRID', 'MGI', 'Dryad']].to_csv("processed_raw_identifiers.tsv", index=False, sep='\t')

In [188]:
processed_identifiers

Unnamed: 0,provider,provider_full_name,pubmed_id,pmid,raw,Parsed_Info,BioStudies,Cellosaurus,G&P,identifiers,...,RGD,PRIDE,IGSR,RefSeq,HPA,SNPs,IEDB,RRID,MGI,Dryad
0,CMP,Cell Model Passports,PMID: 26011428,26011428,[{'BioStudies: supplemental material and suppo...,[{'BioStudies: supplemental material and suppo...,http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=26011428,Q96TC6,"BioStudies: S-EPMC4556387, Cellosaurus: search...",...,,,,,,,,,,
1,CMP,Cell Model Passports,PMID: 26589293,26589293,"[{'European Genome-Phenome Archive': ['', 'EGA...","[{'European Genome-Phenome Archive': ['', 'EGA...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=26589293,,"EGA: EGAD00001000725, Altmetric: 4788089, BioS...",...,,,,,,,,,,
2,CMP,Cell Model Passports,PMID: 22460905,22460905,"[{'GEO': ['GEO - Gene Expression Omnibus', 'GS...","[{'GEO': ['GEO - Gene Expression Omnibus', 'GS...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=22460905,,"GEO: GSE36139, Altmetric: 670961, BioStudies: ...",...,,,,,,,,,,
3,CMP,Cell Model Passports,PMID: 31068700,31068700,"[{'BioProject': ['BioProject', 'PRJNA523380']}...","[{'BioProject': ['BioProject', 'PRJNA523380']}...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=31068700,,"BioProject: PRJNA523380, EGA: EGAD00001001039,...",...,,,,,,,,,,
4,CMP,Cell Model Passports,PMID: 30894373,30894373,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com...",http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=30894373,,"Altmetric: 57484396, BioStudies: S-EPMC6445675...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1070,CMP,Cell Model Passports,PMID: 26719794,26719794,[{'IGSR | samples - 1000 Genomes': ['IGSR: The...,[{'IGSR | samples - 1000 Genomes': ['IGSR: The...,http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/CVCL_0291,,"IGSR: GM12878, DataCite: 10.5524/100165, Altme...",...,,,GM12878,,,,,,,
1071,CMP,Cell Model Passports,PMID: 12671075,12671075,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com...",,https://www.cellosaurus.org/CVCL_0291,,"Altmetric: 101954918, Cellosaurus: CVCL_0291",...,,,,,,,,,,
1072,CMP,Cell Model Passports,PMID: 2474525,2474525,"[{'Altmetric': ['', 'https://www.altmetric.com...","[{'Altmetric': ['', 'https://www.altmetric.com...",,https://www.cellosaurus.org/CVCL_3082,,"Altmetric: 41935927, Cellosaurus: CVCL_3082",...,,,,,,,,,,
1073,CMP,Cell Model Passports,PMID: 32847597,32847597,[{'RefSeq': ['RefSeq - NCBI Reference Sequence...,[{'RefSeq': ['RefSeq - NCBI Reference Sequence...,http://www.ebi.ac.uk/biostudies/studies/S-EPMC...,https://www.cellosaurus.org/search?query=32847597,Q3UH37,"RefSeq: NM_014293.4, Altmetric: 89085732, BioS...",...,,,,NM_014293.4,,,,,http://www.informatics.jax.org/reference/32847597,


In [216]:
identifiers = read_metadata_with_fields('processed_raw_identifiers.tsv')
raw_data_url_df = identifiers[['provider', 'provider_full_name', 'pubmed_id', 'pmid', 'EGA', 'GEO', 'ENA', 'BioProject', 'dbGaP']]
def process_bioproject_ena(row):
    if row['ENA'] != '' and row['BioProject'] != '':
        row['ENA'] = f"{row['ENA']}, {row['BioProject']}"
    elif row['ENA'] == '' and row['BioProject'] != '':
        row['ENA'] = f"{row['BioProject']}"
    return row

raw_data_url_df = raw_data_url_df.fillna('').apply(process_bioproject_ena, axis=1)[['provider', 'provider_full_name', 'pubmed_id', 'pmid', 'EGA', 'GEO', 'ENA', 'dbGaP']]
raw_data_url_df['raw_data_url'] = raw_data_url_df[['EGA', 'GEO', 'ENA', 'dbGaP']].apply(lambda row: ', '.join(filter(None, row)), axis=1)
raw_data_url_df = raw_data_url_df[['provider', 'provider_full_name', 'pubmed_id', 'pmid', 'raw_data_url']]
raw_data_url_df = raw_data_url_df[raw_data_url_df['raw_data_url'] != '']

In [217]:
raw_data_url_df

Unnamed: 0,provider,provider_full_name,pubmed_id,pmid,raw_data_url
1,CMP,Cell Model Passports,PMID: 26589293,26589293,EGAD00001000725
2,CMP,Cell Model Passports,PMID: 22460905,22460905,"GSE36139, PRJNA523380"
3,CMP,Cell Model Passports,PMID: 31068700,31068700,"EGAD00001001039, PRJNA523380, PRJNA523380"
5,SJCRH,St. Jude Children's Research Hospital,PMID: 28854174,28854174,EGAS00001002528
6,GCCRI,UT Health San Antonio Greehey Children's Cance...,PMID: 31693904,31693904,phs000469
...,...,...,...,...,...
1040,IRCCS-HSM-GE,IRCCS Ospedale Policlinico San Martino,PMID: 35819446,35819446,GSE40533
1057,CMP,Cell Model Passports,PMID: 33525507,33525507,GSE89413
1065,CMP,Cell Model Passports,PMID: 33317567,33317567,"GSE41964, PRJNA178621"
1067,CMP,Cell Model Passports,PMID: 34707142,34707142,"GSE184764, PRJNA766076"


In [254]:
def get_pmids_from_cm_for_merge():
    url = "https://www.cancermodels.org/api/model_metadata?select=model_id,data_source,provider_name,pubmed_ids"
    df = pd.read_json(url)
    df = df[~df['pubmed_ids'].isna()]
    df['pubmed_ids'] = df['pubmed_ids'].str.replace('PMID: ', 'PMID:').str.strip()
    return df
'''
    df['pubmed_ids'] = df['pubmed_ids'].str.split(',')
    df = df.explode('pubmed_ids')
    df['pubmed_ids'] = df['pubmed_ids'].str.strip()
    #df = df.drop_duplicates('pubmed_ids')
    df = df[df['pubmed_ids'] != ""]
    df['pmid'] = [pmid.replace('PMID: ', '').replace('PMID:', '') for pmid in df['pubmed_ids']]
    #df = df.drop_duplicates('pmid')
    return df
'''
cm_model_pub = get_pmids_from_cm_for_merge()
#cm_model_pub.pmid = cm_model_pub.pmid.astype(int)
providers = sorted(raw_data_url_df.provider.unique())

In [302]:
def get_rdu_from_pmid(pmid, df):
    pmids = pmid.split(',')
    rdu = ''
    for pm in pmids:
        pm = pm.replace('PMID:', '')
        if pm!= '':
            pm = int(pm)
            t = df[df['pmid'] == pm].reset_index(drop=True)
            if t.shape[0] > 0:
                rdu = f"{rdu}, {t['raw_data_url'][0]}"
    return ", ".join(dict.fromkeys(rdu[1:].split(", ")))
        
    

for provider in providers:
    if provider == 'CMP':
        continue
    temp = cm_model_pub[cm_model_pub['data_source'] == provider]
    temp['raw_data_url'] = ''
    raw_data_temp = raw_data_url_df[raw_data_url_df['provider'] == provider]
    if provider == 'CCC':
        provider = 'TTUHSC'
    if exists(join(home, provider, f'{provider}_molecular_metadata-sample.tsv')):
        sample = read_metadata_with_fields(join(home, provider, f'{provider}_molecular_metadata-sample.tsv'))
        platform = read_metadata_without_fields(join(home, provider, f'{provider}_molecular_metadata-platform.tsv'))[['platform_id', 'molecular_characterisation_type']]
        merged_sample = sample.merge(platform, on='platform_id', how='left').fillna('')
        for _, r in temp.iterrows():
            rdu = get_rdu_from_pmid(r['pubmed_ids'], raw_data_temp)
            temp.at[_, 'raw_data_url'] = rdu
        data_types = ['mutation', 'expression', 'copy number alteration']
        for _, r in merged_sample.iterrows():
            if r['molecular_characterisation_type'] in data_types:
                url = r['raw_data_url']
                df = temp[temp['model_id'] == r['model_id']].reset_index(drop=True)
                if df.shape[0] > 0 and df['raw_data_url'][0] != '':
                    if url == '':
                        url = df['raw_data_url'][0]
                    else:
                        url = f"{url}, {df['raw_data_url'][0]}"
                    url = ", ".join(dict.fromkeys(url.split(", ")))
                merged_sample.at[_, 'raw_data_url'] = url.strip()
        merged_sample = merged_sample[sample.columns]
        merged_sample.to_csv(join(home, provider, f'{provider}_molecular_metadata-sample.tsv'), sep='\t', index=False) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['raw_data_url'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['raw_data_url'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['raw_data_url'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See