In [1]:
import pandas as pd
import numpy as np

In [2]:
#load civic data
raw_df = pd.read_csv("civic_raw.tsv", sep='\t')
raw_df['doid'] = raw_df['doid'].fillna(0).astype('int64')
raw_df.head(4)

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,phenotypes,therapies,therapy_interaction_type,evidence_type,evidence_direction,evidence_level,...,citation,nct_ids,rating,evidence_status,evidence_id,variant_origin,last_review_date,evidence_civic_url,molecular_profile_civic_url,is_flagged
0,JAK2 V617F,64,Lymphoid Leukemia,1037,,,,Diagnostic,Supports,B,...,"Levine et al., 2005",,4.0,accepted,1,Somatic,2023-01-09 21:46:26 UTC,https://civicdb.org/links/evidence_items/1,https://civicdb.org/links/molecular_profiles/64,False
1,PDGFRA D842V,99,Gastrointestinal Stromal Tumor,9253,,,,Diagnostic,Supports,B,...,"Lasota et al., 2004",,3.0,accepted,2,Somatic,2023-01-09 21:46:27 UTC,https://civicdb.org/links/evidence_items/2,https://civicdb.org/links/molecular_profiles/99,False
2,DNMT3A R882,32,Acute Myeloid Leukemia,9119,,,,Diagnostic,Supports,B,...,"LaRochelle et al., 2011",,2.0,accepted,3,Somatic,2023-01-09 21:46:25 UTC,https://civicdb.org/links/evidence_items/3,https://civicdb.org/links/molecular_profiles/32,False
3,DNMT3A R882,32,Acute Myeloid Leukemia,9119,,,,Diagnostic,Supports,B,...,"Ribeiro et al., 2012",,3.0,accepted,4,Somatic,2023-01-09 21:46:25 UTC,https://civicdb.org/links/evidence_items/4,https://civicdb.org/links/molecular_profiles/32,False


In [3]:
#check column names and delete unnecessary columns
raw_df.columns

Index(['molecular_profile', 'molecular_profile_id', 'disease', 'doid',
       'phenotypes', 'therapies', 'therapy_interaction_type', 'evidence_type',
       'evidence_direction', 'evidence_level', 'significance',
       'evidence_statement', 'citation_id', 'source_type', 'asco_abstract_id',
       'citation', 'nct_ids', 'rating', 'evidence_status', 'evidence_id',
       'variant_origin', 'last_review_date', 'evidence_civic_url',
       'molecular_profile_civic_url', 'is_flagged'],
      dtype='object')

In [4]:
raw_df = raw_df.drop(columns=["phenotypes", "therapies", "therapy_interaction_type", "evidence_direction", "evidence_level", 
                              "significance", "source_type", 'asco_abstract_id', 'citation', 'nct_ids', 'rating', 
                              'evidence_status', 'evidence_id', 'variant_origin', 'last_review_date', 'evidence_civic_url', 
                              'molecular_profile_civic_url', 'is_flagged'])

raw_df.head(4)

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id
0,JAK2 V617F,64,Lymphoid Leukemia,1037,Diagnostic,JAK2 V617F is not associated with lymphoid leu...,16081687
1,PDGFRA D842V,99,Gastrointestinal Stromal Tumor,9253,Diagnostic,GIST tumors harboring PDGFRA D842V mutation ar...,15146165
2,DNMT3A R882,32,Acute Myeloid Leukemia,9119,Diagnostic,DNMT3A R882 mutations occur most often in de n...,22081665
3,DNMT3A R882,32,Acute Myeloid Leukemia,9119,Diagnostic,Young AML patients (<60 years old) with DNMT3A...,22490330


In [5]:
#drop any possible duplicates
raw_df.drop_duplicates(subset=None, keep="first", inplace=True)
raw_df.shape

(4282, 7)

In [6]:
# Make a dataframe that contains more that one biomarker per row
AND_df = raw_df[raw_df["molecular_profile"].str.contains(" AND | OR ", case=False)]
print(AND_df)
AND_df.shape

                                      molecular_profile  molecular_profile_id  \
64                            BRAF V600E AND BRAF V600M                  4170   
80                    BRAF V600E AND BRAF Amplification                  4173   
124                        ALK EML4::ALK AND ALK L1196M                  4230   
205                        ABL1 BCR::ABL AND ABL1 E255K                  4431   
206                        ABL1 BCR::ABL AND ABL1 T315I                  4373   
...                                                 ...                   ...   
4271  CCND1 Amplification OR CCND2 Amplification OR ...                  4709   
4274                             MSH2 Loss OR MLH1 Loss                  4733   
4277  BRAF V600E OR NRAS Mutation OR HRAS Mutation O...                  4715   
4286  BRAF V600E OR NRAS Mutation OR HRAS Mutation O...                  4748   
4300                     ARID1A Loss OR ARID1A Wildtype                  4882   

                           

(362, 7)

In [7]:
#Make a dataframe to separate the biomarkers
AND1_df = AND_df.copy()
AND2_df = AND_df.copy()
AND3_df = AND_df.copy()
AND4_df = AND_df.copy()
AND5_df = AND_df.copy()
AND6_df = AND_df.copy()

AND3_df.head()

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id
64,BRAF V600E AND BRAF V600M,4170,Melanoma,1909,Predictive,A single 66-year old male patient with advance...,23031422
80,BRAF V600E AND BRAF Amplification,4173,Colorectal Cancer,9256,Predictive,COLO201 and COLO206F cells harboring BRAF V600...,21098728
124,ALK EML4::ALK AND ALK L1196M,4230,Lung Non-small Cell Carcinoma,3908,Predictive,CH5424802 treatment resulted in significant tu...,21575866
205,ABL1 BCR::ABL AND ABL1 E255K,4431,Chronic Myeloid Leukemia,8552,Predictive,COS7 cell lines transfected with BCR-ABL const...,15194504
206,ABL1 BCR::ABL AND ABL1 T315I,4373,Chronic Myeloid Leukemia,8552,Predictive,In chronic myeloid leukemia patients with the ...,20537386


In [8]:
# Split biomarkers
sep_df = AND_df["molecular_profile"].str.split(" AND | OR ", expand=True)
sep_df.shape

(362, 7)

In [9]:
#Put each biomarker in one dataframe
AND_df["molecular_profile"] = sep_df[0]
AND_df.shape
AND1_df["molecular_profile"] = sep_df[1]
AND1_df.shape
AND2_df["molecular_profile"] = sep_df[2]
AND2_df.shape
AND3_df["molecular_profile"] = sep_df[3]
AND3_df.shape
AND4_df["molecular_profile"] = sep_df[4]
AND4_df.shape
AND5_df["molecular_profile"] = sep_df[5]
AND5_df.shape
AND6_df["molecular_profile"] = sep_df[6]
AND6_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  AND_df["molecular_profile"] = sep_df[0]


(362, 7)

In [10]:
#Join dataframes
newsep_df = pd.concat([AND_df, AND1_df, AND2_df, AND3_df, AND4_df, AND5_df, AND6_df], ignore_index=True, sort=False)
newsep_df.head()

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id
0,BRAF V600E,4170,Melanoma,1909,Predictive,A single 66-year old male patient with advance...,23031422
1,BRAF V600E,4173,Colorectal Cancer,9256,Predictive,COLO201 and COLO206F cells harboring BRAF V600...,21098728
2,ALK EML4::ALK,4230,Lung Non-small Cell Carcinoma,3908,Predictive,CH5424802 treatment resulted in significant tu...,21575866
3,ABL1 BCR::ABL,4431,Chronic Myeloid Leukemia,8552,Predictive,COS7 cell lines transfected with BCR-ABL const...,15194504
4,ABL1 BCR::ABL,4373,Chronic Myeloid Leukemia,8552,Predictive,In chronic myeloid leukemia patients with the ...,20537386


In [11]:
#Delete rows with no biomarker
newsep_df = newsep_df.dropna(subset=["molecular_profile"])
newsep_df.shape

(742, 7)

In [12]:
# Remove rows that contain more than one biomarker
single_df = raw_df[~raw_df["molecular_profile"].str.contains(" AND | OR ")]
single_df.shape

(3925, 7)

In [13]:
#Add rows with the separated biomarkers
single_df = pd.concat([single_df, newsep_df], ignore_index=True, sort=False)
single_df.shape

(4667, 7)

In [14]:
# Eliminate duplicated rows
single_df.drop_duplicates(subset=None, keep="first", inplace=True)
single_df.shape

(4662, 7)

In [15]:
# List rows with biomarkers joined by and/or and download for manual curation
andor_df = single_df[single_df["molecular_profile"].str.contains(" and | or ")]
andor_df.to_csv('andor.tsv', sep='\t', index=False)

In [16]:
#Upload manually separated and/or biomarkers
andor_curated = pd.read_csv('andor_curated.tsv', sep='\t')
andor_curated.head()

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id
0,NTRK1 LMNA::NTRK1 G595R,1256,Colorectal Adenocarcinoma,50861,Predictive,A gene fusion between exon 10 of NTRK1 and exo...,26546295
1,NTRK1 LMNA::NTRK1 G667C,1256,Colorectal Adenocarcinoma,50861,Predictive,A gene fusion between exon 10 of NTRK1 and exo...,26546295
2,PIK3CA H1047L,3634,Breast Tubular Carcinoma,6587,Prognostic,In a retrospective analysis of 32 Western Chin...,28269754
3,PIK3CA H1047R,3634,Breast Tubular Carcinoma,6587,Prognostic,In a retrospective analysis of 32 Western Chin...,28269754
4,PIK3CA Exon 10 mutation,1867,Colorectal Cancer,9256,Prognostic,"This was a prospective study involving 1,170 c...",22357840


In [17]:
#Eliminate and/or rows from dataset and include newuly separated rows
single_df = single_df[~single_df["molecular_profile"].str.contains(" and | or ")]
single_df = pd.concat([single_df, andor_curated], ignore_index=True, sort=False)
single_df.shape

(4667, 7)

In [18]:
single_df.tail(10)

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id
4657,NTRK1 LMNA::NTRK1 G595R,1256,Colorectal Adenocarcinoma,50861,Predictive,A gene fusion between exon 10 of NTRK1 and exo...,26546295
4658,NTRK1 LMNA::NTRK1 G667C,1256,Colorectal Adenocarcinoma,50861,Predictive,A gene fusion between exon 10 of NTRK1 and exo...,26546295
4659,PIK3CA H1047L,3634,Breast Tubular Carcinoma,6587,Prognostic,In a retrospective analysis of 32 Western Chin...,28269754
4660,PIK3CA H1047R,3634,Breast Tubular Carcinoma,6587,Prognostic,In a retrospective analysis of 32 Western Chin...,28269754
4661,PIK3CA Exon 10 mutation,1867,Colorectal Cancer,9256,Prognostic,"This was a prospective study involving 1,170 c...",22357840
4662,PIK3CA Exon 21 mutation,1867,Colorectal Cancer,9256,Prognostic,"This was a prospective study involving 1,170 c...",22357840
4663,PIK3CA Exon 10 mutation,3179,Rectum Cancer,1993,Prognostic,This study examined outcomes of 240 rectum can...,19903786
4664,PIK3CA Exon 21 mutation,3179,Rectum Cancer,1993,Prognostic,This study examined outcomes of 240 rectum can...,19903786
4665,IKZF1 IKZF1 deletion,2613,Childhood B-cell Acute Lymphoblastic Leukemia,80146,Prognostic,This study reports on two pediatric cohorts wi...,19129520
4666,IKZF1 IKZF1 mutation,2613,Childhood B-cell Acute Lymphoblastic Leukemia,80146,Prognostic,This study reports on two pediatric cohorts wi...,19129520


In [19]:
# Add the word mutation to rows with only gene and mutation symbols
skip_list = ["mutation", "overexpression", "expression", "amplification", "rearrangement", "loss", "underexpression", 
             "wildtype", "exon", "frameshift", "truncation", "promoter", "hypermethylation", "phosphorylation", "methylation",
             "alteration", "fusion", "gain-of-function", "del", "ins"]
single_df['molecular_profile'] = single_df['molecular_profile'].apply(lambda x: x + ' mutation' 
                                          if len(x.split()) < 3 
                                          and all(word not in x.lower() for word in skip_list)
                                                  else x)
single_df.head()

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id
0,JAK2 V617F mutation,64,Lymphoid Leukemia,1037,Diagnostic,JAK2 V617F is not associated with lymphoid leu...,16081687
1,PDGFRA D842V mutation,99,Gastrointestinal Stromal Tumor,9253,Diagnostic,GIST tumors harboring PDGFRA D842V mutation ar...,15146165
2,DNMT3A R882 mutation,32,Acute Myeloid Leukemia,9119,Diagnostic,DNMT3A R882 mutations occur most often in de n...,22081665
3,DNMT3A R882 mutation,32,Acute Myeloid Leukemia,9119,Diagnostic,Young AML patients (<60 years old) with DNMT3A...,22490330
4,JAK2 V617F mutation,64,Chronic Myeloid Leukemia,8552,Diagnostic,JAK2 V617F is associated with myeloid malignan...,16081687


In [20]:
#extracting the gene symbol
gene_split = single_df["molecular_profile"].str.split(" ",n=1, expand=True)
single_df["gene"] = gene_split[0]
single_df.head(5)

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id,gene
0,JAK2 V617F mutation,64,Lymphoid Leukemia,1037,Diagnostic,JAK2 V617F is not associated with lymphoid leu...,16081687,JAK2
1,PDGFRA D842V mutation,99,Gastrointestinal Stromal Tumor,9253,Diagnostic,GIST tumors harboring PDGFRA D842V mutation ar...,15146165,PDGFRA
2,DNMT3A R882 mutation,32,Acute Myeloid Leukemia,9119,Diagnostic,DNMT3A R882 mutations occur most often in de n...,22081665,DNMT3A
3,DNMT3A R882 mutation,32,Acute Myeloid Leukemia,9119,Diagnostic,Young AML patients (<60 years old) with DNMT3A...,22490330,DNMT3A
4,JAK2 V617F mutation,64,Chronic Myeloid Leukemia,8552,Diagnostic,JAK2 V617F is associated with myeloid malignan...,16081687,JAK2


In [21]:
#Download gene col and convert the gene symbol list to gene id with NCBI DAVID conversion tool: https://david.ncifcrf.gov/conversion.jsp
single_df["gene"].to_csv("gene_list.tsv", sep="\t", index=False)

In [22]:
#upload converted list
david_geneid = pd.read_csv("david_geneid.txt", sep='\t')
print(david_geneid.head(4))

    From     To       Species  \
0   ARAF    369  Homo sapiens   
1   CBLC  23624  Homo sapiens   
2  IKZF1  10320  Homo sapiens   
3  ALCAM    214  Homo sapiens   

                                           Gene Name  
0  A-Raf proto-oncogene, serine/threonine kinase(...  
1                         Cbl proto-oncogene C(CBLC)  
2                 IKAROS family zinc finger 1(IKZF1)  
3  activated leukocyte cell adhesion molecule(ALCAM)  


In [23]:
# Merging gene id data
david_geneid = david_geneid.drop(columns=["Species", "Gene Name"])
david_geneid.rename(columns={"From":"gene"}, inplace=True)

df_merged= pd.merge(single_df, david_geneid, on="gene", how="left")
df_merged['To'] = df_merged['To'].fillna(0).astype('Int64')
df_merged.head(4)

Unnamed: 0,molecular_profile,molecular_profile_id,disease,doid,evidence_type,evidence_statement,citation_id,gene,To
0,JAK2 V617F mutation,64,Lymphoid Leukemia,1037,Diagnostic,JAK2 V617F is not associated with lymphoid leu...,16081687,JAK2,3717
1,PDGFRA D842V mutation,99,Gastrointestinal Stromal Tumor,9253,Diagnostic,GIST tumors harboring PDGFRA D842V mutation ar...,15146165,PDGFRA,5156
2,DNMT3A R882 mutation,32,Acute Myeloid Leukemia,9119,Diagnostic,DNMT3A R882 mutations occur most often in de n...,22081665,DNMT3A,1788
3,DNMT3A R882 mutation,32,Acute Myeloid Leukemia,9119,Diagnostic,Young AML patients (<60 years old) with DNMT3A...,22490330,DNMT3A,1788


In [24]:
#Renaming and organizing columns to biomarker data model format
df_merged = df_merged.reindex(columns=df_merged.columns.tolist() + ['exposure_agent', 'exposure_agent_ID', 'specimen', 'specimen_ID', 'loinc_code', "evidence2"])
df_merged2 = df_merged[['molecular_profile', 'gene', 'To', 'disease', 'doid', 'exposure_agent', 'exposure_agent_ID',
       'evidence_type','specimen', 'specimen_ID', 'loinc_code', 'citation_id','evidence_statement', "molecular_profile_id", "evidence2"]]
df_merged2.columns = ['Biomarker', 'assessesed_biomarker_entity', 'assessed_biomarker_entity_ID','condition', 'condition_ID',
                     'exposure_agent', 'exposure_agent_ID', 'best_biomarker_role', 'specimen', 'specimen_ID', 'loinc_code', 'evidence_source','evidence', 'evidence_source2','evidence2']
df_merged2.insert(3, 'assessed_entity_type', "gene")
df_merged2.insert(16, 'tag', 'biomarker;assessed_biomarker_entity_id;assessed_biomarker_entity;condition')

df_merged2.head()

Unnamed: 0,Biomarker,assessesed_biomarker_entity,assessed_biomarker_entity_ID,assessed_entity_type,condition,condition_ID,exposure_agent,exposure_agent_ID,best_biomarker_role,specimen,specimen_ID,loinc_code,evidence_source,evidence,evidence_source2,evidence2,tag
0,JAK2 V617F mutation,JAK2,3717,gene,Lymphoid Leukemia,1037,,,Diagnostic,,,,16081687,JAK2 V617F is not associated with lymphoid leu...,64,,biomarker;assessed_biomarker_entity_id;assesse...
1,PDGFRA D842V mutation,PDGFRA,5156,gene,Gastrointestinal Stromal Tumor,9253,,,Diagnostic,,,,15146165,GIST tumors harboring PDGFRA D842V mutation ar...,99,,biomarker;assessed_biomarker_entity_id;assesse...
2,DNMT3A R882 mutation,DNMT3A,1788,gene,Acute Myeloid Leukemia,9119,,,Diagnostic,,,,22081665,DNMT3A R882 mutations occur most often in de n...,32,,biomarker;assessed_biomarker_entity_id;assesse...
3,DNMT3A R882 mutation,DNMT3A,1788,gene,Acute Myeloid Leukemia,9119,,,Diagnostic,,,,22490330,Young AML patients (<60 years old) with DNMT3A...,32,,biomarker;assessed_biomarker_entity_id;assesse...
4,JAK2 V617F mutation,JAK2,3717,gene,Chronic Myeloid Leukemia,8552,,,Diagnostic,,,,16081687,JAK2 V617F is associated with myeloid malignan...,64,,biomarker;assessed_biomarker_entity_id;assesse...


In [25]:
df_merged2["assessed_biomarker_entity_ID"] = "NCBI:" + df_merged2["assessed_biomarker_entity_ID"].astype(str)
df_merged2["condition_ID"] = "DOID:" + df_merged2["condition_ID"].astype(str)
df_merged2["evidence_source"] = "Pubmed:" + df_merged2["evidence_source"].astype(str)
df_merged2["evidence_source2"] = "CIVIC:" + df_merged2["evidence_source2"].astype(str)

df_merged2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged2["assessed_biomarker_entity_ID"] = "NCBI:" + df_merged2["assessed_biomarker_entity_ID"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged2["condition_ID"] = "DOID:" + df_merged2["condition_ID"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged2["evide

Unnamed: 0,Biomarker,assessesed_biomarker_entity,assessed_biomarker_entity_ID,assessed_entity_type,condition,condition_ID,exposure_agent,exposure_agent_ID,best_biomarker_role,specimen,specimen_ID,loinc_code,evidence_source,evidence,evidence_source2,evidence2,tag
0,JAK2 V617F mutation,JAK2,NCBI:3717,gene,Lymphoid Leukemia,DOID:1037,,,Diagnostic,,,,Pubmed:16081687,JAK2 V617F is not associated with lymphoid leu...,CIVIC:64,,biomarker;assessed_biomarker_entity_id;assesse...
1,PDGFRA D842V mutation,PDGFRA,NCBI:5156,gene,Gastrointestinal Stromal Tumor,DOID:9253,,,Diagnostic,,,,Pubmed:15146165,GIST tumors harboring PDGFRA D842V mutation ar...,CIVIC:99,,biomarker;assessed_biomarker_entity_id;assesse...
2,DNMT3A R882 mutation,DNMT3A,NCBI:1788,gene,Acute Myeloid Leukemia,DOID:9119,,,Diagnostic,,,,Pubmed:22081665,DNMT3A R882 mutations occur most often in de n...,CIVIC:32,,biomarker;assessed_biomarker_entity_id;assesse...
3,DNMT3A R882 mutation,DNMT3A,NCBI:1788,gene,Acute Myeloid Leukemia,DOID:9119,,,Diagnostic,,,,Pubmed:22490330,Young AML patients (<60 years old) with DNMT3A...,CIVIC:32,,biomarker;assessed_biomarker_entity_id;assesse...
4,JAK2 V617F mutation,JAK2,NCBI:3717,gene,Chronic Myeloid Leukemia,DOID:8552,,,Diagnostic,,,,Pubmed:16081687,JAK2 V617F is associated with myeloid malignan...,CIVIC:64,,biomarker;assessed_biomarker_entity_id;assesse...


In [26]:
# Duplicate the dataset and add one type of evidence to each duplicate
pubmed_df1 = df_merged2.drop(columns=["evidence_source2", "evidence2"])
civic_df2 = df_merged2.drop(columns=["evidence_source", "evidence"])
civic_df2 = civic_df2.rename(columns = {"evidence_source2":"evidence_source", "evidence2":"evidence"})

civic_df2.head()

Unnamed: 0,Biomarker,assessesed_biomarker_entity,assessed_biomarker_entity_ID,assessed_entity_type,condition,condition_ID,exposure_agent,exposure_agent_ID,best_biomarker_role,specimen,specimen_ID,loinc_code,evidence_source,evidence,tag
0,JAK2 V617F mutation,JAK2,NCBI:3717,gene,Lymphoid Leukemia,DOID:1037,,,Diagnostic,,,,CIVIC:64,,biomarker;assessed_biomarker_entity_id;assesse...
1,PDGFRA D842V mutation,PDGFRA,NCBI:5156,gene,Gastrointestinal Stromal Tumor,DOID:9253,,,Diagnostic,,,,CIVIC:99,,biomarker;assessed_biomarker_entity_id;assesse...
2,DNMT3A R882 mutation,DNMT3A,NCBI:1788,gene,Acute Myeloid Leukemia,DOID:9119,,,Diagnostic,,,,CIVIC:32,,biomarker;assessed_biomarker_entity_id;assesse...
3,DNMT3A R882 mutation,DNMT3A,NCBI:1788,gene,Acute Myeloid Leukemia,DOID:9119,,,Diagnostic,,,,CIVIC:32,,biomarker;assessed_biomarker_entity_id;assesse...
4,JAK2 V617F mutation,JAK2,NCBI:3717,gene,Chronic Myeloid Leukemia,DOID:8552,,,Diagnostic,,,,CIVIC:64,,biomarker;assessed_biomarker_entity_id;assesse...


In [27]:
#Join civic and pubmed dfs
civic_data = pd.concat([pubmed_df1, civic_df2], ignore_index=True, sort=False)

civic_data.shape

(9334, 15)

In [28]:
#Drop duplicates
civic_data = civic_data.drop_duplicates()
civic_data.shape

(7396, 15)

In [29]:
civic_data.to_csv("civic_data.tsv", sep= "\t", index=False)