In [21]:
from Bio import Entrez
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import csv

## **Collection Data**

# Step 1: Reading and Storing Abstract 

In [3]:
df = pd.read_csv('pubmed_ids.csv')
pubmed_ids = df['PubMed ID'].tolist()

In [5]:
# Set your email address for identification
Entrez.email = "sidesh.sundar@example.com"

In [6]:
# Function to fetch abstract
def fetch_abstract(pubmed_id):
    try:
        handle = Entrez.efetch(db="pubmed", id=str(pubmed_id), retmode="xml")
        record = Entrez.read(handle)
        abstract = record['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
        return abstract
    except Exception as e:
        print(f"Error fetching abstract for PubMed ID {pubmed_id}: {str(e)}")
        return None

In [8]:
# Fetch abstracts for all PubMed IDs
pubmed_abstracts = [fetch_abstract(pubmed_id) for pubmed_id in pubmed_ids]

In [9]:
# Filter out None values (failed to fetch abstracts)
pubmed_abstracts = [abstract for abstract in pubmed_abstracts if abstract is not None]

In [10]:
# Create a DataFrame with PubMed IDs and Abstracts
abstract_df = pd.DataFrame({'PubMedID': pubmed_ids, 'Abstract': pubmed_abstracts})

In [None]:
medk=line_df = pd.DataFrame({'PubMedID': pubmed_ids, 'Abstract': pubmed_abstracts})

In [11]:
#to create a csv file
#filename=open(".csv", "w")

In [12]:
# Save the DataFrame to a CSV file named 'abstract.csv'
abstract_df.to_csv('abstract.csv', index=False)

 # Step 2:Target Document Triage

In [16]:
# Calculate abstract lengths
abstract_df['AbstractLength'] = abstract_df['Abstract'].apply(len)

In [17]:
# Sort the DataFrame by abstract length in descending order
triaged_abstract_df = abstract_df.sort_values(by='AbstractLength', ascending=False)

In [18]:
# Save the triaged DataFrame to a CSV file named 'triaged_abstract.csv'
triaged_abstract_df.to_csv('triaged_abstract.csv', index=False)

In [22]:
#to display the data in csv file
'''with open("triaged_abstract.csv","r") as a:
    csvreader = csv.reader(a)
    for i in a:
        print(i)
'''    

'with open("triaged_abstract.csv","r") as a:\n    csvreader = csv.reader(a)\n    for i in a:\n        print(i)\n'

# Step 3: Extract gene from the abstract

In [2]:
# Replace with the actual file paths
gene_file_path = 'GENE.csv'
abstract_file_path = 'abstract.csv'

In [3]:
# Read gene names from the gene file
gene_df = pd.read_csv(gene_file_path)
genes = gene_df['Gene Symbol'].tolist()

In [4]:
# Read abstracts from the abstract file
abstract_df = pd.read_csv(abstract_file_path)
pubmed_ids = abstract_df['PubMedID'].tolist()
abstracts = abstract_df['Abstract'].tolist()

# Step 4 BERT model to tokenize the GENE from the abstract 

In [5]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Tokenize and encode the abstracts
tokenized_abstracts = tokenizer(abstracts, return_tensors='pt', truncation=True, padding=True)

In [7]:
# Forward pass through the model
with torch.no_grad():
    outputs = model(**tokenized_abstracts)

In [8]:
# Extract the embeddings for further analysis
embeddings = outputs.last_hidden_state

In [9]:
# Assuming you want to find mentions of each gene in the abstracts
gene_mentions = []

In [10]:
for i, gene in enumerate(genes):
    # Convert gene to string
    gene_str = str(gene)
    
    # Tokenize and encode the gene
    gene_tokens = tokenizer.tokenize(gene_str)
    gene_str = " ".join(gene_tokens)

    for j, (pubmed_id, abstract) in enumerate(zip(pubmed_ids, abstracts)):
        # Tokenize and encode the abstract
        abstract_tokens = tokenizer.tokenize(abstract)
        abstract_str = " ".join(abstract_tokens)

        # Find indices of gene mentions in the abstract
        if gene_str in abstract_str:
            gene_mentions.append({'Gene': gene_str, 'PubMedID': pubmed_id})

In [11]:
# Create a DataFrame with gene mentions
gene_mentions_df = pd.DataFrame(gene_mentions)

In [12]:
# Save the DataFrame to a CSV file named 'gene_mentions.csv'
gene_mentions_df.to_csv('gene_mentions.csv', index=False)

# Step 5: Finetuning the Tokens

##  **Predefined Lexicon Compiling**

In [1]:
# Placeholder file paths
ttd_targets_file_path = 'P1-01-TTD_target_download.txt'
ttd_drugs_file_path = 'P1-02-TTD_drug_download.txt'
drug_disease_mapping_file_path = 'P1-05-Drug_disease.txt'

# Step 1: Parse TTD Target Information

In [2]:
# Read TTD Target Information
with open(ttd_targets_file_path, 'r') as targets_file:
    ttd_targets_data = targets_file.readlines()

In [3]:
# Initialize a list to store cancer-related entities
cancer_related_entities = []

In [4]:
# Parsing logic to extract cancer-related entities
for line in ttd_targets_data:
    # Example: If the line contains the word "cancer," consider it relevant
    if ("cancer" or "melanoma")  in line.lower():
        # Extract relevant information (modify this based on your actual data)
        relevant_info = line.strip()  # Adjust this based on your data structure
        cancer_related_entities.append(relevant_info)

In [5]:
# Output the extracted cancer-related entities
for entity in cancer_related_entities:
    print(entity)

T47101	KEGGPATH	hsa05200:Pathways in cancer
T47101	KEGGPATH	hsa05205:Proteoglycans in cancer
T47101	KEGGPATH	hsa05215:Prostate cancer
T47101	KEGGPATH	hsa05230:Central carbon metabolism in cancer
T47101	WIKIPATH	WP2377:Integrated Pancreatic Cancer Pathway
T59328	KEGGPATH	hsa05200:Pathways in cancer
T59328	KEGGPATH	hsa05205:Proteoglycans in cancer
T59328	KEGGPATH	hsa05206:MicroRNAs in cancer
T59328	KEGGPATH	hsa05212:Pancreatic cancer
T59328	KEGGPATH	hsa05213:Endometrial cancer
T59328	KEGGPATH	hsa05215:Prostate cancer
T59328	KEGGPATH	hsa05219:Bladder cancer
T59328	KEGGPATH	hsa05223:Non-small cell lung cancer
T59328	KEGGPATH	hsa05230:Central carbon metabolism in cancer
T59328	KEGGPATH	hsa05231:Choline metabolism in cancer
T59328	WIKIPATH	WP1984:Integrated Breast Cancer Pathway
T59328	WIKIPATH	WP2363:Gastric cancer network 2
T59328	WIKIPATH	WP2377:Integrated Pancreatic Cancer Pathway
T59328	WIKIPATH	WP2828:Bladder Cancer
T59328	WIKIPATH	WP2868:TCA Cycle Nutrient Utilization and Invasiveness

# Step 2: Parse TTD Drug Information

In [6]:
# Initialize a list to store cancer or melanoma-related entities from drugs
cancer_entities_drug = []

In [7]:
# Read TTD Drug Information
with open(ttd_drugs_file_path, 'r') as drugs_file:
    ttd_drugs_data = drugs_file.readlines()

In [8]:
# Parsing logic to extract cancer or melanoma-related entities from drugs
for line in ttd_drugs_data:
    # Example: If the line contains the words "cancer" or "melanoma," consider it relevant
    if "cancer" in line.lower() or "melanoma" in line.lower():
        # Extract relevant information (modify this based on your actual data)
        relevant_info = line.strip()  # Adjust this based on your data structure
        cancer_entities_drug.append(relevant_info)

In [9]:
# Output the extracted cancer or melanoma-related entities from drugs
for entity in  cancer_entities_drug:
    print(entity)

D00BCG	THERCLAS	Anticancer Agents
D00BVJ	DRUGCOMP	Emd serono; national cancer institute
D00FGO	THERCLAS	Anticancer Agents
D00FHR	THERCLAS	Anticancer Agents
D00GNK	THERCLAS	Anticancer Agents
D00HCQ	THERCLAS	Anticancer Agents
D00HPD	THERCLAS	Anticancer Agents
D00ITT	DRUGCOMP	MEMORIAL SLOAN-KETTERING CANCER CENTER THE ROCKEFELLER UNIVERSITY
D00JJW	THERCLAS	Anticancer Agents
D00MDP	THERCLAS	Anticancer Agents
D00OCB	THERCLAS	Anticancer Agents
D00STL	THERCLAS	Anticancer Agents
D00TXZ	DRUGCOMP	CANCER RESEARCH TECHNOLOGY LIMITED
D00UZR	THERCLAS	Anticancer Agents
D00XEU	THERCLAS	Anticancer Agents
D00YPR	THERCLAS	Anticancer Agents
D00YXG	DRUGCOMP	National Cancer Institute (NCI)
D00ZHR	DRUGCOMP	AntiCancer Inc
D00ZRS	DRUGCOMP	National Cancer Institute
D01AEF	DRUGCOMP	Memorial Sloan-Kettering Cancer Center
D01AHO	THERCLAS	Anticancer Agents
D01AWH	DRUGCOMP	MEMORIAL SLOAN-KETTERING CANCER CENTER THE ROCKEFELLER UNIVERSITY
D01BVU	DRUGCOMP	H. LEE MOFFITT CANCER CENTER AND RESEARCH INSTITUTE TURKSON, Ja

# Step 3: Parse Drug to Disease Mapping

In [10]:
# Initialize a list to store cancer or melanoma-related entities from drugs
cancer_melanoma_dtd = []

In [11]:
# Read TTD Drug Information
with open(ttd_drugs_file_path, 'r') as drugs_file:
    ttd_drugs_data = drugs_file.readlines()

In [12]:
# Parsing logic to extract cancer or melanoma-related entities from drugs
for line in ttd_drugs_data:
    # Example: If the line contains the words "cancer" or "melanoma," consider it relevant
    if "cancer" in line.lower() or "melanoma" in line.lower():
        # Extract relevant information (modify this based on your actual data)
        relevant_info = line.strip()  # Adjust this based on your data structure
        cancer_melanoma_dtd.append(relevant_info)

In [13]:
# Output the extracted cancer or melanoma-related entities from drugs
for entity in cancer_melanoma_dtd:
    print(entity)


D00BCG	THERCLAS	Anticancer Agents
D00BVJ	DRUGCOMP	Emd serono; national cancer institute
D00FGO	THERCLAS	Anticancer Agents
D00FHR	THERCLAS	Anticancer Agents
D00GNK	THERCLAS	Anticancer Agents
D00HCQ	THERCLAS	Anticancer Agents
D00HPD	THERCLAS	Anticancer Agents
D00ITT	DRUGCOMP	MEMORIAL SLOAN-KETTERING CANCER CENTER THE ROCKEFELLER UNIVERSITY
D00JJW	THERCLAS	Anticancer Agents
D00MDP	THERCLAS	Anticancer Agents
D00OCB	THERCLAS	Anticancer Agents
D00STL	THERCLAS	Anticancer Agents
D00TXZ	DRUGCOMP	CANCER RESEARCH TECHNOLOGY LIMITED
D00UZR	THERCLAS	Anticancer Agents
D00XEU	THERCLAS	Anticancer Agents
D00YPR	THERCLAS	Anticancer Agents
D00YXG	DRUGCOMP	National Cancer Institute (NCI)
D00ZHR	DRUGCOMP	AntiCancer Inc
D00ZRS	DRUGCOMP	National Cancer Institute
D01AEF	DRUGCOMP	Memorial Sloan-Kettering Cancer Center
D01AHO	THERCLAS	Anticancer Agents
D01AWH	DRUGCOMP	MEMORIAL SLOAN-KETTERING CANCER CENTER THE ROCKEFELLER UNIVERSITY
D01BVU	DRUGCOMP	H. LEE MOFFITT CANCER CENTER AND RESEARCH INSTITUTE TURKSON, Ja

In [15]:
ttd_disease = cancer_related_entities
ttd_drug = cancer_entities_drug
ttd_dtd = cancer_melanoma_dtd

# Step 4: Compile Lexicon

In [16]:
compiled_lexicon = set()

In [17]:
compiled_lexicon.update(ttd_disease)
compiled_lexicon.update(ttd_drug)
compiled_lexicon.update(ttd_dtd)

In [18]:
for entity in compiled_lexicon:
    print(entity)

T53524	KEGGPATH	hsa05231:Choline metabolism in cancer
T49368	FUNCTION	Negatively regulates B cell proliferation and also has an important function in self-antigen induced B cell tolerance induction. Upon DNA damage, activates the promoter of the death-promoting transcription factor BCLAF1/Btf to trigger BCLAF1-mediated p53/TP53 gene transcription and apoptosis. In response to oxidative stress, interact with and activate CHUK/IKKA in the nucleus, causing the phosphorylation of p53/TP53. In the case of ER stress or DNA damage-induced apoptosis, can form a complex with the tyrosine-protein kinase ABL1 which trigger apoptosis independently of p53/TP53. In cytosol can trigger apoptosis by activating MAPK11 or MAPK14, inhibiting AKT1 and decreasing the level of X-linked inhibitor of apoptosis protein (XIAP), whereas in nucleus induces apoptosis via the activation of MAPK8 or MAPK9. Upon ionizing radiation treatment, is required for the activation of the apoptosis regulators BAX and BAK, whic

# Step 5: Saving results

In [19]:
# Placeholder file path for saving the compiled lexicon
lexicon_csv_file_path = 'lexicon_extraction.csv'

In [29]:
# Save the compiled lexicon to CSV
with open(lexicon_csv_file_path, 'w', newline='') as lexicon_csv_file:
    lexicon_csv_writer = csv.writer(lexicon_csv_file)
     # Write each entity to the CSV file
    for entity in compiled_lexicon:
        lexicon_csv_writer.writerow([entity])

In [28]:
print(f"Compiled lexicon has been saved to {lexicon_csv_file_path}")

Compiled lexicon has been saved to lexicon_extraction.csv
