In [None]:
# Mount Google Drive so later cells can read and write files under /content/drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install biopython



In [None]:
!pip install transformers



In [None]:
!pip install spacy



In [None]:
!pip install nlpaug



In [None]:
from Bio import Entrez
import pandas as pd
import spacy
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from transformers import BertTokenizer, BertModel
import torch
import csv

## **Data Collection**

# Step 1: Reading and Storing Abstract

In [None]:
# Load the input table and take the values of its 'PubMed ID' column as a plain Python list
df = pd.read_csv('/content/drive/MyDrive/BIO/pubmed_ids.csv')
pubmed_ids = df['PubMed ID'].tolist()

In [None]:
# Set your email address for identification
# NOTE(review): this is a placeholder address; replace it with a real contact before sending requests
Entrez.email = "sidesh.sundar@example.com"

In [None]:
# Function to fetch abstract
def fetch_abstract(pubmed_id):
    """Fetch the abstract text of one PubMed article through Entrez.

    Parameters
    ----------
    pubmed_id : int or str
        PubMed identifier; it is converted to ``str`` before the request.

    Returns
    -------
    str or None
        Every ``AbstractText`` section of the first returned article, joined
        with single spaces. ``None`` when the request or parsing fails, or
        when the record has no abstract text; the error is printed instead
        of being raised so a batch of fetches can continue.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=str(pubmed_id), retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even when parsing the XML fails.
            handle.close()
        sections = record['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
        # Structured abstracts are split into several sections; keep all of
        # them instead of only the first one.
        abstract = " ".join(str(section) for section in sections)
        if not abstract:
            raise ValueError("record contains no abstract text")
        return abstract
    except Exception as e:
        print(f"Error fetching abstract for PubMed ID {pubmed_id}: {str(e)}")
        return None

In [None]:
# Fetch abstracts for all PubMed IDs
# (one fetch_abstract call per ID, in input order; a failed fetch leaves None at that position)
pubmed_abstracts = [fetch_abstract(pubmed_id) for pubmed_id in pubmed_ids]

In [None]:
# Filter out None values (failed to fetch abstracts). The matching PubMed ID
# is dropped too, so both lists keep the same length and stay aligned when
# they are combined into a DataFrame.
# Note: pubmed_ids is rebound here; reload it before fetching the abstracts again.
fetched_pairs = [
    (pubmed_id, abstract)
    for pubmed_id, abstract in zip(pubmed_ids, pubmed_abstracts)
    if abstract is not None
]
pubmed_ids = [pubmed_id for pubmed_id, _ in fetched_pairs]
pubmed_abstracts = [abstract for _, abstract in fetched_pairs]

In [None]:
# Create a DataFrame with PubMed IDs and Abstracts
# NOTE(review): both lists must have the same length and order here;
# pandas raises ValueError when the column lengths differ.
abstract_df = pd.DataFrame({'PubMedID': pubmed_ids, 'Abstract': pubmed_abstracts})

In [None]:
# NOTE(review): builds the same DataFrame as abstract_df and binds it to both
# `medk` and `line_df`; neither name is used in any other visible cell.
medk=line_df = pd.DataFrame({'PubMedID': pubmed_ids, 'Abstract': pubmed_abstracts})

In [None]:
#to create a csv file
#filename=open(".csv", "w")

In [None]:
# Save the DataFrame to a CSV file named 'abstract.csv'
# Written to the BIO folder because that is where the later cells read
# abstract.csv from (previously it was written to a different folder, BIOBERT).
abstract_df.to_csv('/content/drive/MyDrive/BIO/abstract.csv', index=False)

# Step 2: Target Document Triage

In [None]:
# Calculate abstract lengths (len() of each abstract, i.e. its character count)
abstract_df['AbstractLength'] = abstract_df['Abstract'].apply(len)

In [None]:
# Sort the DataFrame by abstract length in descending order (longest abstracts first);
# sort_values returns a new frame, abstract_df itself keeps its order
triaged_abstract_df = abstract_df.sort_values(by='AbstractLength', ascending=False)

In [None]:
# Save the triaged DataFrame to a CSV file named 'triaged_abstract.csv' (row index not written)
triaged_abstract_df.to_csv('/content/drive/MyDrive/BIO/triaged_abstract.csv', index=False)

In [None]:
#to display the data in csv file
'''with open("triaged_abstract.csv","r") as a:
    csvreader = csv.reader(a)
    for i in a:
        print(i)
'''

'with open("triaged_abstract.csv","r") as a:\n    csvreader = csv.reader(a)\n    for i in a:\n        print(i)\n'


# Step 3: Extract gene from the abstract

In [None]:
# Replace with the actual file paths
# gene_file_path: CSV with a 'Gene Symbol' column; abstract_file_path: CSV with 'PubMedID' and 'Abstract' columns
gene_file_path = '/content/drive/MyDrive/BIO/GENE.csv'
abstract_file_path = '/content/drive/MyDrive/BIO/abstract.csv'

In [None]:
# Read gene names from the gene file ('Gene Symbol' column, as a plain list)
gene_df = pd.read_csv(gene_file_path)
genes = gene_df['Gene Symbol'].tolist()

In [None]:
# Read abstracts from the abstract file
# Note: this rebinds abstract_df and pubmed_ids, replacing the in-memory versions
# built during data collection with the contents of the saved CSV.
abstract_df = pd.read_csv(abstract_file_path)
pubmed_ids = abstract_df['PubMedID'].tolist()
abstracts = abstract_df['Abstract'].tolist()

# Step 4: Using the BERT tokenizer to find gene mentions in the abstracts

In [None]:
# Load pre-trained BERT model and tokenizer
# ('bert-base-uncased' is the general-purpose uncased checkpoint, so all text is lower-cased)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenize and encode the abstracts
# All abstracts are encoded as one batch of PyTorch tensors; truncation=True cuts
# over-long abstracts and padding=True pads the shorter ones to a common length.
tokenized_abstracts = tokenizer(abstracts, return_tensors='pt', truncation=True, padding=True)

In [None]:
# Forward pass through the model
# Gradients are disabled because the model is only used for inference.
# NOTE(review): the whole corpus goes through in a single batch, which may not fit in memory for many abstracts.
with torch.no_grad():
    outputs = model(**tokenized_abstracts)

In [None]:
# Extract the embeddings for further analysis
# NOTE(review): `embeddings` is not referenced by any later cell visible in this notebook.
embeddings = outputs.last_hidden_state

In [None]:
# Assuming you want to find mentions of each gene in the abstracts
# Accumulator of {'Gene': ..., 'PubMedID': ...} records filled by the matching loop
gene_mentions = []

In [None]:
# Start from an empty list so re-running this cell does not append duplicates.
gene_mentions = []

# Tokenize every abstract once, instead of once per (gene, abstract) pair.
abstract_token_lists = [tokenizer.tokenize(abstract) for abstract in abstracts]


def _contains_token_sequence(tokens, pattern):
    """Return True when `pattern` occurs in `tokens` as a run of whole words.

    A match that is immediately followed by a '##' continuation piece is
    rejected, because it is only the prefix of a longer word.
    """
    width = len(pattern)
    for start in range(len(tokens) - width + 1):
        if tokens[start:start + width] != pattern:
            continue
        end = start + width
        if end == len(tokens) or not tokens[end].startswith('##'):
            return True
    return False


for gene in genes:
    # Convert gene to string and split it into WordPiece tokens
    gene_tokens = tokenizer.tokenize(str(gene))
    if not gene_tokens:
        # An empty symbol would otherwise "match" every abstract.
        continue
    gene_str = " ".join(gene_tokens)

    for pubmed_id, abstract_tokens in zip(pubmed_ids, abstract_token_lists):
        # Compare token by token rather than by raw substring, so a short
        # symbol is not found inside unrelated longer words.
        if _contains_token_sequence(abstract_tokens, gene_tokens):
            gene_mentions.append({'Gene': gene_str, 'PubMedID': pubmed_id})

In [None]:
# Create a DataFrame with gene mentions (one row per gene/abstract match; columns 'Gene' and 'PubMedID')
gene_mentions_df = pd.DataFrame(gene_mentions)

In [None]:
# Save the DataFrame to a CSV file named 'gene_mentions.csv'
# (relative path: the file is written to the current working directory, not to Drive)
gene_mentions_df.to_csv('gene_mentions.csv', index=False)

# Step 5: Cleaning the Gene Tokens

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('gene_mentions.csv')  # Replace with your file path

# Removing '##' and spaces from the 'Gene' column and converting to uppercase.
# regex=False makes both replacements literal, independent of the pandas version's default.
df['Gene'] = (
    df['Gene']
    .str.replace('##', '', regex=False)
    .str.replace(' ', '', regex=False)
    .str.upper()
)

# Grouping by 'Gene' and aggregating 'PubMedID' into a list
grouped_df = df.groupby('Gene')['PubMedID'].apply(list).reset_index()

# Print the grouped DataFrame
print(grouped_df)


## **Lexicon Extraction**

# Step 1: Parse TTD Target Information

In [None]:
# Load SpaCy model (small English pipeline; `nlp` is used by the entity-extraction functions below)
nlp = spacy.load("en_core_web_sm")

In [None]:
# Download the WordNet corpus for NLTK (no-op when it is already up to date)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Download the Punkt tokenizer data for NLTK (no-op when it is already up to date)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# List of cancer and melanoma-related terms.
# Every term appears exactly once: a repeated entry would be appended twice
# for each abstract that contains it.
cancer_terms = [
    "cancer", "melanoma", "oncology", "tumor", "carcinoma", "neoplasm", "malignancy",
    "chemotherapy", "radiation therapy", "immunotherapy", "oncologist", "metastasis",
    "biopsy", "lymphoma", "leukemia", "sarcoma", "malignant", "benign",
    "radiology", "chemo", "radiotherapy", "hormone therapy", "stem cell transplant",
    "precision medicine", "clinical trial", "cancer research", "BRCA1", "BRCA2",
    "mammogram", "prostate cancer", "breast cancer", "lung cancer", "colorectal cancer",
    "skin cancer", "pancreatic cancer", "ovarian cancer", "thyroid cancer",
    "lymph node", "metastatic", "pathology", "oncogenic", "adenocarcinoma",
    "squamous cell carcinoma", "tumor marker", "palliative care",
    "radiosurgery", "cytotoxic", "angiogenesis", "apoptosis", "gene therapy",
    "cancer prevention", "cancer symptoms", "tumor suppressor genes", "oncogenes",
    "metastasize", "carcinogenesis", "cancer vaccine", "targeted therapy"
]


In [None]:
def extract_cancer_entities(file_path_abstract, text_column='text'):
    """Collect cancer-related mentions from a CSV column of texts.

    Parameters
    ----------
    file_path_abstract : str
        Path of the CSV file to read.
    text_column : str, default 'text'
        Name of the column that holds the texts.

    Returns
    -------
    list of str
        For each text, in order: the spaCy entities whose text contains any
        entry of ``cancer_terms``, followed by every ``cancer_terms`` entry
        found in the text. Matching ignores case. The same value can occur
        many times across texts.
    """
    # Read the CSV file
    data = pd.read_csv(file_path_abstract)

    # Lower-case the lexicon once, so mixed-case terms such as "BRCA1" can
    # match the lower-cased text they are compared with.
    lowered_terms = [term.lower() for term in cancer_terms]

    # Initialize a list to store cancer-related entities
    cancer_entities = []

    # Iterate over each row in the DataFrame
    for text in data[text_column]:
        if not isinstance(text, str):
            # Empty cells are read back as NaN (a float); nothing to extract.
            continue

        # Process the text through the NLP model
        doc = nlp(text)

        # Extract entities using NER and add to the list
        for ent in doc.ents:
            entity_lower = ent.text.lower()
            if any(term in entity_lower for term in lowered_terms):
                cancer_entities.append(ent.text)

        # Additionally, check for predefined cancer terms in the text
        text_lower = text.lower()
        for term, lowered in zip(cancer_terms, lowered_terms):
            if lowered in text_lower:
                cancer_entities.append(term)

    return cancer_entities

In [None]:
# Example usage
# NOTE(review): printing the full list produces a very long output; consider showing only a slice.
file_path_abstract = '/content/drive/MyDrive/BIO/abstract.csv'
cancer_entities = extract_cancer_entities(file_path_abstract, text_column='Abstract')
print(cancer_entities)

['cancer', 'melanoma', 'tumor', 'leukemia', 'malignant', 'leukemia', 'cancer', 'melanoma', 'tumor', 'sarcoma', 'melanoma', 'tumor', 'melanoma', 'tumor', 'radiation therapy', 'melanomas', 'cancer', 'melanoma', 'tumor', 'metastasis', 'malignant', 'skin cancer', 'metastasize', 'melanoma', 'cancer', 'melanoma', 'carcinoma', 'metastatic', 'adenocarcinoma', 'cancer', 'melanoma', 'tumor', 'metastatic', 'melanoma', 'neoplasm', 'melanomas', 'cancer', 'melanoma', 'tumor', 'malignancy', 'metastatic', 'cancer', 'melanoma', 'metastasis', 'metastatic', 'cancer', 'melanoma', 'tumor', 'metastasis', 'metastatic', 'oncogenic', 'angiogenesis', 'oncogenes', 'cancer', 'melanoma', 'tumor', 'oncogenic', 'melanoma', 'tumor', 'metastasis', 'metastatic', 'melanomas', 'melanoma', 'melanoma', 'malignant', 'cancer', 'melanoma', 'oncology', 'tumor', 'metastasis', 'malignant', 'breast cancer', 'metastatic', 'melanoma', 'cancer', 'melanoma', 'tumor', 'metastasis', 'breast cancer', 'Melanoma', 'cancer', 'melanoma', 'c

## **Triggered Word Extraction**

In [None]:
# Single-word trigger terms, kept in their original order.
# Written as one whitespace-separated string and split into a list.
trigger_words = """
    gene mutation genotype polymorphism allele
    tumor metastasis lesion necrosis invasion progression
    chemotherapy immunotherapy radiation inhibitor vaccine
    pain swelling inflammation itching discoloration
    biopsy MRI ultrasound screening endoscopy
    survival remission recurrence mortality risk
    cytokine receptor enzyme antibody signaling
    immune vascular tissue cellular organ
    prevalence incidence epidemic outbreak cohort
    resistant responsive effective adverse toxicity
""".split()

In [None]:
def extract_cancer_entities_and_triggers(file_path_abstract, text_column='text'):
    """Collect spaCy entities and trigger-word hits from a CSV column of texts.

    Parameters
    ----------
    file_path_abstract : str
        Path of the CSV file to read.
    text_column : str, default 'text'
        Name of the column that holds the texts.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(cancer_entities, found_triggers)``. The first list holds every
        entity spaCy finds (no cancer filter is applied); the second holds
        every ``trigger_words`` entry found in a text, ignoring case. Both
        lists are cut to the length of the shorter one, so entries at the
        same index do not necessarily come from the same text.
    """
    data = pd.read_csv(file_path_abstract)
    cancer_entities = []
    found_triggers = []

    # Lower-case the triggers once, so upper-case entries such as "MRI" can
    # match the lower-cased text; the original spelling is what gets reported.
    trigger_pairs = [(trigger, trigger.lower()) for trigger in trigger_words]

    for text in data[text_column]:
        if not isinstance(text, str):
            # Empty cells are read back as NaN (a float); nothing to extract.
            continue

        doc = nlp(text)
        cancer_entities.extend(ent.text for ent in doc.ents)

        text_lower = text.lower()
        found_triggers.extend(
            trigger for trigger, lowered in trigger_pairs if lowered in text_lower
        )

    # Equal lengths are needed to place both lists side by side in one DataFrame.
    min_len = min(len(cancer_entities), len(found_triggers))
    return cancer_entities[:min_len], found_triggers[:min_len]

In [None]:
# Run the extraction on the saved abstracts and tabulate both result lists side by side.
# NOTE(review): rows pair an entity and a trigger by list position only (both lists were
# truncated to a common length), not because they occurred in the same abstract.
cancer_entities, triggers = extract_cancer_entities_and_triggers(file_path_abstract, text_column='Abstract')
results_df = pd.DataFrame({'Cancer Entities': cancer_entities, 'Triggers': triggers})
print(results_df)

In [None]:
# Save the entity/trigger table to Drive (row index not written)
output_file_path = '/content/drive/MyDrive/BIO/Triggered_words.csv'
results_df.to_csv(output_file_path, index=False)

##  **Feature Extraction**

In [None]:
# Read your files
# read_text_file is not defined in any earlier cell of this notebook, so it is
# defined here to keep the cell runnable on a fresh kernel.
def read_text_file(file_path, encoding='utf-8'):
    """Return the whole content of the text file at `file_path` as one string."""
    with open(file_path, 'r', encoding=encoding) as handle:
        return handle.read()


diseases_content = read_text_file('/content/drive/MyDrive/BIO/P1-01-TTD_target_download.txt')
drugs_content = read_text_file('/content/drive/MyDrive/BIO/P1-05-Drug_disease.txt')
# NOTE(review): this reads the same file as drugs_content - confirm which file was intended.
relations_content = read_text_file('/content/drive/MyDrive/BIO/P1-05-Drug_disease.txt')


In [None]:
# Keywords used to select cancer-related lines from the TTD text files.
# Every keyword appears exactly once (the earlier repeated entries were redundant).
cancer_keywords = [
    "cancer", "melanoma", "oncology", "tumor", "carcinoma", "neoplasm", "malignancy",
    "chemotherapy", "radiation therapy", "immunotherapy", "oncologist", "metastasis",
    "biopsy", "lymphoma", "leukemia", "sarcoma", "malignant", "benign",
    "radiology", "chemo", "radiotherapy", "hormone therapy", "stem cell transplant",
    "precision medicine", "clinical trial", "cancer research", "BRCA1", "BRCA2",
    "mammogram", "prostate cancer", "breast cancer", "lung cancer", "colorectal cancer",
    "skin cancer", "pancreatic cancer", "ovarian cancer", "thyroid cancer",
    "lymph node", "metastatic", "pathology", "oncogenic", "adenocarcinoma",
    "squamous cell carcinoma", "tumor marker", "palliative care",
    "radiosurgery", "cytotoxic", "angiogenesis", "apoptosis", "gene therapy",
    "cancer prevention", "cancer symptoms", "tumor suppressor genes", "oncogenes",
    "metastasize", "carcinogenesis", "cancer vaccine", "targeted therapy"
]


In [None]:
def read_and_extract_info(file_path, keywords):
    """Return the stripped lines of a text file that mention any keyword.

    Parameters
    ----------
    file_path : str
        Path of the text file to scan.
    keywords : iterable of str
        Keywords to look for; matching is a case-insensitive substring test.

    Returns
    -------
    list of str
        Matching lines in file order, with surrounding whitespace removed.
        Errors are printed rather than raised; whatever was collected before
        the error (an empty list for a missing file) is returned.
    """
    extracted_info = []
    try:
        # Lower-case the keywords once instead of once per line.
        lowered_keywords = [keyword.lower() for keyword in keywords]
        with open(file_path, 'r') as file:
            for line in file:
                lowered_line = line.lower()
                if any(keyword in lowered_line for keyword in lowered_keywords):
                    extracted_info.append(line.strip())
    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return extracted_info

# Usage
file_path = '/content/drive/MyDrive/BIO/P1-01-TTD_target_download.txt'  # Replace with your file path
keywords = cancer_keywords
information = read_and_extract_info(file_path, keywords)

# Group the extracted lines by their first tab-separated field; the remaining
# fields are stored as tuples in a set, which removes duplicate lines.
info_dict = {}
for info in information:
    elements = info.split('\t')
    info_dict.setdefault(elements[0], set()).add(tuple(elements[1:]))

# Provide the file path for the CSV output
output_file = '/content/drive/MyDrive/BIO/target.csv'

# Writing the extracted information to a CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['TargetID', 'Path', 'Type'])  # Write header row

    for key, values in info_dict.items():
        # Sets have no stable iteration order; sorting keeps the output file
        # identical from one run to the next.
        # NOTE(review): a row has as many fields as its source line, which
        # may differ from the three header columns.
        for value in sorted(values):
            csv_writer.writerow([key, *value])

print(f"Data has been successfully saved to {output_file}")

In [None]:
def read_and_extract_info(file_path, keywords):
    """Scan a text file and keep the lines that mention at least one keyword.

    The comparison is a case-insensitive substring test. Matching lines are
    returned in file order with leading and trailing whitespace removed.
    Errors are reported with a printed message and the lines gathered so far
    (none for a missing file) are returned.
    """
    matches = []
    try:
        with open(file_path, 'r') as handle:
            for raw_line in handle:
                for keyword in keywords:
                    if keyword.lower() in raw_line.lower():
                        matches.append(raw_line.strip())
                        break  # one hit is enough for this line
    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return matches

# Usage
file_path ='/content/drive/MyDrive/BIO/P1-02-TTD_drug_download.txt'  # Replace with your file path
keywords = cancer_keywords
information = read_and_extract_info(file_path, keywords)

# Group the extracted lines by their first tab-separated field; the remaining
# fields are stored as tuples in a set, which removes duplicate lines.
info_dict = {}
for info in information:
    elements = info.split('\t')
    info_dict.setdefault(elements[0], set()).add(tuple(elements[1:]))

# Remove specific value ('DRUGCLAS', 'Clinical Trial Drug(s)')
unwanted_value = ('DRUGCLAS', 'Clinical Trial Drug(s)')
for values in info_dict.values():
    values.discard(unwanted_value)  # no error when the value is absent

# Keep only the keys that still have at least one value
filtered_info_dict = {key: values for key, values in info_dict.items() if values}

# Provide the file path for the CSV output
output_file = '/content/drive/MyDrive/BIO/drug.csv'

# Writing the extracted information to a CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['DrugID', 'Comp/Type','Usedby'])  # Write header row

    for key, values in filtered_info_dict.items():
        # Sets have no stable iteration order; sorting keeps the output file
        # identical from one run to the next.
        for value in sorted(values):
            csv_writer.writerow([key, *value])

print(f"Data has been successfully saved to {output_file}")


Combined information saved to /content/drive/MyDrive/BIO/combined_info.csv


## **Relation Extraction**

In [None]:
import pandas as pd

def read_and_extract_info(file_path, keywords):
    """Collect the values of the TTDDRUID and DRUGNAME fields of a text file.

    Parameters
    ----------
    file_path : str
        Path of a tab-separated text file whose lines start with a field name.
    keywords : iterable of str
        A line is only considered when it contains one of these keywords
        (case-insensitive substring test).

    Returns
    -------
    dict
        Maps 'TTDDRUID' and/or 'DRUGNAME' to the list of second-column values
        found for that field, in file order. Lines without a second column
        are skipped. Errors are printed rather than raised and the data
        collected so far is returned.
    """
    wanted_fields = ('TTDDRUID', 'DRUGNAME')
    extracted_info = {}
    try:
        # Lower-case the keywords once instead of once per line.
        lowered_keywords = [keyword.lower() for keyword in keywords]
        with open(file_path, 'r') as file:
            for line in file:
                lowered_line = line.lower()
                if not any(keyword in lowered_line for keyword in lowered_keywords):
                    continue
                elements = line.strip().split('\t')
                # A field name without a value used to raise IndexError,
                # which ended the scan of the remaining lines early.
                if len(elements) < 2 or elements[0] not in wanted_fields:
                    continue
                extracted_info.setdefault(elements[0], []).append(elements[1])
    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return extracted_info

# Extract the TTDDRUID / DRUGNAME values from the drug-disease file on Drive
file_path = '/content/drive/MyDrive/BIO/P1-05-Drug_disease.txt'  # Input file read by read_and_extract_info
keywords = ['TTDDRUID', 'DRUGNAME']  # Define the keywords of interest
information = read_and_extract_info(file_path, keywords)

# Convert dictionary to DataFrame
# NOTE(review): pandas raises ValueError here if the two value lists have different lengths.
df = pd.DataFrame(information)

# Save DataFrame to CSV file
csv_file_path = '/content/drive/MyDrive/BIO/ID_DRUG.csv'  # Path to save CSV file
df.to_csv(csv_file_path, index=False)

print(f"CSV file saved to: {csv_file_path}")

In [None]:
import pandas as pd

# File paths
drug_path = '/content/drive/MyDrive/BIO/drug.csv'
target_path = '/content/drive/MyDrive/BIO/target.csv'
mapping_path = '/content/drive/MyDrive/BIO/P1-07-Drug-TargetMapping.xlsx'

# Load data into dataframes
drug_data = pd.read_csv(drug_path)
# Rows whose field count does not fit the header are skipped.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was removed in pandas 2.0.
target_data = pd.read_csv(target_path, on_bad_lines='skip')
mapping_data = pd.read_excel(mapping_path)

# Extract 'TargetID' and 'DrugID' from the mapping data as a DrugID -> TargetID dictionary.
# NOTE(review): a dictionary holds one value per key, so a drug that is listed
# with several targets keeps only the last one - confirm this is intended.
target_drug_mapping = dict(zip(mapping_data['DrugID'], mapping_data['TargetID']))

# Renaming keys in the dictionary
renamed_target_drug_mapping = {
    'DRUGID': list(target_drug_mapping.keys()),
    'TARGETID': list(target_drug_mapping.values())
}

# Creating a DataFrame from the renamed_target_drug_mapping dictionary
df = pd.DataFrame(renamed_target_drug_mapping)
output_csv_path = '/content/drive/MyDrive/BIO/TARGET_DRUG.csv'

# Save the DataFrame to a CSV file
df.to_csv(output_csv_path, index=False)

print(f"Data has been saved to {output_csv_path}")

## **Repurposed Drug Prioritization**



In [None]:
import pandas as pd

# Inputs: the DRUGID/TARGETID table and the TTDDRUID/DRUGNAME table saved earlier
csv_file_path_1 = '/content/drive/MyDrive/BIO/TARGET_DRUG.csv'
csv_file_path_2 = '/content/drive/MyDrive/BIO/ID_DRUG.csv'

df1 = pd.read_csv(csv_file_path_1)
df2 = pd.read_csv(csv_file_path_2)

# Inner join: keep only the drugs that appear in both tables
merged_df = df1.merge(df2, how='inner', left_on='DRUGID', right_on='TTDDRUID')

# Keep the three columns needed downstream
selected_columns = ['DRUGID', 'TARGETID', 'DRUGNAME']
merged_selected_columns = merged_df[selected_columns]

# Preview the first rows of the result
print(merged_selected_columns.head())

# Write the result next to the other intermediate files
merged_csv_file_path = '/content/drive/MyDrive/BIO/DRUG_ID_TARGET.csv'  # Path to save the selected columns CSV file
merged_selected_columns.to_csv(merged_csv_file_path, index=False)

print(f"Selected columns saved to: {merged_csv_file_path}")

In [None]:
import pandas as pd

def read_and_extract_info(file_path):
    """Pair each TARGETID of a tab-separated file with the first GENENAME after it.

    Only lines with more than two tab-separated fields are considered; the
    second field is the tag and the third is the value. A 'TARGETID' line
    sets the pending target; the next 'GENENAME' line is recorded with that
    target and clears it, so further GENENAME lines are ignored until a new
    TARGETID is seen.

    Returns a tuple ``(target_ids, gene_names)`` of equally long lists.
    Errors are printed rather than raised and the pairs collected so far are
    returned.
    """
    target_ids, gene_names = [], []
    pending_target = None

    try:
        with open(file_path, 'r') as handle:
            for raw_line in handle:
                fields = raw_line.strip().split('\t')
                if len(fields) <= 2:
                    continue
                tag, payload = fields[1], fields[2]
                if tag == 'TARGETID':
                    pending_target = payload
                elif tag == 'GENENAME' and pending_target:
                    target_ids.append(pending_target)
                    gene_names.append(payload)
                    pending_target = None  # wait for the next TARGETID

    except FileNotFoundError:
        print("File not found. Please check the file path.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return target_ids, gene_names

# Replace with the actual file path
file_path = '/content/drive/MyDrive/BIO/P1-01-TTD_target_download.txt'
target_ids, gene_names = read_and_extract_info(file_path)

# Convert lists to DataFrame
df = pd.DataFrame({'TARGETID': target_ids, 'GENENAME': gene_names})

# Define the CSV file path (change this to your desired file path and name)
csv_file_path = '/content/drive/MyDrive/BIO/ID_GENE.csv'

# Save to CSV, replacing any previous version of the file.
# The earlier append mode read the output file first (which fails when it does
# not exist yet), wrote the header only when the file already had content, and
# added the same rows again on every run.
df.to_csv(csv_file_path, index=False)

print(f"Data added to {csv_file_path}")


## **Drug Vector Space Model**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load drug-target association data from CSV.
# The file is read from the BIO folder, which is where the merge step saves it.
csv_file_path = '/content/drive/MyDrive/BIO/DRUG_ID_TARGET.csv'
drug_target_df = pd.read_csv(csv_file_path)

# Assuming 'DRUGNAME' column contains the drug names or identifiers
corpus = drug_target_df['DRUGNAME'].tolist()

# Vectorize the corpus using TF-IDF (Term Frequency-Inverse Document Frequency)
vectorizer = TfidfVectorizer()
drug_vectors = vectorizer.fit_transform(corpus)

# Calculate cosine similarity between drug vectors
similarity_matrix = cosine_similarity(drug_vectors, drug_vectors)

# Plot the similarity matrix as a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(similarity_matrix, cmap='viridis', annot=False, ax=ax)
ax.set_title('Drug Vector Similarity Matrix')
ax.set_xlabel('Drug Index')
ax.set_ylabel('Drug Index')
plt.show()

## **Drug Target Similarity Ranking**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr
from nltk.metrics import jaccard_distance, masi_distance, edit_distance

# Load drug-target association data from CSV
csv_file_path = '/content/drive/MyDrive/BIO/DRUG_ID_TARGET.csv'
drug_target_df = pd.read_csv(csv_file_path)

# Assuming 'DRUGNAME' column contains the drug names or identifiers as strings
corpus = drug_target_df['DRUGNAME'].tolist()

# Vectorize the corpus using TF-IDF (Term Frequency-Inverse Document Frequency)
vectorizer = TfidfVectorizer(binary=True)
drug_vectors = vectorizer.fit_transform(corpus)

# Jaccard and MASI compare the sets of characters of two drug names; build
# each set once instead of once per pair.
n_drugs = len(corpus)
char_sets = [set(name) for name in corpus]

jaccard_similarity_matrix = np.zeros((n_drugs, n_drugs))
masi_similarity_matrix = np.zeros((n_drugs, n_drugs))
edit_similarity_matrix = np.zeros((n_drugs, n_drugs))

# All three measures are symmetric, so each pair is computed once (i <= j)
# and mirrored into the lower triangle.
for i in range(n_drugs):
    for j in range(i, n_drugs):
        # Jaccard index for drug similarity
        jaccard_value = 1 - jaccard_distance(char_sets[i], char_sets[j])
        # Masi distance (a modification of Jaccard for partial agreement)
        masi_value = 1 - masi_distance(char_sets[i], char_sets[j])
        # Edit distance mapped into (0, 1]: identical names give 1
        edit_value = 1 / (1 + edit_distance(corpus[i], corpus[j]))

        jaccard_similarity_matrix[i, j] = jaccard_similarity_matrix[j, i] = jaccard_value
        masi_similarity_matrix[i, j] = masi_similarity_matrix[j, i] = masi_value
        edit_similarity_matrix[i, j] = edit_similarity_matrix[j, i] = edit_value

# Calculate Cosine similarity for drug similarity
cosine_similarity_matrix = cosine_similarity(drug_vectors)

# Calculate Pearson correlation between similarity matrices
# (row 0 of the correlation matrix: Jaccard against Masi, Edit and Cosine)
pearson_corr = np.corrcoef([jaccard_similarity_matrix.flatten(), masi_similarity_matrix.flatten(),
                            edit_similarity_matrix.flatten(), cosine_similarity_matrix.flatten()])[0, 1:]

# Calculate Spearman correlation between similarity matrices
spearman_corr = spearmanr(jaccard_similarity_matrix.flatten(), cosine_similarity_matrix.flatten())[0]

print("Jaccard Similarity Matrix:")
print(jaccard_similarity_matrix)
print("\nMasi Similarity Matrix:")
print(masi_similarity_matrix)
print("\nEdit Similarity Matrix:")
print(edit_similarity_matrix)
print("\nCosine Similarity Matrix:")
print(cosine_similarity_matrix)

print("\nPearson Correlation between Similarity Matrices:")
print(pearson_corr)
print("\nSpearman Correlation between Jaccard and Cosine Similarity Matrices:")
print(spearman_corr)

## **Evaluation Methods**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load combined TARGETID-DRUGID and validation set data from CSV file.
# The file is read from the BIO folder, which is where the merge step saves it.
combined_data_file = '/content/drive/MyDrive/BIO/DRUG_ID_TARGET.csv'

# Read combined data from CSV file
combined_data = pd.read_csv(combined_data_file)

# Group the data by 'TARGETID' and aggregate DRUGID into a list for each TARGETID
grouped_data = combined_data.groupby('TARGETID')['DRUGID'].agg(list).reset_index()

# Create a dictionary mapping TARGETIDs to corresponding DRUGID
TARGETID_DRUGID_mapping = dict(zip(grouped_data['TARGETID'], grouped_data['DRUGID']))

# Create a dictionary mapping TARGETIDs to relevant DRUGID from the validation set.
# NOTE(review): the validation set comes from the same file as the
# recommendations, and for a TARGETID that occurs on several rows only the
# last row's DRUGID is kept - confirm this is the intended ground truth.
validation_set = dict(zip(combined_data['TARGETID'], combined_data['DRUGID'].str.split(',')))

# Function to calculate precision at k
def precision_at_k(relevant_items, recommended_items, k):
    """Return the precision of the first `k` recommendations and the hit count.

    Parameters
    ----------
    relevant_items : sequence
        Items considered correct.
    recommended_items : sequence
        Recommended items, best first.
    k : int
        Number of leading recommendations to evaluate.

    Returns
    -------
    tuple of (float, int)
        ``(precision, matches)``, where ``matches`` is the number of distinct
        relevant items among the first `k` recommendations and ``precision``
        is ``matches / min(k, len(relevant_items))``. ``(0.0, 0)`` is
        returned when that denominator is not positive (k <= 0 or no
        relevant items) instead of dividing by zero.
    """
    denominator = min(k, len(relevant_items))
    if denominator <= 0:
        return 0.0, 0
    intersection = set(recommended_items[:k]) & set(relevant_items)
    return len(intersection) / denominator, len(intersection)  # Return count of matches as well

# Function to calculate average precision
def average_precision(relevant_items, recommended_items):
    """Return the average precision of a recommendation list and a match total.

    precision_at_k is evaluated at every rank whose recommended item is
    relevant. The precisions are summed and divided by the number of relevant
    items; the match counts returned for those ranks are summed as well.
    ``(0.0, 0)`` is returned when no recommended item is relevant.
    """
    hits = []
    for position, item in enumerate(recommended_items):
        if item in relevant_items:
            hits.append(precision_at_k(relevant_items, recommended_items, position + 1))

    if len(hits) == 0:
        return 0.0, 0  # nothing relevant was recommended

    precision_sum = 0
    total_matches = 0
    for precision, matches in hits:
        precision_sum += precision
        total_matches += matches

    avg_precision = precision_sum / len(relevant_items)
    return avg_precision, total_matches

# Calculate Mean Average Precision (MAP) and count of matches for different percentages of potential repurposed DRUGID
percentages = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]  # Fractions of the longest drug list to keep
TARGETID_count = 10000  # Upper bound on the number of TARGETIDs that are evaluated

# Lists to store MAP scores and total match counts for each percentage
map_scores = []
match_counts = []

# Both values are the same for every percentage, so compute them once.
# default=0 keeps an empty mapping from raising ValueError in max().
longest_drug_list = max((len(DRUGID) for DRUGID in TARGETID_DRUGID_mapping.values()), default=0)
evaluated_targets = list(TARGETID_DRUGID_mapping.items())[:TARGETID_count]

for percentage in percentages:
    top_percent = int(longest_drug_list * percentage)
    map_scores_for_percentage = []
    match_counts_for_percentage = []
    for TARGETID, DRUGID in evaluated_targets:
        recommended_items = DRUGID[:top_percent]
        relevant_items = validation_set.get(TARGETID, [])
        avg_precision, total_matches = average_precision(relevant_items, recommended_items)
        map_scores_for_percentage.append(avg_precision)
        match_counts_for_percentage.append(total_matches)
    # Guard against dividing by zero when there are no targets to evaluate
    if map_scores_for_percentage:
        mean_map_score = sum(map_scores_for_percentage) / len(map_scores_for_percentage)
    else:
        mean_map_score = 0.0
    total_match_count = sum(match_counts_for_percentage)
    map_scores.append(mean_map_score)
    match_counts.append(total_match_count)
    # ':.0%' prints e.g. "30%"; multiplying by 100 printed values such as 30.000000000000004
    print(f"MAP for top {percentage:.0%}: {mean_map_score}, Total Matches: {total_match_count}")

# Make sure both series are numpy arrays before plotting
percentages = np.array(percentages)
map_scores = np.array(map_scores)

# Line chart of the MAP score obtained at each cut-off
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(percentages, map_scores, marker='o')
ax.set_title('MAP Scores for Different Percentages of Repurposed Drugs')
ax.set_xlabel('Percentage (%)')
ax.set_ylabel('MAP Score')
ax.grid(True)
plt.show()

## **Unmatched Gene**

In [None]:
# Gene symbols found in the PubMed abstracts vs. gene names listed in TTD
# NOTE(review): GENE_GROUPED.csv is not written by any visible cell of this notebook.
df_pubmed = pd.read_csv("/content/drive/MyDrive/BIO/GENE_GROUPED.csv")
df_ttd = pd.read_csv("/content/drive/MyDrive/BIO/ID_GENE.csv")

# The PubMed table stores names in 'Gene', the TTD table in 'GENENAME'
pubmed_genes = set(df_pubmed['Gene'].tolist())
ttd_genes = set(df_ttd['GENENAME'].tolist())

# Set differences in both directions
unmatched_genes_pubmed = pubmed_genes.difference(ttd_genes)
unmatched_genes_ttd = ttd_genes.difference(pubmed_genes)

# Report only the genes seen in PubMed but missing from TTD
print("Unmatched genes:")
print(unmatched_genes_pubmed)
print(len(unmatched_genes_pubmed))

## **Visualization**

In [None]:
import seaborn as sns
# Placeholder similarity: random scores stand in for a real drug-disease measure
def calculate_similarity(drugs, diseases):
    """Return a len(drugs) x len(diseases) array of random scores.

    The values are drawn with np.random.rand, so they change on every call;
    the function only simulates a similarity calculation.
    """
    n_rows = len(drugs)
    n_cols = len(diseases)
    return np.random.rand(n_rows, n_cols)  # Replace this with your actual similarity calculation logic

# Axes of the matrix: one row per distinct DRUGID, one column per distinct TARGETID
drugs = combined_data['DRUGID'].unique()  # Get unique drugs
diseases = combined_data['TARGETID'].unique()  # Get unique diseases

# Fill the matrix (calculate_similarity currently returns random placeholder scores)
similarity_matrix = calculate_similarity(drugs, diseases)

# Example output
print(similarity_matrix)

# Visualization 2: Heatmap for Drug-Disease Relationship based on RDS values
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(similarity_matrix, cmap='coolwarm', annot=True, fmt=".2f", ax=ax)
ax.set_title('Drug Repurposing: Drug-Disease Relationship (RDS values)')
ax.set_xlabel('Disease Index')
ax.set_ylabel('Drug Index')
plt.show()

# Additional analysis and visualization steps can be added as needed