# **Gene to Chemical**

In [1]:
import pandas as pd
import tensorflow_hub as hub
import re
import numpy as np

# File paths
file1_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Genes_pubmed.csv'
file2_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Chemicals_pubmed.csv'

# Load DataFrames and rename columns so the two sources stay distinguishable after the merge
df1 = pd.read_csv(file1_path).rename(
    columns={'Sentences': 'Sentences_x', 'PMCID': 'PMCID_x', 'Genes': 'Gene'})
df2 = pd.read_csv(file2_path).rename(
    columns={'Sentences': 'Sentences_y', 'PMCID': 'PMCID_y', 'Chemicals': 'Chemical'})

# Merge DataFrames based on common column (PMID)
merged_df = pd.merge(df1, df2, on='PMID', how='inner')

# Define keywords for interaction type and regulation (with regex patterns)
interaction_keywords = ['Inhibition', 'Activation', 'Proliferation', 'Allosteric', 'Agonist', 'Antagonist']
# Word boundaries (\b) keep the case-insensitive search from firing on the
# "up" in "group"/"supplement" or the "down" in "downstream"/"breakdown".
# The optional suffix accepts "up-regulated", "up regulated", "upregulated",
# "upregulation", ... so unhyphenated spellings are still recognised.
regulation_keywords = [r'\bUp(?:[- ]?regulat\w*)?\b', r'\bDown(?:[- ]?regulat\w*)?\b']

# Initialize Interaction type and Regulation columns
merged_df['Interaction type'] = ''
merged_df['Regulation'] = ''

# Define regex patterns for regulation
regulation_pattern = '|'.join(regulation_keywords)

# Load Universal Sentence Encoder (USE)
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)

# Replace missing sentences with '' BEFORE casting to str: astype(str) turns
# NaN into the literal text 'nan', after which fillna has nothing left to fill.
merged_df['Sentences_x'] = merged_df['Sentences_x'].fillna('').astype(str)

# Encode sentences using Universal Sentence Encoder (one embedding row per merged row)
sentence_embeddings = embed(merged_df['Sentences_x'].tolist())

# Regulation lookup used on every sentence
def extract_regulation(sentence):
    """Return the first regulation term found in ``sentence``, or '' if there is none.

    The search uses the notebook-level ``regulation_pattern`` and ignores case;
    the matched text is returned exactly as it is written in the sentence.
    """
    found = re.search(regulation_pattern, sentence, flags=re.IGNORECASE)
    return found.group() if found else ''

# Tag each sentence with the regulation term it mentions ('' when none is found)
merged_df['Regulation'] = merged_df['Sentences_x'].map(extract_regulation)

# Function to find the closest matching keyword from a list
# Keyword embeddings keyed by the keyword tuple. The cache is rebuilt whenever
# this cell is re-run, which is also when `embed` is (re)loaded.
_KEYWORD_EMBEDDING_CACHE = {}


def find_closest_keyword(sentence_embedding, keywords, threshold=0.2, keyword_embeddings=None):
    """Return the keyword whose embedding has the largest dot product with the sentence.

    Args:
        sentence_embedding: 1-D array-like embedding of a single sentence.
        keywords: Sequence of candidate keyword strings.
        threshold: Minimum dot-product similarity required to accept a match.
        keyword_embeddings: Optional precomputed 2-D array with one row per
            keyword, in the same order as ``keywords``. When omitted, the
            keywords are encoded with the notebook-level ``embed`` model in a
            single batched call and cached, so the encoder runs once per
            keyword list instead of once per keyword for every sentence.

    Returns:
        The best-scoring keyword, or '' when ``keywords`` is empty or the best
        score is below ``threshold``. Ties go to the earliest keyword.
    """
    keywords = list(keywords)
    if not keywords:
        return ''

    if keyword_embeddings is None:
        cache_key = tuple(keywords)
        if cache_key not in _KEYWORD_EMBEDDING_CACHE:
            _KEYWORD_EMBEDDING_CACHE[cache_key] = embed(keywords).numpy()
        keyword_embeddings = _KEYWORD_EMBEDDING_CACHE[cache_key]

    # One matrix-vector product gives the similarity to every keyword at once.
    similarities = np.asarray(keyword_embeddings) @ np.asarray(sentence_embedding)
    best = int(np.argmax(similarities))
    if similarities[best] >= threshold:
        return keywords[best]
    return ''

# Label every merged row with its best-matching interaction keyword
# ('' when no keyword clears the similarity threshold)
merged_df['Interaction type'] = [
    find_closest_keyword(row_embedding.numpy(), interaction_keywords)
    for row_embedding in sentence_embeddings
]

# Merged column name -> column name used in the exported file
export_names = {
    'PMID': 'PMID',
    'PMCID_x': 'PMCID',
    'Sentences_x': 'Sentence',
    'Gene': 'Genes',
    'Chemical': 'ChemicalName',
    'Interaction type': 'Interaction type',
    'Regulation': 'Regulation',
}
final_columns = list(export_names)
final_df = merged_df[final_columns].rename(columns=export_names)

# Write the result without the row index
output_path = '/kaggle/working/Gene_to_Chemical.csv'
final_df.to_csv(output_path, index=False)

print("Merged data saved successfully in /kaggle/working/Gene_to_Chemical.csv!")


2024-07-31 13:34:11.494037: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 13:34:11.494206: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 13:34:11.676116: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Merged data saved successfully in /kaggle/working/Gene_to_Chemical.csv!


# Gene to Disease

In [6]:
import pandas as pd
import tensorflow_hub as hub
import re
import numpy as np

# File paths
file1_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Genes_pubmed.csv'
file2_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Disease_pubmed.csv'

# Load DataFrames and rename columns so the two sources stay distinguishable after the merge
df1 = pd.read_csv(file1_path).rename(
    columns={'Sentences': 'Sentences_x', 'PMCID': 'PMCID_x', 'Genes': 'Gene'})
df2 = pd.read_csv(file2_path).rename(
    columns={'Sentences': 'Sentences_y', 'PMCID': 'PMCID_y', 'Diseases': 'Disease'})

# Ensure 'PMID' columns are of the same type
df1['PMID'] = df1['PMID'].astype(str)
df2['PMID'] = df2['PMID'].astype(str)

# Merge DataFrames based on common column (PMID)
merged_df = pd.merge(df1, df2, on='PMID', how='inner')

# Define keywords for interaction type and regulation (with regex patterns)
interaction_keywords = ['Inhibition', 'Activation', 'Proliferation', 'Allosteric', 'Agonist', 'Antagonist']
# Word boundaries (\b) keep the case-insensitive search from firing on the
# "up" in "group"/"supplement" or the "down" in "downstream"/"breakdown".
# The optional suffix accepts "up-regulated", "up regulated", "upregulated",
# "upregulation", ... so unhyphenated spellings are still recognised.
regulation_keywords = [r'\bUp(?:[- ]?regulat\w*)?\b', r'\bDown(?:[- ]?regulat\w*)?\b']

# Initialize Interaction type and Regulation columns
merged_df['Interaction type'] = ''
merged_df['Regulation'] = ''

# Define regex patterns for regulation
regulation_pattern = '|'.join(regulation_keywords)

# Load Universal Sentence Encoder (USE)
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)

# Replace missing sentences with '' BEFORE casting to str: astype(str) turns
# NaN into the literal text 'nan', after which fillna has nothing left to fill.
merged_df['Sentences_x'] = merged_df['Sentences_x'].fillna('').astype(str)

# Encode sentences using Universal Sentence Encoder (one embedding row per merged row)
sentence_embeddings = embed(merged_df['Sentences_x'].tolist())

# Regulation lookup used on every sentence
def extract_regulation(sentence):
    """Return the first regulation term found in ``sentence``, or '' if there is none.

    The search uses the notebook-level ``regulation_pattern`` and ignores case;
    the matched text is returned exactly as it is written in the sentence.
    """
    found = re.search(regulation_pattern, sentence, flags=re.IGNORECASE)
    return found.group() if found else ''

# Tag each sentence with the regulation term it mentions ('' when none is found)
merged_df['Regulation'] = merged_df['Sentences_x'].map(extract_regulation)

# Function to find the closest matching keyword from a list
# Keyword embeddings keyed by the keyword tuple. The cache is rebuilt whenever
# this cell is re-run, which is also when `embed` is (re)loaded.
_KEYWORD_EMBEDDING_CACHE = {}


def find_closest_keyword(sentence_embedding, keywords, threshold=0.2, keyword_embeddings=None):
    """Return the keyword whose embedding has the largest dot product with the sentence.

    Args:
        sentence_embedding: 1-D array-like embedding of a single sentence.
        keywords: Sequence of candidate keyword strings.
        threshold: Minimum dot-product similarity required to accept a match.
        keyword_embeddings: Optional precomputed 2-D array with one row per
            keyword, in the same order as ``keywords``. When omitted, the
            keywords are encoded with the notebook-level ``embed`` model in a
            single batched call and cached, so the encoder runs once per
            keyword list instead of once per keyword for every sentence.

    Returns:
        The best-scoring keyword, or '' when ``keywords`` is empty or the best
        score is below ``threshold``. Ties go to the earliest keyword.
    """
    keywords = list(keywords)
    if not keywords:
        return ''

    if keyword_embeddings is None:
        cache_key = tuple(keywords)
        if cache_key not in _KEYWORD_EMBEDDING_CACHE:
            _KEYWORD_EMBEDDING_CACHE[cache_key] = embed(keywords).numpy()
        keyword_embeddings = _KEYWORD_EMBEDDING_CACHE[cache_key]

    # One matrix-vector product gives the similarity to every keyword at once.
    similarities = np.asarray(keyword_embeddings) @ np.asarray(sentence_embedding)
    best = int(np.argmax(similarities))
    if similarities[best] >= threshold:
        return keywords[best]
    return ''

# Label every merged row with its best-matching interaction keyword
# ('' when no keyword clears the similarity threshold)
merged_df['Interaction type'] = [
    find_closest_keyword(row_embedding.numpy(), interaction_keywords)
    for row_embedding in sentence_embeddings
]

# Merged column name -> column name used in the exported file
export_names = {
    'PMID': 'PMID',
    'PMCID_x': 'PMCID',
    'Sentences_x': 'Sentence',
    'Gene': 'Genes',
    'Disease': 'DiseaseName',
    'Interaction type': 'Interaction type',
    'Regulation': 'Regulation',
}
final_columns = list(export_names)
final_df = merged_df[final_columns].rename(columns=export_names)

# Write the result without the row index
output_path = '/kaggle/working/Gene_to_Disease.csv'
final_df.to_csv(output_path, index=False)

print("Merged data saved successfully in /kaggle/working/Gene_to_Disease.csv!")


Merged data saved successfully in /kaggle/working/Gene_to_Disease.csv!


# **Gene to Gene**

In [16]:
import pandas as pd
import tensorflow_hub as hub
import re
import numpy as np

# File paths (the same gene file is loaded twice and joined with itself)
file1_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Genes_pubmed.csv'
file2_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Genes_pubmed.csv'  # Update path for gene-to-gene mapping

# Load DataFrames and rename columns so the two sides stay distinguishable after the merge.
# The other sections of this notebook rename a 'Genes' column in this same file
# while this one renamed 'Gene'; both spellings are mapped so the Gene1/Gene2
# columns exist whichever header the CSV actually uses.
# NOTE(review): assumes the file carries only one of the two headers — confirm.
df1 = pd.read_csv(file1_path).rename(
    columns={'Sentences': 'Sentences_x', 'PMCID': 'PMCID_x', 'Gene': 'Gene1', 'Genes': 'Gene1'})
df2 = pd.read_csv(file2_path).rename(
    columns={'Sentences': 'Sentences_y', 'PMCID': 'PMCID_y', 'Gene': 'Gene2', 'Genes': 'Gene2'})

# Ensure 'PMID' columns are of the same type
df1['PMID'] = df1['PMID'].astype(str)
df2['PMID'] = df2['PMID'].astype(str)

# Merge DataFrames based on common column (PMID)
# NOTE(review): a self-join also pairs each row with itself and yields both
# (A, B) and (B, A) — confirm whether those rows are wanted in the output.
merged_df = pd.merge(df1, df2, on='PMID', how='inner')

# Define keywords for interaction type and regulation (with regex patterns)
interaction_keywords = ['Inhibition', 'Activation', 'Proliferation', 'Allosteric', 'Agonist', 'Antagonist']
# Word boundaries (\b) keep the case-insensitive search from firing on the
# "up" in "group"/"supplement" or the "down" in "downstream"/"breakdown".
# The optional suffix accepts "up-regulated", "up regulated", "upregulated",
# "upregulation", ... so unhyphenated spellings are still recognised.
regulation_keywords = [r'\bUp(?:[- ]?regulat\w*)?\b', r'\bDown(?:[- ]?regulat\w*)?\b']

# Initialize Interaction type and Regulation columns
merged_df['Interaction type'] = ''
merged_df['Regulation'] = ''

# Define regex patterns for regulation
regulation_pattern = '|'.join(regulation_keywords)

# Load Universal Sentence Encoder (USE)
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)

# Replace missing sentences with '' BEFORE casting to str: astype(str) turns
# NaN into the literal text 'nan', after which fillna has nothing left to fill.
merged_df['Sentences_x'] = merged_df['Sentences_x'].fillna('').astype(str)

# Encode sentences using Universal Sentence Encoder (one embedding row per merged row)
sentence_embeddings = embed(merged_df['Sentences_x'].tolist())

# Regulation lookup used on every sentence
def extract_regulation(sentence):
    """Return the first regulation term found in ``sentence``, or '' if there is none.

    The search uses the notebook-level ``regulation_pattern`` and ignores case;
    the matched text is returned exactly as it is written in the sentence.
    """
    found = re.search(regulation_pattern, sentence, flags=re.IGNORECASE)
    return found.group() if found else ''

# Tag each sentence with the regulation term it mentions ('' when none is found)
merged_df['Regulation'] = merged_df['Sentences_x'].map(extract_regulation)

# Function to find the closest matching keyword from a list
# Keyword embeddings keyed by the keyword tuple. The cache is rebuilt whenever
# this cell is re-run, which is also when `embed` is (re)loaded.
_KEYWORD_EMBEDDING_CACHE = {}


def find_closest_keyword(sentence_embedding, keywords, threshold=0.2, keyword_embeddings=None):
    """Return the keyword whose embedding has the largest dot product with the sentence.

    Args:
        sentence_embedding: 1-D array-like embedding of a single sentence.
        keywords: Sequence of candidate keyword strings.
        threshold: Minimum dot-product similarity required to accept a match.
        keyword_embeddings: Optional precomputed 2-D array with one row per
            keyword, in the same order as ``keywords``. When omitted, the
            keywords are encoded with the notebook-level ``embed`` model in a
            single batched call and cached, so the encoder runs once per
            keyword list instead of once per keyword for every sentence.

    Returns:
        The best-scoring keyword, or '' when ``keywords`` is empty or the best
        score is below ``threshold``. Ties go to the earliest keyword.
    """
    keywords = list(keywords)
    if not keywords:
        return ''

    if keyword_embeddings is None:
        cache_key = tuple(keywords)
        if cache_key not in _KEYWORD_EMBEDDING_CACHE:
            _KEYWORD_EMBEDDING_CACHE[cache_key] = embed(keywords).numpy()
        keyword_embeddings = _KEYWORD_EMBEDDING_CACHE[cache_key]

    # One matrix-vector product gives the similarity to every keyword at once.
    similarities = np.asarray(keyword_embeddings) @ np.asarray(sentence_embedding)
    best = int(np.argmax(similarities))
    if similarities[best] >= threshold:
        return keywords[best]
    return ''

# Label every merged row with its best-matching interaction keyword
# ('' when no keyword clears the similarity threshold)
merged_df['Interaction type'] = [
    find_closest_keyword(row_embedding.numpy(), interaction_keywords)
    for row_embedding in sentence_embeddings
]

# Merged column name -> column name used in the exported file
export_names = {
    'PMID': 'PMID',
    'PMCID_x': 'PMCID',
    'Sentences_x': 'Sentence',
    'Gene1': 'Gene1',
    'Gene2': 'Gene2',
    'Interaction type': 'Interaction type',
    'Regulation': 'Regulation',
}
final_columns = list(export_names)
final_df = merged_df[final_columns].rename(columns=export_names)

# Write the result without the row index
output_path = '/kaggle/working/Gene_to_Gene.csv'
final_df.to_csv(output_path, index=False)

print("Merged data saved successfully in /kaggle/working/Gene_to_Gene.csv!")


Merged data saved successfully in /kaggle/working/Gene_to_Gene.csv!


# Chemical to Chemical

In [8]:
import pandas as pd
import tensorflow_hub as hub
import re
import numpy as np

# File paths (the same chemical file is loaded twice and joined with itself)
file1_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Chemicals_pubmed.csv'
file2_path = '/kaggle/input/taks-4-dataset/Task4_Secondary_Metabolties_Marine_Chemicals_pubmed.csv'  # Update path for chemical-to-chemical mapping

# Load DataFrames and rename columns so the two sides stay distinguishable after the merge
df1 = pd.read_csv(file1_path).rename(
    columns={'Sentences': 'Sentences_x', 'PMCID': 'PMCID_x', 'Chemicals': 'Chemical1'})
df2 = pd.read_csv(file2_path).rename(
    columns={'Sentences': 'Sentences_y', 'PMCID': 'PMCID_y', 'Chemicals': 'Chemical2'})

# Merge DataFrames based on common column (PMID)
# NOTE(review): a self-join also pairs each row with itself and yields both
# (A, B) and (B, A) — confirm whether those rows are wanted in the output.
merged_df = pd.merge(df1, df2, on='PMID', how='inner')

# Define keywords for interaction type and regulation (with regex patterns)
interaction_keywords = ['Inhibition', 'Activation', 'Proliferation', 'Allosteric', 'Agonist', 'Antagonist']
# Word boundaries (\b) keep the case-insensitive search from firing on the
# "up" in "group"/"supplement" or the "down" in "downstream"/"breakdown".
# The optional suffix accepts "up-regulated", "up regulated", "upregulated",
# "upregulation", ... so unhyphenated spellings are still recognised.
regulation_keywords = [r'\bUp(?:[- ]?regulat\w*)?\b', r'\bDown(?:[- ]?regulat\w*)?\b']

# Initialize Interaction type and Regulation columns
merged_df['Interaction type'] = ''
merged_df['Regulation'] = ''

# Define regex patterns for regulation
regulation_pattern = '|'.join(regulation_keywords)

# Load Universal Sentence Encoder (USE)
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(module_url)

# Replace missing sentences with '' BEFORE casting to str: astype(str) turns
# NaN into the literal text 'nan', after which fillna has nothing left to fill.
merged_df['Sentences_x'] = merged_df['Sentences_x'].fillna('').astype(str)

# Encode sentences using Universal Sentence Encoder (one embedding row per merged row)
sentence_embeddings = embed(merged_df['Sentences_x'].tolist())

# Regulation lookup used on every sentence
def extract_regulation(sentence):
    """Return the first regulation term found in ``sentence``, or '' if there is none.

    The search uses the notebook-level ``regulation_pattern`` and ignores case;
    the matched text is returned exactly as it is written in the sentence.
    """
    found = re.search(regulation_pattern, sentence, flags=re.IGNORECASE)
    return found.group() if found else ''

# Tag each sentence with the regulation term it mentions ('' when none is found)
merged_df['Regulation'] = merged_df['Sentences_x'].map(extract_regulation)

# Function to find the closest matching keyword from a list
# Keyword embeddings keyed by the keyword tuple. The cache is rebuilt whenever
# this cell is re-run, which is also when `embed` is (re)loaded.
_KEYWORD_EMBEDDING_CACHE = {}


def find_closest_keyword(sentence_embedding, keywords, threshold=0.2, keyword_embeddings=None):
    """Return the keyword whose embedding has the largest dot product with the sentence.

    Args:
        sentence_embedding: 1-D array-like embedding of a single sentence.
        keywords: Sequence of candidate keyword strings.
        threshold: Minimum dot-product similarity required to accept a match.
        keyword_embeddings: Optional precomputed 2-D array with one row per
            keyword, in the same order as ``keywords``. When omitted, the
            keywords are encoded with the notebook-level ``embed`` model in a
            single batched call and cached, so the encoder runs once per
            keyword list instead of once per keyword for every sentence.

    Returns:
        The best-scoring keyword, or '' when ``keywords`` is empty or the best
        score is below ``threshold``. Ties go to the earliest keyword.
    """
    keywords = list(keywords)
    if not keywords:
        return ''

    if keyword_embeddings is None:
        cache_key = tuple(keywords)
        if cache_key not in _KEYWORD_EMBEDDING_CACHE:
            _KEYWORD_EMBEDDING_CACHE[cache_key] = embed(keywords).numpy()
        keyword_embeddings = _KEYWORD_EMBEDDING_CACHE[cache_key]

    # One matrix-vector product gives the similarity to every keyword at once.
    similarities = np.asarray(keyword_embeddings) @ np.asarray(sentence_embedding)
    best = int(np.argmax(similarities))
    if similarities[best] >= threshold:
        return keywords[best]
    return ''

# Label every merged row with its best-matching interaction keyword
# ('' when no keyword clears the similarity threshold)
merged_df['Interaction type'] = [
    find_closest_keyword(row_embedding.numpy(), interaction_keywords)
    for row_embedding in sentence_embeddings
]

# Merged column name -> column name used in the exported file
export_names = {
    'PMID': 'PMID',
    'PMCID_x': 'PMCID',
    'Sentences_x': 'Sentence',
    'Chemical1': 'Chemical1',
    'Chemical2': 'Chemical2',
    'Interaction type': 'Interaction type',
    'Regulation': 'Regulation',
}
final_columns = list(export_names)
final_df = merged_df[final_columns].rename(columns=export_names)

# Write the result without the row index
output_path = '/kaggle/working/Chemical_to_Chemical.csv'
final_df.to_csv(output_path, index=False)

print("Merged data saved successfully in /kaggle/working/Chemical_to_Chemical.csv!")


Merged data saved successfully in /kaggle/working/Chemical_to_Chemical.csv!


In [None]:
  import pandas as pd

# Load CSV files into DataFrames
file1_path = '/content/Task4_Acidic_Gene_pubmed.csv'
file2_path = '/content/Task4_Acidic_Gene_pubmed.csv'

df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Print column names of df1 and df2
print("Columns of df1:", df1.columns)
print("Columns of df2:", df2.columns)

In [None]:
# Build the frame the following cells label and export. Without this step
# `merged_df` would be whatever an earlier section left in the kernel, and the
# '_x'/'_y' column names selected for export further down would not exist.
# NOTE(review): assumes df1/df2 share a 'PMID' column and otherwise identical
# headers (so pandas adds the '_x'/'_y' suffixes) — confirm against the CSV.
merged_df = pd.merge(df1, df2, on='PMID', how='inner')

# Define keywords for interaction type and regulation
interaction_keywords = ['Inhibition', 'Activation', 'Proliferation', 'Allosteric', 'Agonist', 'Antagonist']
# Specific phrases come first: the labelling loop keeps the first keyword that
# matches, so with 'Up'/'Down' listed first the longer phrases could never win.
regulation_keywords = ['Up regulated', 'Down-regulated', 'Up', 'Down']
# Initialize Interaction type and Regulation columns
merged_df['Interaction type'] = ''
merged_df['Regulation'] = ''

In [None]:
# Display the merged DataFrame to inspect its columns and rows before labelling
merged_df

In [None]:
# Label each row with the first interaction / regulation keyword its sentence mentions
import re


def _first_matching_keyword(text, keywords):
    """Return the first keyword that starts a word in ``text`` (case-insensitive), else ''."""
    for keyword in keywords:
        # The leading \b stops 'Agonist' matching inside "antagonist",
        # 'Activation' inside "inactivation" and 'Up' inside "group", while
        # still accepting inflected forms such as "inhibitions" or "upregulated".
        if re.search(r'\b' + re.escape(keyword), text, flags=re.IGNORECASE):
            return keyword
    return ''


# Iterate over each row
for index, row in merged_df.iterrows():
    # Extract the sentence; missing values arrive as NaN, which has no .lower()
    sentence = row['Sentences_x']
    sentence = '' if pd.isna(sentence) else str(sentence)

    # Check for interaction type keywords (column keeps its '' default when none match)
    interaction = _first_matching_keyword(sentence, interaction_keywords)
    if interaction:
        merged_df.at[index, 'Interaction type'] = interaction

    # Check for regulation keywords
    regulation = _first_matching_keyword(sentence, regulation_keywords)
    if regulation:
        merged_df.at[index, 'Regulation'] = regulation

In [None]:
# Merged column name -> column name used in the exported file
export_names = {
    'PMID': 'PMID',
    'PMCIDs_x': 'PMCID',
    'Sentences_x': 'Sentence',
    'Genes_x': 'Genes1',
    'Genes_y': 'Genes2',
    'Interaction type': 'Interaction type',
    'Regulation': 'Regulation',
}

# Keep only the export columns, in this order, under their final names
final_columns = list(export_names)
final_df = merged_df[final_columns].rename(columns=export_names)

# Write the result without the row index
final_df.to_csv('Gene_to_Gene.csv', index=False)

print("Merged data saved successfully!")