# Change the Runtime to R

In [None]:
# Mount Google Drive so that files under /content/drive are reachable from this Colab session.
# NOTE(review): this cell is Python although the heading above asks for the R runtime — confirm
# which runtime it is meant to run in.
from google.colab import drive  # Colab helper module for Google Drive access
drive.mount('/content/drive')  # Mount point used by the Drive paths later in this notebook

In [None]:
# Importing necessary packages and libraries for the task
install.packages("pubmed.mineR")  # Installing the pubmed.mineR package

In [None]:
library(pubmed.mineR)  # Loading the pubmed.mineR library

In [None]:
# Load the necessary libraries
library(httr)  # Loading httr for HTTP requests
library(jsonlite)  # Loading jsonlite for handling JSON data

### Set the name of the CSV file exported from PubMed for the traditional medicine of interest

In [None]:
# Load the PubMed export for the medicine under study.
# The PubTator loop below reads the `PMID` column of this table, so the file
# must contain one; other columns are not used in this section.
data <- read.csv("den.csv")  # Reading the CSV file into a data frame

### Generating the Pubtator Result

In [None]:
# Fetch the PubTator annotations of one article and summarise them by entity type.
#
# @param x A single PubMed ID; it is appended verbatim to the export URL.
# @return A one-row data frame with columns PMID, Genes, Diseases, Mutations,
#   Chemicals and Species. Each entity column holds the unique "Term>TermID"
#   pairs of that type joined by "; ", or "" when no annotation of that type
#   was found, the response could not be parsed, or the request failed.
pubtator_function <- function(x) {
  # NOTE(review): this is the legacy PubTator endpoint — confirm it is still served.
  url <- paste0("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=", x)

  # Single definition of the "nothing found" row so every exit path agrees.
  empty_result <- function() {
    data.frame(PMID = x, Genes = "", Diseases = "", Mutations = "", Chemicals = "", Species = "")
  }

  tryCatch({
    response <- GET(url)
    if (http_error(response)) {
      warning("PubTator request failed for PMID ", x, call. = FALSE)
      return(empty_result())
    }
    body_text <- content(response, "text", encoding = "UTF-8")
    lines <- as.character(unlist(strsplit(body_text, "\n", fixed = TRUE)))

    # Skip the first two lines (title and abstract in the PubTator export);
    # negative indexing is safe even when fewer than two lines came back,
    # unlike 3:length(lines), which would count backwards.
    fields <- strsplit(lines[-seq_len(2)], "\t", fixed = TRUE)

    # Annotation rows carry 6 tab-separated fields; rows with 5 lack the
    # identifier and get a "No Data" placeholder. Anything else (blank lines,
    # rows of another shape) is dropped instead of being recycled by rbind().
    n_fields <- lengths(fields)
    fields[n_fields == 5L] <- lapply(fields[n_fields == 5L], function(f) c(f, "No Data"))
    fields <- fields[n_fields %in% c(5L, 6L)]
    if (length(fields) == 0L) {
      return(empty_result())
    }

    annotations <- as.data.frame(do.call(rbind, fields), stringsAsFactors = FALSE)
    colnames(annotations) <- c("PMID", "Start", "End", "Term", "TermType", "TermID")

    # Unique "Term>TermID" pairs of one entity type, joined by "; ".
    collapse_type <- function(term_type) {
      hits <- annotations$TermType == term_type
      pairs <- paste(annotations$Term[hits], annotations$TermID[hits], sep = ">")
      paste(unique(pairs), collapse = "; ")
    }

    data.frame(PMID = x,
               Genes = collapse_type("Gene"),
               Diseases = collapse_type("Disease"),
               Mutations = collapse_type("Mutation"),
               Chemicals = collapse_type("Chemical"),
               Species = collapse_type("Species"))
  }, error = function(e) {
    # Best effort: keep the batch running, but say which PMID failed and why.
    warning("PubTator lookup failed for PMID ", x, ": ", conditionMessage(e), call. = FALSE)
    empty_result()
  })
}

# Empty template with the output schema; it guarantees a header-only CSV
# when the input table has no PMIDs.
results_df <- data.frame(PMID = numeric(0), Genes = character(0), Diseases = character(0), Mutations = character(0),
                        Chemicals = character(0), Species = character(0))

# Query PubTator once per PMID, then bind all rows in a single call
# (appending with rbind() inside a loop copies the whole table every time).
pmid_results <- lapply(as.vector(data$PMID), pubtator_function)
results_df <- do.call(rbind, c(list(results_df), pmid_results))

# Write the results to a CSV file
write.csv(results_df, "Pubtator_Result_Traditional_Medicine.csv", row.names = FALSE)

Separate the genes and their associated data from the PubTator result file generated above

In [None]:
# Load the necessary libraries
library(dplyr)  # Loading dplyr for data manipulation
library(tidyr)  # Loading tidyr for data tidying
library(stringr)  # Loading stringr for string operations

# Load the PubTator results table (one row per PMID, entity columns joined by "; ").
# NOTE(review): this reads "pub.csv", while the PubTator cell above writes
# "Pubtator_Result_Traditional_Medicine.csv" — confirm the intended file.
pubtator_results <- read.csv("pub.csv", stringsAsFactors = FALSE)

# An all-empty column is read as logical NA; normalise to "" so strsplit() works.
gene_strings <- as.character(pubtator_results$Genes)
gene_strings[is.na(gene_strings)] <- ""

# One character vector of "Term>TermID" entries per PMID
gene_list <- strsplit(gene_strings, "; ", fixed = TRUE)
gene_terms <- as.character(unlist(gene_list))

# Split each entry at its LAST ">" so that a term containing ">" itself keeps
# its full name; entries without ">" get an NA identifier.
has_id <- grepl(">", gene_terms, fixed = TRUE)
gene_df <- data.frame(
  GeneName = ifelse(has_id, sub(">[^>]*$", "", gene_terms), gene_terms),
  GeneId = ifelse(has_id, sub("^.*>", "", gene_terms), NA_character_),
  PMID = rep(pubtator_results$PMID, lengths(gene_list)),
  stringsAsFactors = FALSE
)

# Count the number of rows per gene and collect the PMIDs it occurs in
gene_counts <- gene_df %>%
  group_by(GeneName, GeneId) %>%
  summarise(Count = n(),
            PMIDList = paste(unique(PMID), collapse = ", "),
            .groups = "drop")  # return an ungrouped table

# Write the gene counts to a new CSV
write.csv(gene_counts, "Chinese_gene_count.csv", row.names = FALSE)


Separate the chemicals and their associated data from the PubTator result file generated above

In [None]:

# Load the necessary libraries
library(dplyr)  # Loading dplyr for data manipulation
library(tidyr)  # Loading tidyr for data tidying
library(stringr)  # Loading stringr for string operations

# Load the PubTator results table (one row per PMID, entity columns joined by "; ").
# NOTE(review): this reads "pub.csv", while the PubTator cell above writes
# "Pubtator_Result_Traditional_Medicine.csv" — confirm the intended file.
pubtator_results <- read.csv("pub.csv", stringsAsFactors = FALSE)

# An all-empty column is read as logical NA; normalise to "" so strsplit() works.
chemical_strings <- as.character(pubtator_results$Chemicals)
chemical_strings[is.na(chemical_strings)] <- ""

# One character vector of "Term>TermID" entries per PMID
chemical_list <- strsplit(chemical_strings, "; ", fixed = TRUE)
chemical_terms <- as.character(unlist(chemical_list))

# Split each entry at its LAST ">" so that a chemical name containing ">"
# keeps its full name; entries without ">" get an NA identifier.
has_id <- grepl(">", chemical_terms, fixed = TRUE)
chemical_df <- data.frame(
  ChemicalName = ifelse(has_id, sub(">[^>]*$", "", chemical_terms), chemical_terms),
  ChemicalId = ifelse(has_id, sub("^.*>", "", chemical_terms), NA_character_),
  PMID = rep(pubtator_results$PMID, lengths(chemical_list)),
  stringsAsFactors = FALSE
)

# Count the number of rows per chemical and collect the PMIDs it occurs in
chemical_counts <- chemical_df %>%
  group_by(ChemicalName, ChemicalId) %>%
  summarise(Count = n(),
            PMIDList = paste(unique(PMID), collapse = ", "),
            .groups = "drop")  # return an ungrouped table

# Write the chemical counts to a new CSV
write.csv(chemical_counts, "Chinese_chemical_counts.csv", row.names = FALSE)



Separate the diseases and their associated data from the PubTator result file generated above

In [None]:

# Load the necessary libraries
library(dplyr)  # Loading dplyr for data manipulation
library(tidyr)  # Loading tidyr for data tidying
library(stringr)  # Loading stringr for string operations

# Load the PubTator results table (one row per PMID, entity columns joined by "; ").
# NOTE(review): this reads "pub.csv", while the PubTator cell above writes
# "Pubtator_Result_Traditional_Medicine.csv" — confirm the intended file.
pubtator_results <- read.csv("pub.csv", stringsAsFactors = FALSE)

# An all-empty column is read as logical NA; normalise to "" so strsplit() works.
disease_strings <- as.character(pubtator_results$Diseases)
disease_strings[is.na(disease_strings)] <- ""

# One character vector of "Term>TermID" entries per PMID
disease_list <- strsplit(disease_strings, "; ", fixed = TRUE)
disease_terms <- as.character(unlist(disease_list))

# Split each entry at its LAST ">" so that a disease name containing ">"
# keeps its full name; entries without ">" get an NA identifier.
has_id <- grepl(">", disease_terms, fixed = TRUE)
disease_df <- data.frame(
  DiseaseName = ifelse(has_id, sub(">[^>]*$", "", disease_terms), disease_terms),
  DiseaseId = ifelse(has_id, sub("^.*>", "", disease_terms), NA_character_),
  PMID = rep(pubtator_results$PMID, lengths(disease_list)),
  stringsAsFactors = FALSE
)

# Count the number of rows per disease and collect the PMIDs it occurs in
disease_counts <- disease_df %>%
  group_by(DiseaseName, DiseaseId) %>%
  summarise(Count = n(),
            PMIDList = paste(unique(PMID), collapse = ", "),
            .groups = "drop")  # return an ungrouped table

# Write the disease counts to a new CSV
write.csv(disease_counts, "Chinese_disease_counts.csv", row.names = FALSE)



# Change Runtime to Python

### Give the CSV File path name for Gene, Disease, Chemical

In [None]:
# Drive folder holding the count tables (file names match those written by the R cells)
_tcm_dir = '/content/drive/MyDrive/IP_new/IP/Traditional Chinese Medicine'
gene_count = f'{_tcm_dir}/Chinese_gene_count.csv'
disease_count = f'{_tcm_dir}/Chinese_disease_counts.csv'
chemical_count = f'{_tcm_dir}/Chinese_chemical_counts.csv'

### Give the CSV file path name for the traditional medicine which was downloaded from pubmed

In [None]:
# PubMed export used by the merge cells below; they read its `PMID` and `PMCID` columns.
main_file = '/content/drive/MyDrive/IP_new/IP/Traditional Chinese Medicine/csv-traditiona-set.csv'

### Matching PMID with PMCID from Main File and storing it for GiveSentence

In [None]:
import pandas as pd

# Gene count table: one row per gene, with `PMIDList` holding that gene's PMIDs
# as comma-separated text. Force it to str so a column of single PMIDs is not
# parsed as integers (which would not merge against the string PMIDs below).
df_sheet1 = pd.read_csv(gene_count, dtype={'PMIDList': str})

# PubMed export containing the PMID and PMCID columns
df_sheet2 = pd.read_csv(main_file)

# Convert 'PMID' column in df_sheet2 to string data type so both merge keys are strings
df_sheet2['PMID'] = df_sheet2['PMID'].astype(str)

# Turn the comma-separated text into real lists first: explode() only expands
# list-like cells and leaves plain strings untouched.
df_sheet1['PMIDList'] = df_sheet1['PMIDList'].str.split(',')
df_sheet1 = df_sheet1.explode('PMIDList')
df_sheet1['PMIDList'] = df_sheet1['PMIDList'].str.strip()

# Attach the PMCID of every PMID (left join keeps genes whose PMIDs have no match)
merged_df = pd.merge(df_sheet1, df_sheet2, left_on='PMIDList', right_on='PMID', how='left')

# Group by GeneName and aggregate PMCID values into a list
df_sheet3 = merged_df.groupby('GeneName')['PMCID'].agg(list).reset_index()

# Write the result to a new CSV file
df_sheet3.to_csv('chinese_merged_gn.csv',index=False)

In [None]:
import pandas as pd

# Disease count table: one row per disease, with `PMIDList` holding that disease's
# PMIDs as comma-separated text. Force it to str so a column of single PMIDs is not
# parsed as integers (which would not merge against the string PMIDs below).
df_sheet1 = pd.read_csv(disease_count, dtype={'PMIDList': str})

# PubMed export containing the PMID and PMCID columns
df_sheet2 = pd.read_csv(main_file)

# Convert 'PMID' column in df_sheet2 to string data type so both merge keys are strings
df_sheet2['PMID'] = df_sheet2['PMID'].astype(str)

# Turn the comma-separated text into real lists first: explode() only expands
# list-like cells and leaves plain strings untouched.
df_sheet1['PMIDList'] = df_sheet1['PMIDList'].str.split(',')
df_sheet1 = df_sheet1.explode('PMIDList')
df_sheet1['PMIDList'] = df_sheet1['PMIDList'].str.strip()

# Attach the PMCID of every PMID (left join keeps diseases whose PMIDs have no match)
merged_df = pd.merge(df_sheet1, df_sheet2, left_on='PMIDList', right_on='PMID', how='left')

# Group by DiseaseName and aggregate PMCID values into a list
df_sheet3 = merged_df.groupby('DiseaseName')['PMCID'].agg(list).reset_index()

# Write the result to a new CSV file
df_sheet3.to_csv('chinese_merged_dh.csv',index=False)

In [None]:
import pandas as pd

# Chemical count table: one row per chemical, with `PMIDList` holding that chemical's
# PMIDs as comma-separated text. Force it to str so a column of single PMIDs is not
# parsed as integers (which would not merge against the string PMIDs below).
df_sheet1 = pd.read_csv(chemical_count, dtype={'PMIDList': str})

# PubMed export containing the PMID and PMCID columns
df_sheet2 = pd.read_csv(main_file)

# Convert 'PMID' column in df_sheet2 to string data type so both merge keys are strings
df_sheet2['PMID'] = df_sheet2['PMID'].astype(str)

# Turn the comma-separated text into real lists first: explode() only expands
# list-like cells and leaves plain strings untouched.
df_sheet1['PMIDList'] = df_sheet1['PMIDList'].str.split(',')
df_sheet1 = df_sheet1.explode('PMIDList')
df_sheet1['PMIDList'] = df_sheet1['PMIDList'].str.strip()

# Attach the PMCID of every PMID (left join keeps chemicals whose PMIDs have no match)
merged_df = pd.merge(df_sheet1, df_sheet2, left_on='PMIDList', right_on='PMID', how='left')

# Group by ChemicalName and aggregate PMCID values into a list
df_sheet3 = merged_df.groupby('ChemicalName')['PMCID'].agg(list).reset_index()

# Write the result to a new CSV file
df_sheet3.to_csv('chinese_merged_ch.csv',index=False)

## Preprocessing the New CSV file

### Change the File for each csv file i.e., Chemical,Gene and Disease

In [None]:
# Path of the merged CSV to preprocess; edit it to point at the chemical, gene or disease file.
file_name= "/content/chinese_merged_ch.csv"

In [None]:
import pandas as pd

# Read the merged CSV selected in `file_name` (the original read an undefined name `p`)
df = pd.read_csv(file_name)

# Drop every row that has a null value in any cell
df = df.dropna()

# Write the cleaned table back to the same file; the original wrote to
# `new_file_name`, which is not defined until a later cell.
df.to_csv(file_name, index=False)


In [None]:
# Path of the merged CSV that the name-filter cell below reads and overwrites in place.
new_file_name = "/content/chinese_merged_gn.csv"

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv(new_file_name)

# Column holding the entity name; change it together with `new_file_name`
# when filtering the chemical or disease file instead of the gene file.
name_column = 'GeneName'

# Keep only rows whose name consists solely of letters, digits and hyphens.
# na=False treats missing names as non-matching; without it a NaN in the mask
# makes the boolean indexing fail.
df = df[df[name_column].str.contains(r'^[a-zA-Z0-9\-]+$', na=False)]

# Write the modified DataFrame back to the same CSV file
df.to_csv(new_file_name, index=False)


# Change Runtime to R

### Takes the disease CSV as input and extracts the matching sentences with `Give_Sentences_PMC`





In [None]:

# Read the merged disease table (DiseaseName plus the PMCIDs of its articles).
# The original read "Chinese_disease_counts.csv", which has no PMCID column;
# the merged file is read here, from the same Drive folder as the gene and
# chemical cells. NOTE(review): confirm the file has been copied there.
data <- read.csv("/content/drive/MyDrive/IP_new/IP/Traditional Chinese Medicine/chinese_merged_dh.csv",
                 stringsAsFactors = FALSE)
if (!all(c("DiseaseName", "PMCID") %in% names(data))) {
  stop("Input CSV must contain the columns DiseaseName and PMCID", call. = FALSE)
}

# Entity names (the variable keeps the name used by the gene cell) and PMCID cells
GeneName_list <- data$DiseaseName
PMCID_list <- data$PMCID

# Extract the numeric PMCIDs from one cell.
#
# A cell is the text form of a Python list, e.g. "['PMC123', 'PMC456', nan]".
# Every run of digits is returned as its own ID; stripping all non-digits at
# once would glue the IDs of a multi-article cell into one meaningless number.
# Returns NA when the cell is missing, "[nan]", or contains no digits.
extract_numeric_part <- function(pmcid) {
  pmcid <- as.character(pmcid)
  if (is.na(pmcid) || pmcid == "[nan]") {
    return(NA_real_)
  }
  ids <- regmatches(pmcid, gregexpr("[0-9]+", pmcid))[[1]]
  if (length(ids) == 0) {
    return(NA_real_)
  }
  unique(as.numeric(ids))
}

# One numeric vector of PMCIDs per row of the input table
PMIDList_list <- lapply(PMCID_list, extract_numeric_part)

# Empty template with the output schema (also the result when nothing is found)
results_df <- data.frame(
  gene_name = character(),
  pmid = character(),
  sentences = character(),
  stringsAsFactors = FALSE
)

# Collect one small data frame per (name, PMCID) hit and bind them once at the end
result_rows <- list()
for (i in seq_along(GeneName_list)) {
  gene_name <- as.character(GeneName_list[[i]])

  # Balance a lone "(" or ")" by adding its missing counterpart
  if (grepl("[()]", gene_name)) {
    if (!grepl("\\(", gene_name)) {
      gene_name <- paste0("(", gene_name)
    }
    if (!grepl("\\)", gene_name)) {
      gene_name <- paste0(gene_name, ")")
    }
  }

  for (pmid in PMIDList_list[[i]]) {
    if (is.na(pmid)) {
      next
    }
    # A failure for one article should not discard everything collected so far
    sentences <- tryCatch(
      Give_Sentences_PMC(pmid, gene_name),
      error = function(e) {
        warning("Give_Sentences_PMC failed for PMC", pmid, " / ", gene_name, ": ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    if (length(sentences) > 0) {
      result_rows[[length(result_rows) + 1L]] <- data.frame(
        gene_name = gene_name,
        pmid = pmid,
        sentences = paste(sentences, collapse = " "),
        stringsAsFactors = FALSE
      )
    }
  }
}
results_df <- do.call(rbind, c(list(results_df), result_rows))

# Write the results data frame to a CSV file
write.csv(results_df, "results_chinese_dh.csv", row.names = FALSE)


### Takes the gene CSV as input and extracts the matching sentences with `Give_Sentences_PMC`

In [None]:

# Read the merged gene table (GeneName plus the PMCIDs of its articles)
data <- read.csv("/content/drive/MyDrive/IP_new/IP/Traditional Chinese Medicine/chinese_merged_gn.csv",
                 stringsAsFactors = FALSE)
if (!all(c("GeneName", "PMCID") %in% names(data))) {
  stop("Input CSV must contain the columns GeneName and PMCID", call. = FALSE)
}

# Gene names and PMCID cells, row-aligned
GeneName_list <- data$GeneName
PMCID_list <- data$PMCID

# Extract the numeric PMCIDs from one cell.
#
# A cell is the text form of a Python list, e.g. "['PMC123', 'PMC456', nan]".
# Every run of digits is returned as its own ID; stripping all non-digits at
# once would glue the IDs of a multi-article cell into one meaningless number.
# Returns NA when the cell is missing, "[nan]", or contains no digits.
extract_numeric_part <- function(pmcid) {
  pmcid <- as.character(pmcid)
  if (is.na(pmcid) || pmcid == "[nan]") {
    return(NA_real_)
  }
  ids <- regmatches(pmcid, gregexpr("[0-9]+", pmcid))[[1]]
  if (length(ids) == 0) {
    return(NA_real_)
  }
  unique(as.numeric(ids))
}

# One numeric vector of PMCIDs per row of the input table
PMIDList_list <- lapply(PMCID_list, extract_numeric_part)

# Empty template with the output schema (also the result when nothing is found)
results_df <- data.frame(
  gene_name = character(),
  pmid = character(),
  sentences = character(),
  stringsAsFactors = FALSE
)

# Collect one small data frame per (gene, PMCID) hit and bind them once at the end
result_rows <- list()
for (i in seq_along(GeneName_list)) {
  gene_name <- as.character(GeneName_list[[i]])

  # Balance a lone "(" or ")" by adding its missing counterpart
  if (grepl("[()]", gene_name)) {
    if (!grepl("\\(", gene_name)) {
      gene_name <- paste0("(", gene_name)
    }
    if (!grepl("\\)", gene_name)) {
      gene_name <- paste0(gene_name, ")")
    }
  }

  for (pmid in PMIDList_list[[i]]) {
    if (is.na(pmid)) {
      next
    }
    # A failure for one article should not discard everything collected so far
    sentences <- tryCatch(
      Give_Sentences_PMC(pmid, gene_name),
      error = function(e) {
        warning("Give_Sentences_PMC failed for PMC", pmid, " / ", gene_name, ": ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    if (length(sentences) > 0) {
      result_rows[[length(result_rows) + 1L]] <- data.frame(
        gene_name = gene_name,
        pmid = pmid,
        sentences = paste(sentences, collapse = " "),
        stringsAsFactors = FALSE
      )
    }
  }
}
results_df <- do.call(rbind, c(list(results_df), result_rows))

# Write the results data frame to a CSV file
write.csv(results_df, "results_Chinese_gn.csv", row.names = FALSE)


### Takes the chemical CSV as input and extracts the matching sentences with `Give_Sentences_PMC`

In [None]:

# Read the merged chemical table (ChemicalName plus the PMCIDs of its articles)
data <- read.csv("/content/drive/MyDrive/IP_new/IP/Traditional Chinese Medicine/chinese_merged_ch.csv",
                 stringsAsFactors = FALSE)
if (!all(c("ChemicalName", "PMCID") %in% names(data))) {
  stop("Input CSV must contain the columns ChemicalName and PMCID", call. = FALSE)
}

# Entity names (the variable keeps the name used by the gene cell) and PMCID cells
GeneName_list <- data$ChemicalName
PMCID_list <- data$PMCID

# Extract the numeric PMCIDs from one cell.
#
# A cell is the text form of a Python list, e.g. "['PMC123', 'PMC456', nan]".
# Every run of digits is returned as its own ID; stripping all non-digits at
# once would glue the IDs of a multi-article cell into one meaningless number.
# Returns NA when the cell is missing, "[nan]", or contains no digits.
extract_numeric_part <- function(pmcid) {
  pmcid <- as.character(pmcid)
  if (is.na(pmcid) || pmcid == "[nan]") {
    return(NA_real_)
  }
  ids <- regmatches(pmcid, gregexpr("[0-9]+", pmcid))[[1]]
  if (length(ids) == 0) {
    return(NA_real_)
  }
  unique(as.numeric(ids))
}

# One numeric vector of PMCIDs per row of the input table
PMIDList_list <- lapply(PMCID_list, extract_numeric_part)

# Empty template with the output schema (also the result when nothing is found)
results_df <- data.frame(
  gene_name = character(),
  pmid = character(),
  sentences = character(),
  stringsAsFactors = FALSE
)

# Collect one small data frame per (name, PMCID) hit and bind them once at the end
result_rows <- list()
for (i in seq_along(GeneName_list)) {
  gene_name <- as.character(GeneName_list[[i]])

  # Balance a lone "(" or ")" by adding its missing counterpart
  if (grepl("[()]", gene_name)) {
    if (!grepl("\\(", gene_name)) {
      gene_name <- paste0("(", gene_name)
    }
    if (!grepl("\\)", gene_name)) {
      gene_name <- paste0(gene_name, ")")
    }
  }

  for (pmid in PMIDList_list[[i]]) {
    if (is.na(pmid)) {
      next
    }
    # A failure for one article should not discard everything collected so far
    sentences <- tryCatch(
      Give_Sentences_PMC(pmid, gene_name),
      error = function(e) {
        warning("Give_Sentences_PMC failed for PMC", pmid, " / ", gene_name, ": ",
                conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    if (length(sentences) > 0) {
      result_rows[[length(result_rows) + 1L]] <- data.frame(
        gene_name = gene_name,
        pmid = pmid,
        sentences = paste(sentences, collapse = " "),
        stringsAsFactors = FALSE
      )
    }
  }
}
results_df <- do.call(rbind, c(list(results_df), result_rows))

# Write the results data frame to a CSV file
write.csv(results_df, "results_Chinese_ch.csv", row.names = FALSE)


# Change Runtime to Python For INTERACTION

In [None]:
import pandas as pd
import re

# Load the chemical, gene and disease sentence tables into dataframes.
# NOTE(review): these paths point at an AyurVeda folder, while the earlier cells
# write Chinese-medicine result files — confirm the intended inputs.
_results_dir = '/content/drive/MyDrive/IP/AyurVeda_New'
chemicals_df = pd.read_csv(f'{_results_dir}/results_ayu_ch.csv')
genes_df = pd.read_csv(f'{_results_dir}/results_ayu_gn.csv')
disease_df = pd.read_csv(f'{_results_dir}/results_ayu_dh.csv')

In [None]:
# Restrict chemicals_df to the PMID, chemical name and sentence columns, in that order
cols = ['pmid', 'Chemical_name', 'sentences']
chemicals_df = chemicals_df.loc[:, cols]

In [None]:
# Restrict genes_df to the gene name, PMID and sentence columns, in that order
cols = ['gene_name', 'pmid', 'sentences']
genes_df = genes_df.loc[:, cols]

In [None]:
# Restrict disease_df to the disease name, PMID and sentence columns, in that order
cols = ['disease_name', 'pmid', 'sentences']
disease_df = disease_df.loc[:, cols]

## Interaction Between Gene Chemicals

In [None]:
# Output columns, filled in parallel (one entry per matched chemical/gene pair)

pmcid_list = []
sentence_list = []
genes_list = []
chemicals_list = []
interaction_type_list = []
regulation_list = []

# Keyword patterns that label the interaction described by a sentence.
# Dict order matters: the first pattern that matches decides the label.
interaction_patterns = {
    'Inhibition': r'\b(inhibit(?:s|ing)?|inhibition)\b',
    'Activation': r'\b(activate(?:s|ing)?|activation)\b',
    'Proliferation': r'\bproliferation\b',
    'Allosteric': r'\ballosteric\b',
    'Agonist': r'\bagonist\b',
    'Antagonist': r'\bantagonist\b'
}


def _label_interaction(text):
    """Return the first interaction label whose pattern occurs in `text`, else 'Other'."""
    for label, pattern in interaction_patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return label
    return 'Other'


def _label_regulation(text):
    """Return 'Up' or 'Down' when `text` mentions up-/down-regulated, else 'Other'."""
    if re.search(r'\b(up(?:-| )?regulated)\b', text, re.IGNORECASE):
        return 'Up'
    if re.search(r'\b(down(?:-| )?regulated)\b', text, re.IGNORECASE):
        return 'Down'
    return 'Other'


# Compare every chemical row with every gene row
for chem_row in chemicals_df.itertuples():
    for gene_row in genes_df.itertuples():
        # A pair is kept only when both rows carry the same sentence text
        if chem_row.sentences != gene_row.sentences:
            continue

        sentence = str(chem_row.sentences)  # Ensure sentence is a string

        pmcid_list.append(chem_row.pmid)
        sentence_list.append(sentence)
        genes_list.append(str(gene_row.gene_name))
        chemicals_list.append(str(chem_row.Chemical_name))
        interaction_type_list.append(_label_interaction(sentence))
        regulation_list.append(_label_regulation(sentence))

# Assemble the collected columns into the result table
result_df = pd.DataFrame({

    'pmid': pmcid_list,
    'Sentence': sentence_list,
    'Genes': genes_list,
    'Chemicals': chemicals_list,
    'Interaction type': interaction_type_list,
    'Regulation': regulation_list
})

# Save the result table to a CSV file
result_df.to_csv('interaction_chemical_gene.csv', index=False)



## Interaction Between Gene Disease

In [None]:
# Output columns, filled in parallel (one entry per matched disease/gene pair)
pmcid_list = []
sentence_list = []
genes_list = []
diseases_list = []
interaction_type_list = []
regulation_list = []

# Keyword patterns that label the interaction described by a sentence.
# Dict order matters: the first pattern that matches decides the label.
interaction_patterns = {
    'Inhibition': r'\b(inhibit(?:s|ing)?|inhibition)\b',
    'Activation': r'\b(activate(?:s|ing)?|activation)\b',
    'Proliferation': r'\bproliferation\b',
    'Allosteric': r'\ballosteric\b',
    'Agonist': r'\bagonist\b',
    'Antagonist': r'\bantagonist\b'
}

# Compare every disease row with every gene row
for dis_row in disease_df.itertuples():
    for gene_row in genes_df.itertuples():
        # A pair is kept when both rows refer to the same article ID
        if dis_row.pmid == gene_row.pmid:
            sentence = str(dis_row.sentences)  # Ensure sentence is a string
            # disease_df only has disease_name/pmid/sentences at this point;
            # reading `gene_name` from it raised AttributeError.
            diseases = str(dis_row.disease_name)  # Convert to string
            genes = str(gene_row.gene_name)  # Convert to string

            # Default labels when no keyword is found
            interaction_type = 'Other'
            regulation = 'Other'

            # First matching interaction pattern wins
            for key, pattern in interaction_patterns.items():
                if re.search(pattern, sentence, re.IGNORECASE):
                    interaction_type = key
                    break

            # Up-/down-regulation keywords ("upregulated", "up-regulated", "up regulated", ...)
            if re.search(r'\b(up(?:-| )?regulated)\b', sentence, re.IGNORECASE):
                regulation = 'Up'
            elif re.search(r'\b(down(?:-| )?regulated)\b', sentence, re.IGNORECASE):
                regulation = 'Down'

            # Append the extracted data to the respective lists
            pmcid_list.append(dis_row.pmid)
            sentence_list.append(sentence)
            genes_list.append(genes)
            diseases_list.append(diseases)
            interaction_type_list.append(interaction_type)
            regulation_list.append(regulation)

# Assemble the collected columns into the result table
result_df = pd.DataFrame({

    'pmid': pmcid_list,
    'Sentence': sentence_list,
    'Genes': genes_list,
    'Diseases': diseases_list,
    'Interaction type': interaction_type_list,
    'Regulation': regulation_list
})

# Save the result table to a CSV file
result_df.to_csv('interaction_Gene_Disease.csv', index=False)

## Remove Duplicate Rows


In [None]:
# drop_duplicates() returns a new DataFrame; keep it instead of discarding the result
result_df = result_df.drop_duplicates()
result_df  # show the de-duplicated table as the cell output