In [9]:
!pip install requests beautifulsoup4



In [12]:
import pandas as pd
import warnings
import ast
warnings.filterwarnings("ignore")
df = pd.read_csv("/content/Task2_pubmed_secondary_metabolites_Bacteria.csv", encoding="latin-1")
df['ABSTRACT'] = df['ABSTRACT'].astype(str)
with open('/content/pathway_names.txt', 'r') as file:
    words_to_search = [word.strip().lower() for word in file.readlines()]

import requests
from bs4 import BeautifulSoup

def fetch_fulltext(pmcid):
    url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    fulltext = soup.find("div", {"class": "fullText"})
    if fulltext:
        return fulltext.get_text()
    else:
        return None

matches = {}
for index, row in df.iterrows():
    pmid = row["PMID"]
    pmcid = row['PMCID']
    if pd.notnull(pmcid):
        fulltext = fetch_fulltext(pmcid)
        if fulltext:
            paragraph = fulltext.lower()
            matches[pmid] = []
            for word in words_to_search:
                if word in paragraph:
                    matches[pmid].append(word)
        else:
            paragraph = row['ABSTRACT'].lower()
            matches[pmid] = []
            for word in words_to_search:
                if word in paragraph:
                    matches[pmid].append(word)
    else:
        paragraph = row['ABSTRACT'].lower()
        matches[pmid] = []
        for word in words_to_search:
            if word in paragraph:
                matches[pmid].append(word)

matches_df = pd.DataFrame({'pmid': list(matches.keys()), 'pathways': list(matches.values())})
matches_df.to_csv("virus_Pathways_20.csv", index=False)
pathways_df = pd.read_csv("/content/virus_Pathways_20.csv")
for index, row in pathways_df.iterrows():
    pathways_df['pathways'][index] = ast.literal_eval(pathways_df['pathways'][index])

empty_list_count = pathways_df['pathways'].apply(lambda x: len(x) == 2).sum()
print("Number of rows with empty lists:", empty_list_count)

df_exploded = pathways_df['pathways'].explode()
unique_pathways = df_exploded.unique()
print(unique_pathways)

import numpy as np

# Assuming unique_pathways is a numpy array
unique_pathways_list = unique_pathways.astype(str)

# Replace nan values with empty strings
unique_pathways_list[unique_pathways_list == 'nan'] = ''

# Convert to a list and remove duplicates
unique_pathways_list = list(set(unique_pathways_list))

# Remove empty string from the list if needed
unique_pathways_list = [pathway for pathway in unique_pathways_list if pathway]

unique_count = len(set(unique_pathways_list))
set_of_pathways = set(unique_pathways_list)
pathways_df = pd.read_csv("/content/virus_Pathways_20.csv")

pmid_series = []
pathways_series = []
length_series = []

for pathway in set_of_pathways:
    filter_df = pathways_df[pathways_df['pathways'].str.contains(pathway)]
    pmids = filter_df['pmid'].tolist()
    pmidlst = ""
    for p in pmids:
        pmidlst = pmidlst + str(p) + ","
    pmidlst = pmidlst[:-1]
    pathways_series.append(pathway)
    pmid_series.append(pmidlst)
    length_series.append(len(pmids))

consensus_pathways_df = pd.DataFrame({'pathways': pathways_series, 'PMID': pmid_series, 'length': length_series})
consensus_pathways_df.to_csv("virus_Pathways_20_final.csv", index=False)

Number of rows with empty lists: 87
[nan 'pathways' 'metabolic pathways' 'two-component system'
 'breast cancer' 'apoptosis' 'cell cycle' 'melanoma' 'carbon metabolism'
 'sulfur metabolism' 'plant-pathogen interaction' 'proteasome'
 'parkinson disease' 'nitrogen metabolism' 'prodigiosin biosynthesis'
 'streptomycin biosynthesis' 'prostate cancer' 'toxoplasmosis'
 'hepatocellular carcinoma' 'rna degradation'
 'biosynthesis of amino acids'
 'microbial metabolism in diverse environments' 'legionellosis'
 'pancreatic cancer' 'pertussis' 'osteoclast differentiation'
 'tight junction' 'anthocyanin biosynthesis' 'nucleotide excision repair'
 'glioma' 'base excision repair' 'ampk signaling pathway'
 'bacterial secretion system' 'methane metabolism'
 'cysteine and methionine metabolism' 'biotin metabolism'
 'terpenoid backbone biosynthesis' 'steroid hormone biosynthesis'
 'pathways in cancer' 'proteoglycans in cancer' 'zeatin biosynthesis'
 'bladder cancer']
