In [1]:
#! pip install webdriver_manager
import requests
from pprint import pprint
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
import os
import sys
import random

In [14]:
import re
from typing import List, Dict
from pprint import pprint
import json
import requests
from bs4 import BeautifulSoup
from time import sleep
import os
import random
from tqdm import tqdm

class PubMedRecord:
    def __init__(self, raw_data: str):
        self.raw_data = raw_data
        self.chunks = self._make_pmid_chungs()
        self.parsed_raw = self._extract_all()
        self.parsed = self._sim_id_to_list()
    
    def filter_abstract_4_names(self, author):
        filtered_chunks = [entry for entry in self.parsed if author in entry['FAU']]
        # get the last chunk which does not contain the author, then we will stop scraping
        index = 0
        for entry in self.parsed:
            if author not in entry['FAU']:
                break
            index += 1
        return filtered_chunks, index 

    def _make_pmid_chungs(self):
        # find the first pmid occurence and put all text until the next pmid into a list, using regex is appropriate
        removed_start = self.raw_data[re.search(r'\bPMID\s*-\s*\d+', self.raw_data).start():].strip()
        split_data =  re.split(r'(?=PMID\s*-\s*\d+|PMID-\s*\d+)', removed_start)[1:]
        return split_data
    
    def _extract_all(self):
        results = []
        for chunk in self.chunks:
            # remove all newlines
            chunk = chunk.replace('\n', ' ')
            # extract all the fields using regex
            results.append(re.findall(r'([A-Z]{2,6})\s*-\s*(.+?)(?=\s[A-Z]{2,6}\s*-\s*|\Z)', chunk.strip()))
        return results

    def _sim_id_to_list(self):
        self.parsed = []
        for chunk_raw in self.parsed_raw:
            chunk_parsed = {}
            for key, value in chunk_raw:
                if key not in chunk_parsed:
                    chunk_parsed[key] = []
                chunk_parsed[key].append(value.replace('\r', ''))
            self.parsed.append(chunk_parsed)
        return self.parsed

class SinglePubMedSearcher:
    def __init__(self, author):
        self.author = author
        self.output_dir = f'results/{author.replace(", ", "_")}/processed'
        self.raw_dir = f'results/{author.replace(", ", "_")}/raw'
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.raw_dir, exist_ok=True)

    def author_url(self, page):
        """Construct the PubMed search URL for the given author and page."""
        return f'https://pubmed.ncbi.nlm.nih.gov/?term={self.author.replace(" ", "+")}%5Bauthor%5D&format=pubmed&size=200&page={page}'

    def save_chunks(self, filtered_chunks):
        """Save filtered chunks to JSON files."""
        for chunk in filtered_chunks:
            pmid = chunk.get('PMID', None)
            
            if pmid is not None:
                pmid_cleaned = ''.join(filter(str.isalnum, pmid[0]))  # Clean the PMID
                file_path = os.path.join(self.output_dir, f'{pmid_cleaned}.json')

                with open(file_path, 'w') as file:
                    json.dump(chunk, file, ensure_ascii=False, indent=4)  # Save as pretty JSON

    def search_author(self):
        """Search for publications by the specified author on PubMed."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive'
        }
        
        current_page = 1
        while current_page < 6:
            url_pubmed = self.author_url(current_page)
            response = requests.get(url_pubmed, headers=headers)

            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                parsed = PubMedRecord(str(soup.get_text()))  # Assuming PubMedRecord is defined elsewhere
                filtered_chunks, index = parsed.filter_abstract_4_names(self.author)
                #print(f"Page {url_pubmed} has {len(filtered_chunks)} entries, index: {index}")

                self.save_chunks(filtered_chunks)  # Save filtered results
                
                # Save raw response
                with open(os.path.join(self.raw_dir, f'{current_page}.html'), 'w') as file:
                    file.write(str(soup))

                sleep(random.uniform(5, 10))  # Simulate human-like delay
            else:
                break

            current_page += 1

class MultPubMedSearcher:
    def __init__(self, root_author, depth=1):
        self.root_author = root_author
        self.depth = depth
        self.searched_authors = [root_author]
        self.metadata = {'authors': [root_author], 'depth': [depth]}
    
    def get_new_authors(self):
        # extract from the hole results directory all authors, the authors are saved 
        mom_dir = 'results/'
        son_dir = 'processed'
        author_dirs = os.listdir(mom_dir)
        file_paths = [
            os.path.join(mom_dir, author, son_dir, file)
            for author in author_dirs
            for file in os.listdir(os.path.join(mom_dir, author, son_dir))
            if os.path.isdir(os.path.join(mom_dir, author, son_dir))  # Check if processed dir exists
        ]
        authors = []
        for file_path in file_paths:
            with open(file_path, 'r') as file:
                data = json.load(file)
                authors.extend(data['FAU'])
        return authors

    def search(self):
        # First search the root author
        searcher = SinglePubMedSearcher(self.root_author)
        searcher.search_author()
        
        # Get new authors
        for i in range(1, self.depth):
            authors = self.get_new_authors()
            
            # Use tqdm to show progress
            for author in tqdm(authors, desc=f'Searching authors (Depth {i})', unit='author'):
                if author not in self.searched_authors:
                    self.searched_authors.append(author)
                    searcher = SinglePubMedSearcher(author)
                    searcher.search_author()

searcher = MultPubMedSearcher('Mishra, Neha', 2)
searcher.search()


https://pubmed.ncbi.nlm.nih.gov/?term=Mishra,+Neha%5Bauthor%5D&format=pubmed&size=200&page=1
Page https://pubmed.ncbi.nlm.nih.gov/?term=Mishra,+Neha%5Bauthor%5D&format=pubmed&size=200&page=1 has 111 entries, index: 111
https://pubmed.ncbi.nlm.nih.gov/?term=Mishra,+Neha%5Bauthor%5D&format=pubmed&size=200&page=2
4036
['Aden, Konrad', 'Bartsch, Kareen', 'Dahl, Joseph', 'Reijns, Martin A M', 'Esser, Daniela', 'Sheibani-Tezerji, Raheleh', 'Sinha, Anupam', 'Wottawa, Felix', 'Ito, Go', 'Mishra, Neha', 'Knittler, Katharina', 'Burkholder, Adam', 'Welz, Lina', 'van Es, Johan', 'Tran, Florian', 'Lipinski, Simone', 'Kakavand, Nassim', 'Boeger, Christine', 'Lucius, Ralph', 'von Schoenfels, Witigo', 'Schafmayer, Clemens', 'Lenk, Lennart', 'Chalaris, Athena', 'Clevers, Hans', 'Röcken, Christoph', 'Kaleta, Christoph', 'Rose-John, Stefan', 'Schreiber, Stefan', 'Kunkel, Thomas', 'Rabe, Björn', 'Rosenstiel, Philip', 'Saggau, Carina', 'Bacher, Petra', 'Esser, Daniela', 'Rasa, Mahdi', 'Meise, Silja', 'Mohr

KeyboardInterrupt: 

In [1]:
import re
from typing import List, Dict
from pprint import pprint
import json
import requests
from bs4 import BeautifulSoup
from time import sleep
import os
import random
from tqdm import tqdm

class PubMedRecord:
    def __init__(self, raw_data: str):
        self.raw_data = raw_data
        self.chunks = self._make_pmid_chungs()
        self.parsed_raw = self._extract_all()
        self.parsed = self._sim_id_to_list()
        pprint(self.parsed)
    def filter_abstract_4_names(self, author):
        filtered_chunks = [entry for entry in self.parsed if author in entry['FAU']]
        # get the last chunk which does not contain the author, then we will stop scraping
        index = 0
        for entry in self.parsed:
            if author not in entry['FAU']:
                break
            index += 1
        return filtered_chunks, index 

    def _make_pmid_chungs(self):
        # find the first pmid occurence and put all text until the next pmid into a list, using regex is appropriate
        removed_start = self.raw_data[re.search(r'\bPMID\s*-\s*\d+', self.raw_data).start():].strip()
        split_data =  re.split(r'(?=PMID\s*-\s*\d+|PMID-\s*\d+)', removed_start)[1:]
        return split_data
    
    def _extract_all(self):
        results = []
        for chunk in self.chunks:
            # remove all newlines
            chunk = chunk.replace('\n', ' ')
            # extract all the fields using regex
            results.append(re.findall(r'(^|\s)([A-Z]{2,6})\s*-\s*(.+?)(?=\s[A-Z]{2,6}\s*-\s*|\Z)', chunk.strip()))
        return results

    def _sim_id_to_list(self):
        self.parsed = []
        for chunk_raw in self.parsed_raw:
            chunk_parsed = {}
            for key, value in chunk_raw:
                if key not in chunk_parsed:
                    chunk_parsed[key] = []
                chunk_parsed[key].append(value.replace('\r', ''))
            self.parsed.append(chunk_parsed)
        return self.parsed
    
file_path = '/home/tom-ruge/Schreibtisch/Fachhochschule/Semester_2/Social Media Analytics/StalkYourProf/results/Aden_Konrad/raw/1.html'
with open(file_path, 'r') as file:
    data = file.read()
    # make soup object
    soup = BeautifulSoup(data, 'html.parser')
    parsed = PubMedRecord(str(soup.get_text()))  # Assuming PubMedRecord is defined elsewhere
    filtered_chunks, index = parsed.filter_abstract_4_names('Aden, Konrad')
    print(f"Found {len(filtered_chunks)} entries, index: {index}")

[{'AB': ['OBJECTIVE: One of the current hypotheses to explain the '
         'proinflammatory immune        response in IBD is a dysregulated T '
         'cell reaction to yet unknown intestinal        antigens. As such, it '
         'may be possible to identify disease-associated T cell        '
         'clonotypes by analysing the peripheral and intestinal T-cell '
         'receptor (TCR)        repertoire of patients with IBD and controls. '
         'DESIGN: We performed bulk TCR        repertoire profiling of both '
         'the TCR alpha and beta chains using high-throughput        '
         'sequencing in peripheral blood samples of a total of 244 patients '
         'with IBD and        healthy controls as well as from matched blood '
         'and intestinal tissue of 59        patients with IBD and disease '
         'controls. We further characterised specific T cell        clonotypes '
         'via single-cell RNAseq. RESULTS: We identified a group of '
         'clo

In [2]:
! pip install pyvis
from pyvis.network import Network
import networkx as nx

# Create a NetworkX graph
G = nx.cycle_graph(10)

# Create a PyVis network
net = Network(notebook=True, height='600px', width='100%')
net.from_nx(G)

# Customize the appearance
net.show_buttons(filter_=['physics'])
net.set_options("""
var options = {
  "nodes": {
    "shape": "dot",
    "size": 20
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -8000,
      "centralGravity": 0.3,
      "springLength": 95,
      "springConstant": 0.04
    }
  }
}
""")

# Display the network
net.show("network.html")


Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-3.3.0-py3-none-any.whl.metadata (8.3 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jsonpickle-3.3.0-py3-none-any.whl (42 kB)
Installing collected packages: jsonpickle, pyvis
Successfully installed jsonpickle-3.3.0 pyvis-0.3.2
network.html
