In [1]:
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pandas as pd
from urllib.parse import urlparse, urljoin, urldefrag
import socket
import queue
import threading
from bs4 import BeautifulSoup
import os
import hashlib
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Suppress WebDriverManager logs
logging.getLogger('WDM').setLevel(logging.WARNING)

# Configure Selenium to use the Chrome browser
options = Options()
options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Connection pool for WebDriver instances
class WebDriverPool:
    def __init__(self, size):
        self.pool = queue.Queue(maxsize=size)
        self.lock = threading.Lock()
        self.init_drivers(size)

    def init_drivers(self, size):
        driver_path = ChromeDriverManager().install()
        for _ in range(size):
            self.pool.put(webdriver.Chrome(service=Service(driver_path), options=options))

    def get_driver(self):
        return self.pool.get()

    def release_driver(self, driver):
        self.pool.put(driver)

    def quit_all(self):
        while not self.pool.empty():
            driver = self.pool.get()
            driver.quit()

# Function to check if a link is internal
def is_internal(link, domain):
    return domain in urlparse(link).netloc

# Function to check if a link is a file (e.g., PDF)
def is_file(link):
    file_extensions = ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.txt']
    return any(link.lower().endswith(ext) for ext in file_extensions)

# Normalize URLs to ensure trailing slashes and remove fragments
def normalize_url(url):
    parsed_url = urlparse(url)
    defragged_url = urldefrag(url).url  # Remove URL fragments
    if parsed_url.path == '':
        normalized = urlparse(defragged_url)._replace(path='/').geturl()
    else:
        normalized = defragged_url
    return normalized

# Function to scroll down the page to load more content
def scroll_page(driver, scroll_count=3, scroll_pause_time=2):
    """Scrolls down the page a few times to load dynamic content."""
    last_height = driver.execute_script("return document.body.scrollHeight")

    for i in range(scroll_count):
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load the page
        time.sleep(scroll_pause_time)

        # Calculate new scroll height and compare with the last height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # If no new content loaded, stop scrolling
            break
        last_height = new_height

# Main page fetching function with scroll, URL filtering, and depth control
def fetch_page(url, domain, depth, driver, domain_dir, retries=3, scroll_count=3, required_string=None):
    try:
        url = normalize_url(url)
        logging.info(f"Crawling URL: {url}")

        # Exclude file links from scraping
        if is_file(url):
            path = urlparse(url).path
            section_parts = path.strip("/").split("/")
            section = f"/{section_parts[0]}" if section_parts[0] else "/"
            subdomain = urlparse(url).netloc.split('.')[0] if urlparse(url).netloc.count('.') > 1 else ''

            return {
                "url": url,
                "path": path,
                "page_title": None,
                "page_description": None,
                "internal_links": [],
                "internal_link_titles": [],
                "external_links": [],
                "clicks_away_from_root": depth,
                "html_content": None,
                "article_text": None,
                "screenshot": None,
                "scraped_datetime": datetime.utcnow().isoformat(),
                "type": "file",
                "page_loaded": False,
                "section": section,
                "subdomain": subdomain
            }

        # Ensure the URL contains the required string, if specified
        if required_string and required_string not in url:
            logging.info(f"Skipping URL (does not contain required string): {url}")
            return None

        # Load the page using Selenium
        driver.get(url)
        time.sleep(2)  # Wait for JavaScript to load

        # Scroll down the page to load more content
        scroll_page(driver, scroll_count)

        # Check if the page loaded by verifying if the title or a known element is present
        page_loaded = False
        try:
            title = driver.title
            if title:
                page_loaded = True
        except Exception as e:
            logging.error(f"Error checking load status for {url}: {e}")

        # Click "Allow all cookies" button if it exists (case-insensitive)
        try:
            allow_cookies_buttons = driver.find_elements(By.XPATH, "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'allow all cookies')]")
            for button in allow_cookies_buttons:
                if button.is_displayed():
                    button.click()
                    time.sleep(1)  # Wait for the click to be processed
                    break
        except NoSuchElementException:
            logging.info("No 'Allow all cookies' button found")

        # Construct the screenshot filename
        url_path = urlparse(url).path.replace("/", "_")
        hash_digest = hashlib.md5(url.encode()).hexdigest()
        screenshot_filename = os.path.join(domain_dir, f"screenshot_{url_path}_{hash_digest}.png")
        relative_screenshot_path = os.path.relpath(screenshot_filename, start=os.getcwd())

        if page_loaded:
            # Take a screenshot of the entire page
            #original_size = driver.get_window_size()
            #required_width = driver.execute_script('return document.body.parentNode.scrollWidth')
            #required_height = driver.execute_script('return document.body.parentNode.scrollHeight')
            #driver.set_window_size(required_width, required_height)
            #driver.save_screenshot(screenshot_filename)
            #driver.set_window_size(original_size['width'], original_size['height'])  # Reset to original size
            pass

        title = driver.title if page_loaded else None
        description = driver.find_element(By.NAME, "description").get_attribute("content") if driver.find_elements(By.NAME, "description") and page_loaded else ""
        html_content = driver.page_source if page_loaded else None
        soup = BeautifulSoup(html_content, 'html.parser') if html_content else None
        article_text = soup.get_text(separator=' ', strip=True) if soup else None

        internal_links = set()
        internal_link_titles = []
        external_links = set()

        links = driver.find_elements(By.TAG_NAME, "a") if page_loaded else []
        for link in links:
            href = link.get_attribute("href")
            if href:
                href = normalize_url(urljoin(url, href))  # Ensure the URL is absolute and normalized
                if is_internal(href, domain):
                    internal_links.add(href)
                    # Fetch the title of the internal link
                    link_title = link.get_attribute("title") or link.text.strip()  # Use the title attribute or the link text
                    internal_link_titles.append(link_title)
                else:
                    external_links.add(href)

        # Extract the section from the path
        path = urlparse(url).path
        section_parts = path.strip("/").split("/")
        section = f"/{section_parts[0]}" if section_parts[0] else "/"
        subdomain = urlparse(url).netloc.split('.')[0] if urlparse(url).netloc.count('.') > 1 else ''

        # Add the datetime when the page was scraped
        scraped_datetime = datetime.utcnow().isoformat()

        return {
            "url": url,
            "path": path,
            "page_title": title,
            "page_description": description,
            "internal_links": list(internal_links),
            "internal_link_titles": internal_link_titles,
            "external_links": list(external_links),
            "clicks_away_from_root": depth,
            "html_content": html_content,
            "article_text": article_text,
            "screenshot": relative_screenshot_path if page_loaded else None,
            "scraped_datetime": scraped_datetime,
            "type": "page",
            "page_loaded": page_loaded,
            "section": section,
            "subdomain": subdomain
        }
    except StaleElementReferenceException as e:
        if retries > 0:
            logging.warning(f"StaleElementReferenceException encountered at {url}. Retrying... ({retries} retries left)")
            time.sleep(1)
            return fetch_page(url, domain, depth, driver, domain_dir, retries - 1, scroll_count)
        else:
            logging.error(f"Failed to fetch {url} after multiple retries due to StaleElementReferenceException")
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")
    return None

# Main function for crawling
#def crawl_website(root_url, max_depth=2, max_pages=100, domain=None, required_string=None):
#    if domain is None:
#        domain = urlparse(root_url).netloc
#
#    logging.info(f"Starting crawl on domain: {domain}")
#
#    driver_pool = WebDriverPool(size=10)  # Create a pool of 4 WebDriver instances
#
#    # Initialize the queue for BFS
#    url_queue = queue.Queue()
#    url_queue.put((root_url, 0))  # (url, depth)
#
#    # To keep track of visited URLs
#    visited = set()
#
#    # Create a directory to store domain-specific data
#    domain_dir = domain.replace('.', '_')
#    if not os.path.exists(domain_dir):
#        os.makedirs(domain_dir)
#
#    crawled_pages = []
#    with ThreadPoolExecutor(max_workers=10) as executor:
#        futures = {}
#        page_count = 0
#
#        while not url_queue.empty() and page_count < max_pages:
#            current_url, current_depth = url_queue.get()
#
#            if current_depth > max_depth:
#                continue
#
#            if current_url in visited:
#                continue
#
#            visited.add(current_url)
#
#            # Fetch and process the page asynchronously
#            driver = driver_pool.get_driver()
#            future = executor.submit(fetch_page, current_url, domain, current_depth, driver, domain_dir, scroll_count=3, required_string=required_string)
#            futures[future] = current_url
#
#            driver_pool.release_driver(driver)
#
#            # Collect results as they complete
#            for future in as_completed(futures):
#                result = future.result()
#                if result:
#                    crawled_pages.append(result)
#                    page_count += 1
#                    logging.info(f"Page count: {page_count}/{max_pages}")
#
#                    # Add internal links to the queue
#                    if current_depth + 1 <= max_depth:
#                        for internal_link in result.get('internal_links', []):
#                            if internal_link not in visited:
#                                url_queue.put((internal_link, current_depth + 1))
#
#                    # Stop if max pages are reached
#                    if page_count >= max_pages:
#                        break
#
#    driver_pool.quit_all()
#
#    # Save results to CSV
#    df = pd.DataFrame(crawled_pages)
#    df.to_csv(f"{domain_dir}/crawl_results.csv", index=False)
#    logging.info(f"Crawling finished. Results saved to {domain_dir}/crawl_results.csv")
#    return df

# Example usage
#root_url = "https://www.reddit.com/r/PharmaEire/"
#df = crawl_website(root_url, max_depth=1, max_pages=100, required_string='PharmaEire')

In [2]:
def crawl_website(root_url, max_depth=2, max_pages=100, domain=None, required_string=None):
    if domain is None:
        domain = urlparse(root_url).netloc

    logging.info(f"Starting crawl on domain: {domain}")

    driver_pool = WebDriverPool(size=10)  # Create a pool of 4 WebDriver instances

    # Initialize the queue for BFS
    url_queue = queue.Queue()
    url_queue.put((root_url, 0))  # (url, depth)

    # To keep track of visited URLs
    visited = set()

    # Create a directory to store domain-specific data
    domain_dir = domain.replace('.', '_')
    if not os.path.exists(domain_dir):
        os.makedirs(domain_dir)

    # Store crawled pages data
    crawled_pages = []
    
    # Use a set to quickly check for duplicates in the DataFrame
    crawled_urls = set()

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {}
        page_count = 0

        while not url_queue.empty() and page_count < max_pages:
            current_url, current_depth = url_queue.get()

            if current_depth > max_depth:
                continue

            if current_url in visited:
                continue

            visited.add(current_url)

            # Fetch and process the page asynchronously
            driver = driver_pool.get_driver()
            future = executor.submit(fetch_page, current_url, domain, current_depth, driver, domain_dir, scroll_count=3, required_string=required_string)
            futures[future] = current_url

            driver_pool.release_driver(driver)

            # Collect results as they complete
            for future in as_completed(futures):
                result = future.result()
                if result:
                    # Check if the URL already exists in the crawled_urls set
                    if result['url'] not in crawled_urls:
                        crawled_pages.append(result)
                        crawled_urls.add(result['url'])  # Add the URL to the set
                        page_count += 1
                        logging.info(f"Page count: {page_count}/{max_pages}")

                        # Add internal links to the queue
                        if current_depth + 1 <= max_depth:
                            for internal_link in result.get('internal_links', []):
                                if internal_link not in visited:
                                    url_queue.put((internal_link, current_depth + 1))

                    # Stop if max pages are reached
                    if page_count >= max_pages:
                        break

    driver_pool.quit_all()

    # Save results to CSV
    df = pd.DataFrame(crawled_pages)
    df.to_csv(f"{domain_dir}/crawl_results.csv", index=False)
    logging.info(f"Crawling finished. Results saved to {domain_dir}/crawl_results.csv")
    return df

In [3]:
#root_url = "https://www.reddit.com/r/PharmaEire/"
root_url = 'https://www.reddit.com/r/PharmaEire/search/?q=regeneron&type=link&cId=89f61aa7-1cc3-46e7-9d99-0525539c8507&iId=9439a650-390d-42c8-ba77-e103b4f1333b'
df = crawl_website(root_url, max_depth=1, max_pages=100, required_string='PharmaEire')

2024-09-17 15:20:48,623 - INFO - Starting crawl on domain: www.reddit.com
2024-09-17 15:20:53,332 - INFO - Crawling URL: https://www.reddit.com/r/PharmaEire/search/?q=regeneron&type=link&cId=89f61aa7-1cc3-46e7-9d99-0525539c8507&iId=9439a650-390d-42c8-ba77-e103b4f1333b
2024-09-17 15:21:04,891 - INFO - Page count: 1/100
2024-09-17 15:21:04,891 - INFO - Crawling URL: https://www.reddit.com/r/PharmaEire/comments/1fies4e/salary/
2024-09-17 15:21:13,249 - INFO - Page count: 2/100
2024-09-17 15:21:13,249 - INFO - Crawling URL: https://www.reddit.com/r/PharmaEire/comments/11uoeix/how_do_people_where_you_work_describe_other/
2024-09-17 15:21:22,543 - INFO - Page count: 3/100
2024-09-17 15:21:22,543 - INFO - Crawling URL: https://www.reddit.com/r/PharmaEire/comments/1dtfjgs/associate_biotech_production_specialist_regeneron/
2024-09-17 15:21:28,789 - INFO - Page count: 4/100
2024-09-17 15:21:28,789 - INFO - Crawling URL: https://www.reddit.com/r/PharmaEire/
2024-09-17 15:21:40,082 - INFO - Page c

In [4]:
df

Unnamed: 0,url,path,page_title,page_description,internal_links,internal_link_titles,external_links,clicks_away_from_root,html_content,article_text,screenshot,scraped_datetime,type,page_loaded,section,subdomain
0,https://www.reddit.com/r/PharmaEire/search/?q=...,/r/PharmaEire/search/,regeneron - Reddit Search!,,[https://www.reddit.com/r/PharmaEire/comments/...,"[Skip to main content, , Log In, , , Comments,...","[https://reddit.com/t/adventure_games/, https:...",0,"<html lang=""en-US"" class=""is-search-results-pa...",regeneron - Reddit Search! Skip to main conten...,www_reddit_com/screenshot__r_PharmaEire_search...,2024-09-17T19:21:04.891183,page,True,/r,www
1,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1fies4e/salary/,Salary : r/PharmaEire,,[https://www.reddit.com/r/irishpersonalfinance...,"[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Salary : r/PharmaEire Skip to main content Sal...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:13.249326,page,True,/r,www
2,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/11uoeix/how_do_people_w...,How do people where you work describe other co...,,[https://www.reddit.com/r/ireland/comments/1fa...,"[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",How do people where you work describe other co...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:22.543255,page,True,/r,www
3,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1dtfjgs/associate_biote...,Associate Biotech Production Specialist- Regen...,,"[https://www.reddit.com/user/rich3248/, https:...","[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Associate Biotech Production Specialist- Regen...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:28.789386,page,True,/r,www
4,https://www.reddit.com/r/PharmaEire/,/r/PharmaEire/,PharmaEire,,"[https://www.reddit.com/user/ajeganwalsh/, htt...","[Skip to main content, , Log In, , , , , , , ,...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""theme-beta theme-dar...",PharmaEire Skip to main content PharmaEire Ope...,www_reddit_com/screenshot__r_PharmaEire__27da0...,2024-09-17T19:21:40.082723,page,True,/r,www
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1f2emay/westpharma_phon...,Westpharma Phone Interview : r/PharmaEire,,[https://www.reddit.com/r/PharmaEire/comments/...,"[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Westpharma Phone Interview : r/PharmaEire Ski...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:29:02.426898,page,True,/r,www
67,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1d3385i/business_in_the...,Business in the Emerald Isle is booming as the...,,[https://www.reddit.com/r/ireland/comments/1f1...,"[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Business in the Emerald Isle is booming as the...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:29:10.320702,page,True,/r,www
68,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1bwciur/process_scienti...,Process scientist role pay scale in regeneron ...,,[https://www.reddit.com/r/ireland/comments/1fa...,"[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Process scientist role pay scale in regeneron ...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:29:18.801833,page,True,/r,www
69,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/18i5242/where_do_i_actu...,Where do I actually apply for operator roles i...,,"[https://www.reddit.com/user/rich3248/, https:...","[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Where do I actually apply for operator roles i...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:29:27.276779,page,True,/r,www


In [5]:
df_clean = df[df['path'] != '/r/PharmaEire/search/']

In [6]:
df_clean

Unnamed: 0,url,path,page_title,page_description,internal_links,internal_link_titles,external_links,clicks_away_from_root,html_content,article_text,screenshot,scraped_datetime,type,page_loaded,section,subdomain
1,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1fies4e/salary/,Salary : r/PharmaEire,,[https://www.reddit.com/r/irishpersonalfinance...,"[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Salary : r/PharmaEire Skip to main content Sal...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:13.249326,page,True,/r,www
2,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/11uoeix/how_do_people_w...,How do people where you work describe other co...,,[https://www.reddit.com/r/ireland/comments/1fa...,"[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",How do people where you work describe other co...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:22.543255,page,True,/r,www
3,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1dtfjgs/associate_biote...,Associate Biotech Production Specialist- Regen...,,"[https://www.reddit.com/user/rich3248/, https:...","[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Associate Biotech Production Specialist- Regen...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:28.789386,page,True,/r,www
4,https://www.reddit.com/r/PharmaEire/,/r/PharmaEire/,PharmaEire,,"[https://www.reddit.com/user/ajeganwalsh/, htt...","[Skip to main content, , Log In, , , , , , , ,...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""theme-beta theme-dar...",PharmaEire Skip to main content PharmaEire Ope...,www_reddit_com/screenshot__r_PharmaEire__27da0...,2024-09-17T19:21:40.082723,page,True,/r,www
5,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/14qbrqb/opportunity_at_...,Opportunity at Zimmer : r/PharmaEire,,[https://www.reddit.com/r/PharmaEire/comments/...,"[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Opportunity at Zimmer : r/PharmaEire Skip to m...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:46.400311,page,True,/r,www
6,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1ai59xx/manufacturing_o...,Manufacturing Opportunity at Regeneron : r/Pha...,,[https://www.reddit.com/r/PharmaEire/comments/...,"[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Manufacturing Opportunity at Regeneron : r/Pha...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:21:55.159384,page,True,/r,www
7,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/14qa5t9/my_impression_o...,My impression of Regeneron from yesterday's po...,,[https://www.reddit.com/r/ireland/comments/1fa...,"[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",My impression of Regeneron from yesterday's po...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:22:03.686749,page,True,/r,www
9,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1cgv5wq/qc_analyst_samp...,QC analyst Samples management role in Regenero...,,"[https://www.reddit.com/user/TrivialFacts/, ht...","[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",QC analyst Samples management role in Regenero...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:22:20.420391,page,True,/r,www
10,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1bvq2mx/update_job_scre...,[UPDATE] job screening background check in the...,,[https://www.reddit.com/r/DevelEire/comments/1...,"[, , Log In, , , Go to PharmaEire, r/PharmaEir...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",[UPDATE] job screening background check in the...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:22:26.254156,page,True,/r,www
11,https://www.reddit.com/r/PharmaEire/comments/1...,/r/PharmaEire/comments/1ccbdet/do_any_regenero...,Do any Regeneron employees know if it offers h...,,[https://www.reddit.com/tldr/best-hybrid-techn...,"[Skip to main content, , Log In, , , Go to Pha...","[https://reddit.com/t/adventure_games/, https:...",1,"<html lang=""en-US"" class=""is-shredtop-pdp them...",Do any Regeneron employees know if it offers h...,www_reddit_com/screenshot__r_PharmaEire_commen...,2024-09-17T19:22:32.638437,page,True,/r,www


In [7]:
All_comments = []
for i in range(len(df_clean)):
    test_string = df_clean['html_content'].to_list()[i]
    All_comments += [sub_string.split('</p>')[0] for sub_string in test_string.split('<p>')[1:]][4:]

In [8]:
len(All_comments)

705

In [9]:

All_comments=[i.replace('\n','') for i in All_comments]
All_comments=[i.replace('  ',' ') for i in All_comments]
All_comments=[i.replace('  ',' ') for i in All_comments]
All_comments=[i.replace('  ',' ') for i in All_comments]
All_comments


[' Agree with Regeneron - think I’ve PTSD from working there. Site head has the worst reputation too. He’s a snobby prick. Treat contractors like shit no wonder they can’t hold onto them ',
 " Oh I definitely do too. I had some contractors reporting into me and it was scary what management made me do. Like I had to fire one guy after 3 days and I wasn't told the reason. Also was verbally abused in and outside of work but someone who is now at AD level, of course because of his reputation in work as a brilliant manager HR and my department head did absolutely nothing. Shit company, shit pay. I hope eli lilly drain them of staff. ",
 " I'm glad everything I heard was true about that place lol ",
 ' My impression of Pfizer is that they are ruthless but efficient ',
 ' Pfizer - Heavily paranoid on Safety which is fair, but sometimes it wastes half the day on permits, toolbox talks etc. Otherwise they are efficient. Never really here of anyone complaining about working there bar the safety 

In [10]:
# test_string

In [11]:
# test_string.split('<shreddit-comment author')[1:]

In [12]:
All_comments_word_cloud_1 = All_comments


## Create Word Cloud


In [13]:
import os
os.environ['MPLBACKEND'] = 'TkAgg'
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image


In [14]:
import nltk
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import more_itertools as mit
from collections import Counter
from itertools import chain
from nltk.stem import WordNetLemmatizer

In [15]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

In [16]:
lemmatizer = WordNetLemmatizer()

In [17]:
dic = {}
# All_comments_word_cloud_1

In [18]:
for j in All_comments_word_cloud_1:
    tokens = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(j.translate(str.maketrans('', '', string.punctuation))) if word.lower() not in stopwords.words()]
    # bgs = nltk.trigrams(tokens)
    bgs = nltk.bigrams(tokens)
    fdist = nltk.FreqDist(bgs)
    # fdist = nltk.FreqDist(tokens)
    
    for k,v in fdist.items():
        if k in dic.keys():
            dic[k] += v
        else:
            dic[k] = v
            
# All_comments_word_cloud_1
bgs

<generator object bigrams at 0x17506b920>

In [19]:
ascending = sorted(dic.items(), key=lambda kv: kv[1], reverse=True)
scores = {}
ascending
for i in ascending:
    print(i)
    # scores[i[0][0] + ' ' + i[0][1] + ' ' + i[0][2]] = i[1]
    scores[i[0][0] + ' ' + i[0][1]] = i[1]

    

(('ahahaahah', 'regeneron'), 24)
(('regeneron', 'ahahahahah'), 24)
(('ahahahahah', 'regeneron'), 24)
(('regeneron', 'ahahaahah'), 23)
(('pharma', 'company'), 16)
(('eli', 'lilly'), 13)
(('rpl', 'classrelative'), 12)
(('classrelative', 'pointereventsauto'), 12)
(('pointereventsauto', 'cursorpointer'), 12)
(('cursorpointer', 'underline'), 12)
(('relnoopener', 'nofollow'), 12)
(('nofollow', 'ugc'), 12)
(('place', 'work'), 8)
(('operator', 'role'), 8)
(('year', 'ago'), 8)
(('ive', 'worked'), 7)
(('10', 'year'), 7)
(('work', 'environment'), 7)
(('financial', 'advisor'), 6)
(('’', 'heard'), 6)
(('experience', '’'), 6)
(('ive', 'heard'), 5)
(('6', 'month'), 5)
(('hope', 'help'), 5)
(('entry', 'level'), 5)
(('regulatory', 'affair'), 5)
(('year', 'experience'), 5)
(('medical', 'device'), 5)
(('beckman', 'coulter'), 5)
(('2', 'year'), 5)
(('12', 'month'), 5)
(('stepping', 'stone'), 5)
(('3', 'day'), 4)
(('3rd', 'party'), 4)
(('party', 'comparison'), 4)
(('comparison', 'site'), 4)
(('debt', 'reli

In [20]:
# ascending
# wc = WordCloud().generate(All_comments_word_cloud_1' '.join(All_comments))
# wc = WordCloud().generate(All_comments)
wc = WordCloud().generate(' '.join(All_comments_word_cloud_1))
plt.imshow(wc)
plt.show()



In [21]:
wc = WordCloud(background_color="white", max_words=50).generate_from_frequencies(scores)
# wc = WordCloud(background_color="white", max_words=50).generate_from_frequencies(ascending)
plt.axis('off')
plt.figure(figsize=(10, 5))
plt.imshow(wc)
plt.show()

invalid command name "10827885760process_stream_events"
    while executing
"10827885760process_stream_events"
    ("after" script)
can't invoke "event" command: application has been destroyed
    while executing
"event generate $w <<ThemeChanged>>"
    (procedure "ttk::ThemeChanged" line 6)
    invoked from within
"ttk::ThemeChanged"


: 