In [1]:
"""
This script scrapes links from multiple pages of a website, filters and processes the links, and saves statistical data into separate CSV files.

1. Scraping Links:
   - The script uses Selenium WebDriver to scrape links from multiple pages of the website.
   - It scrolls down each page to load dynamic content and collects all links.
   - The collected links are filtered to include only those starting with a specified prefix.

2. Processing Links:
   - Unique base links are extracted from the filtered links.
   - For each link in the list:
     - If the link does not exist in the DataFrame 'data_df', it fetches HTML code, extracts statistical data, and appends it to the DataFrame.

3. Saving Data:
   - After processing, the forum data is saved into pickle file for later use.
Note: 
- Ensure the ChromeDriver executable is placed in the correct path for Selenium to work.
- Adjust the number of pages to scrape and other parameters as needed.
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options




In [2]:
# Record the start time
start_time = time.time()

In [3]:
def get_html_code_of_given_url(url):
    """
    Gets the HTML code of a given URL using Selenium WebDriver.

    Args:
    url (str): The URL to fetch the HTML code from.

    Returns:
    str: The HTML code of the URL.
    """
    # Path to your ChromeDriver executable
    chrome_driver_path = "C:\\Users\\mehta\\OneDrive\\Työpöytä\\chromedriver.exe"
    


    # Set up Chrome options (optional)
    chrome_options = webdriver.ChromeOptions()
    # Run Chrome in headless mode (without opening the browser window)
    chrome_options.add_argument('--headless')
    # Create a webdriver instance
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Navigate to the URL
    driver.get(url)
    

    # Scroll to the bottom of the page (or perform the scrolling logic you need)
    # Here, it scrolls 30 times with a small delay to load more content
    for i in range(20):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # Adjust the delay as needed

    html = driver.page_source

    # Close the browser window
    driver.quit()
    return html


In [4]:
# Sample DataFrame to collect results
def get_statistics_from_website(html, data_df, url):
    soup = BeautifulSoup(html, 'html.parser')

    # Collect statistics data
    statistics = {
        'URL': url,
        'Created At': None,
        'Last Reply': None,
        'Visits': None,
        'Replies': None,
        'Users': None,
        'Likes': None,
        'Comments Details': []
    }

    # Extract various statistics
    views_element = soup.find('li', class_='secondary views')
    if views_element:
        span_element = views_element.find('span', class_='number heatmap-med') or views_element.find('span', class_='number')
        if span_element:
            statistics['Visits'] = span_element.text.strip()

    created_at_element = soup.find('li', class_='created-at')
    if created_at_element:
        span_element = created_at_element.find('div', class_='topic-map-post created-at')
        if span_element:
            statistics['Created At'] = span_element.text.strip()

    replies_element = soup.find('li', class_='replies')
    if replies_element:
        span_element = replies_element.find('span', class_='number')
        if span_element:
            statistics['Replies'] = span_element.text.strip()

    last_reply_element = soup.find('li', class_='last-reply')
    if last_reply_element:
        div_element = last_reply_element.find('div', class_='topic-map-post last-reply')
        if div_element:
            statistics['Last Reply'] = div_element.text.strip()

    users_element = soup.find('li', class_='secondary users')
    if users_element:
        span_element = users_element.find('span', class_='number')
        if span_element:
            statistics['Users'] = span_element.text.strip()

    likes_element = soup.find('li', class_='secondary likes')
    if likes_element:
        span_element = likes_element.find('span', class_='number')
        if span_element:
            statistics['Likes'] = span_element.text.strip()

    # Collect comments data and their 'post_id'
    posts = soup.find_all('article', class_='boxed onscreen-post')
    for post in posts:
        try:
            # Retrieve the post_id from an identifier in the article tag (example: 'data-post-id')
            post_id = post.get('id', None)

            # Collect the comments
            comments = post.find('div', class_='regular contents')
            if comments:
                for comment in comments.find_all('p'):
                    comment_text = comment.text.strip()

                    # Find comment likes
                    likes = 0
                    likes_element = comment.find('button', 
                        class_='widget-button btn-flat button-count like-count highlight-action regular-likes btn-icon-text')
                    if likes_element:
                        match = re.match(r'^\d+', likes_element.text)
                        if match:
                            likes = int(match.group(0))

                    # Find timestamp if possible
                    timestamp = None
                    post_info_div = comment.find_parent('article', class_='boxed onscreen-post').find('div', class_='post-infos')
                    if post_info_div:
                        timestamp_element = post_info_div.find('span', class_='relative-date')
                        if timestamp_element and 'title' in timestamp_element.attrs:
                            timestamp = timestamp_element['title']

                    # Append comment details with post_id to the statistics dictionary
                    statistics['Comments Details'].append({
                        "comment": comment_text,
                        "timestamp": timestamp,
                        "likes": likes,
                        "post_id": post_id  # Store the post_id
                    })
        except Exception as e:
            print(f"Error processing comment: {e}")

    # Create a new DataFrame from collected data
    new_data = pd.DataFrame([statistics])
    
    # Append new data to existing DataFrame
    data_df = pd.concat([data_df, new_data], ignore_index=True)
    
    print(f"Processed URL: {url}")
    
    return data_df


In [5]:
def scrape_links_multiple_pages(url, num_pages=30):
    """
    Scrape links from multiple pages by scrolling down and collecting links.

    Args:
    url (str): The URL of the page to start scraping from.
    num_pages (int): The number of pages to scrape.

    Returns:
    list: A list of links collected from multiple pages.
    """
    # Set up Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')  # Optional but recommended for headless mode

    # Initialize Chrome WebDriver with headless options
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)

    # Scroll and wait for dynamic content to load multiple times
    for _ in range(num_pages):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Adjust sleep time as needed

        # Wait for dynamic content to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "a")))

    # Collect all links on the page
    all_links = [link.get_attribute("href") for link in driver.find_elements(By.TAG_NAME, "a")]

    # Quit the WebDriver
    driver.quit()

    return all_links

# Example usage
url_to_scrape = "https://keskustelut.inderes.fi/c/osakkeet"
all_links_multiple_pages = scrape_links_multiple_pages(url_to_scrape)


In [6]:
# Print the collected links
for link in all_links_multiple_pages:
    print(link)

https://www.inderes.fi/
https://www.inderes.fi/
https://www.inderes.fi/inderes-mallisalkku
https://www.inderes.fi/inderestv
https://www.inderes.fi/osakevertailu
https://keskustelut.inderes.fi/c/osakkeet/17#main-container
https://keskustelut.inderes.fi/
https://keskustelut.inderes.fi/c/osakkeet/17/l/latest
https://keskustelut.inderes.fi/t/seuraava-tuloslive-withsecure-q124-keskiviikkona-24-4-klo-7-55/49030
https://keskustelut.inderes.fi/c/osakkeet/17
https://keskustelut.inderes.fi/tag/live
https://keskustelut.inderes.fi/tag/tuloskausi
https://keskustelut.inderes.fi/tag/tuloslive
https://keskustelut.inderes.fi/t/seuraava-tuloslive-withsecure-q124-keskiviikkona-24-4-klo-7-55/49030
https://keskustelut.inderes.fi/u/tomi_valkeajarvi
https://keskustelut.inderes.fi/t/seuraava-tuloslive-withsecure-q124-keskiviikkona-24-4-klo-7-55/49030/3
https://keskustelut.inderes.fi/t/alueesta-osakkeet/67
https://keskustelut.inderes.fi/c/osakkeet/17
https://keskustelut.inderes.fi/t/alueesta-osakkeet/67
https:

In [7]:
# Initialize an empty set to store unique base URLs
unique_base_urls = set()

# Initialize an empty list to store duplicate URLs
duplicate_urls = []

# Print the collected links after removing numbers and slashes at the end
for link in all_links_multiple_pages:
    cleaned_link = re.sub(r'/\d+$', '', link)
    if cleaned_link.startswith('https://keskustelut.inderes.fi/t/'):
        # Extract the base URL
        base_url = '/'.join(cleaned_link.split('/')[:-1])
        
        # Check if the base URL is already in the set
        if base_url in unique_base_urls:
            # If duplicate, add to the duplicate list
            duplicate_urls.append(cleaned_link)
        else:
            # If unique, add to the set
            unique_base_urls.add(base_url)

# Print the duplicate URLs
if duplicate_urls:
    print("Duplicate URLs:")
    for url in duplicate_urls:
        print(url)
else:
    print("No duplicate URLs found.")



Duplicate URLs:
https://keskustelut.inderes.fi/t/seuraava-tuloslive-withsecure-q124-keskiviikkona-24-4-klo-7-55
https://keskustelut.inderes.fi/t/alueesta-osakkeet
https://keskustelut.inderes.fi/t/alueesta-osakkeet
https://keskustelut.inderes.fi/t/tesla-johtava-tulevaisuuden-autovalmistaja-osa-2
https://keskustelut.inderes.fi/t/cloudflare-cdn-aa-ja-taistelua-palvelunestohyokkayksia-vastaan
https://keskustelut.inderes.fi/t/talenom-automatisoiduilla-prosesseilla-tehokkaammaksi
https://keskustelut.inderes.fi/t/ostin-myin-juuri-asken-osa-6
https://keskustelut.inderes.fi/t/nokia-sijoituskohteena-osa-3
https://keskustelut.inderes.fi/t/capman-onko-varoja-mista-jakaa
https://keskustelut.inderes.fi/t/lvmh-luksusta-salkkuun
https://keskustelut.inderes.fi/t/porssien-suunta-osa-3
https://keskustelut.inderes.fi/t/nordea-pohjoismainen-pankkipriimus
https://keskustelut.inderes.fi/t/neste-ilmastonmuutostaistelun-eturivissa
https://keskustelut.inderes.fi/t/titanium-kasvun-toista-tukijalkaa-etsimassa
htt

In [8]:
for items in unique_base_urls:
    print(items)

https://keskustelut.inderes.fi/t/onnistumiset-sijoittamisessa
https://keskustelut.inderes.fi/t/macys-inc-tavarataloketju-arvosijoittajalle
https://keskustelut.inderes.fi/t/suomalaisten-porssiyhtioiden-tunnusluvut
https://keskustelut.inderes.fi/t/nattopharma-johtava-k2-valmistaja
https://keskustelut.inderes.fi/t/lindex-group-stockmann-group
https://keskustelut.inderes.fi/t/genovis-ab-biotieteiden-tyovalineet-ja-palvelut
https://keskustelut.inderes.fi/t/cadeler-a-s-tuulivoimaloiden-asennusta-merella
https://keskustelut.inderes.fi/t/autostore-norjalainen-varastoinnin-pioneeri-listautumassa
https://keskustelut.inderes.fi/t/fractal-gaming-group-ab
https://keskustelut.inderes.fi/t/smile-hymya-myos-sijoittajalle
https://keskustelut.inderes.fi/t/exel-composites
https://keskustelut.inderes.fi/t/satellos-biosciences-rebuilding-muscle-from-within
https://keskustelut.inderes.fi/t/tesla-johtava-tulevaisuuden-autovalmistaja-osa-2
https://keskustelut.inderes.fi/t/kemira-globaali-kemianyhtio
https://k

In [9]:
print(len(unique_base_urls))

789


In [10]:

data_df = pd.DataFrame(columns=['URL', 'Created At', 'Last Reply', 'Visits', 'Replies', 'Users', 'Likes', 'Comments Details'])


In [11]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   URL               0 non-null      object
 1   Created At        0 non-null      object
 2   Last Reply        0 non-null      object
 3   Visits            0 non-null      object
 4   Replies           0 non-null      object
 5   Users             0 non-null      object
 6   Likes             0 non-null      object
 7   Comments Details  0 non-null      object
dtypes: object(8)
memory usage: 0.0+ bytes


In [12]:
for link in unique_base_urls:
    if link not in data_df['URL'].tolist():
        print(f"Processing: {link}")
        
        # Get HTML code and extract statistics
        html_code = get_html_code_of_given_url(link)
        data_df2 = get_statistics_from_website(html_code, data_df, link)

        # Concatenate data_df2 into data_df
        data_df = pd.concat([data_df, data_df2], ignore_index=True)

        # Set index to 'URL' to avoid duplicates
        data_df = data_df.drop_duplicates(subset=['URL'], keep='first').reset_index(drop=True)

        # Print the updated DataFrame's information
        data_df.info()


Processing: https://keskustelut.inderes.fi/t/onnistumiset-sijoittamisessa
Processed URL: https://keskustelut.inderes.fi/t/onnistumiset-sijoittamisessa
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   URL               1 non-null      object
 1   Created At        1 non-null      object
 2   Last Reply        1 non-null      object
 3   Visits            1 non-null      object
 4   Replies           1 non-null      object
 5   Users             1 non-null      object
 6   Likes             1 non-null      object
 7   Comments Details  1 non-null      object
dtypes: object(8)
memory usage: 192.0+ bytes
Processing: https://keskustelut.inderes.fi/t/macys-inc-tavarataloketju-arvosijoittajalle
Processed URL: https://keskustelut.inderes.fi/t/macys-inc-tavarataloketju-arvosijoittajalle
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 

In [13]:
data_df.tail()

Unnamed: 0,URL,Created At,Last Reply,Visits,Replies,Users,Likes,Comments Details
784,https://keskustelut.inderes.fi/t/dermtech-ihom...,tammi 2021,marras 2021,"8,4 k",48,11,105,"[{'comment': 'Avataampa ketju DermTechille, mi..."
785,https://keskustelut.inderes.fi/t/supermicro-sm...,marras 2022,16 t,"28,4 k",152,33,805,"[{'comment': 'Supermicro on perustettu 1993, s..."
786,https://keskustelut.inderes.fi/t/terrafame-pal...,maalis 2022,17 pv,"11,6 k",17,5,225,[{'comment': 'Hieman yli kymmenen vuotta sitte...
787,https://keskustelut.inderes.fi/t/embracer-grou...,marras 2020,4 pv,302 k,"1,3 k",185,"11,8 k",[{'comment': 'Embracer group konsernilla on ka...
788,https://keskustelut.inderes.fi/t/investor-ab-w...,huhti 2020,22. maalis,"68,2 k",221,58,"1,4 k",[{'comment': 'Wallenbergin suvun sijoitusyhtiö...


In [15]:
data_df.to_pickle('forum_data_scraped2')

In [16]:
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 25314.79336285591 seconds
