In [None]:
"""
This script scrapes links from multiple pages of a website, filters and processes the links, and saves statistical data into separate CSV files.

1. Scraping Links:
   - The script uses Selenium WebDriver to scrape links from multiple pages of the website.
   - It scrolls down each page to load dynamic content and collects all links.
   - The collected links are filtered to include only those starting with a specified prefix.

2. Processing Links:
   - Unique base links are extracted from the filtered links.
   - The script splits a list of links into two halves and processes each half separately.
   - For each link in the first half:
     - If the link does not exist in the DataFrame 'data_df', it fetches HTML code, extracts statistical data, and appends it to the DataFrame.
   - For each link in the second half:
     - If the link does not exist in the DataFrame 'data_df', it follows the same process as above.

3. Saving Data:
   - After processing, the statistical data is saved into separate CSV files ('my_datadf5.csv' after the first half and 'my_datadf6.csv' after the second half).
   - The files are written unconditionally, so an existing file with the same name is overwritten.

Note: 
- Ensure the ChromeDriver executable is placed in the correct path for Selenium to work.
- Adjust the number of pages to scrape and other parameters as needed.
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

# Fetch the forum front page and collect the target of every anchor on it.
url = 'https://keskustelut.inderes.fi/'
# A timeout keeps the cell from hanging forever if the server never answers.
html_doc = requests.get(url, timeout=30)
soup = BeautifulSoup(html_doc.content, 'html.parser')
# href=True skips anchors that have no href attribute, so list_of_links only
# ever holds strings and string operations on its items are safe.
links = soup.find_all('a', href=True)
list_of_links = []
for link in links:
    href = link.get('href')
    list_of_links.append(href)
    print(href)

In [None]:
# Keep only absolute https links. The truthiness check guards against None
# entries (Tag.get('href') yields None for an anchor without an href), which
# would otherwise raise AttributeError on .startswith().
filtered_links = [link for link in list_of_links if link and link.startswith('https')]
# Drop the first two and the last of the remaining links.
# NOTE(review): presumably these are navigation links rather than content - confirm.
filtered_links = filtered_links[2:-1]
for link in filtered_links:
    print(link)

In [None]:
def extract_links_from_url(url, timeout=30):
    """
    Extracts all links from a given URL.

    Args:
    url (str): The URL to scrape for links.
    timeout (float): Seconds to wait for the server before giving up, so a
        single unresponsive page cannot stall the whole crawl.

    Returns:
    list: The 'href' value of every anchor on the page that has one.

    Raises:
    requests.exceptions.RequestException: If the page cannot be fetched,
        including when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # href=True restricts the search to anchors that carry an href attribute,
    # so indexing with ["href"] cannot fail.
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]

# Visit every page in filtered_links and pool the links found there;
# the set keeps a single copy of each distinct link.
scraped_links = set()
for link in filtered_links:
    scraped_links.update(extract_links_from_url(link))

# Show everything that was collected
for scraped_link in scraped_links:
    print(scraped_link)

In [None]:
# Keep only the absolute https links among the scraped ones.
# scraped_links is a set, so its iteration order is arbitrary. Sorting gives a
# reproducible list, and no positional slice is applied here because dropping
# entries by position from an unordered collection would discard arbitrary links.
filtered_links2 = sorted(link for link in scraped_links if link.startswith('https'))
for link in filtered_links2:
    print(link)


In [None]:
# "linkkien joukko" is Finnish, roughly "set of links". This is a plain list
# (duplicates possible): the second-level links followed by the front-page links.
linkkien_joukko = [*filtered_links2, *filtered_links]


In [None]:
# Show the combined list, one link per line
for link in linkkien_joukko:
    print(link)

In [None]:
# Keep only the links that point at the forum itself; building a set removes duplicates.
forum_prefix = 'https://keskustelut.inderes.fi'
filtered_set = {link for link in linkkien_joukko if link.startswith(forum_prefix)}

# Show what survived the filter
for link in filtered_set:
    print(link)

In [None]:
def get_html_code_of_given_url(
    url,
    chrome_driver_path="C:\\Users\\mehta\\OneDrive\\Työpöytä\\chromedriver.exe",
    scroll_count=30,
    scroll_pause=2,
):
    """
    Gets the HTML code of a given URL using Selenium WebDriver.

    The page is opened in headless Chrome and scrolled to the bottom
    repeatedly, pausing after each scroll, before the page source is read.

    Args:
    url (str): The URL to fetch the HTML code from.
    chrome_driver_path (str): Path to the ChromeDriver executable.
    scroll_count (int): How many times to scroll to the bottom of the page.
    scroll_pause (float): Seconds to wait after each scroll so that content
        loaded on scroll has time to appear.

    Returns:
    str: The HTML code of the URL.
    """
    # Headless mode runs Chrome without opening a browser window
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')

    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.get(url)

        for _ in range(scroll_count):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)

        return driver.page_source
    finally:
        # Always shut the browser down, even when navigation or scrolling
        # raises; otherwise every failed URL leaves a Chrome process behind.
        driver.quit()


In [None]:


# Class attribute of the like button looked up inside a post's content block.
_LIKE_BUTTON_CLASS = 'widget-button btn-flat button-count like-count highlight-action regular-likes btn-icon-text'


def _extract_topic_stat(soup, li_class, child_tag, child_class, label, missing_message, attribute=None):
    """Return one topic-level value from the parsed page, or None if it is absent.

    Finds the first <li> matching li_class, then the first child_tag element
    matching child_class inside it. The value is the child's stripped text, or
    the given attribute of the child when attribute is set. The outcome is
    printed in either case.
    """
    container = soup.find('li', class_=li_class)
    if container is None:
        print(f"Element with class '{li_class}' not found in the HTML content.")
        return None

    child = container.find(child_tag, class_=child_class)
    if child is None or (attribute is not None and attribute not in child.attrs):
        print(missing_message)
        return None

    value = child.text.strip() if attribute is None else child[attribute]
    print(f"{label}: {value}")
    return value


def _extract_post_comments(post):
    """Return one dict per <p> of a post's content block (empty list if the block is missing)."""
    post_id = post.get('id')
    contents = post.find('div', class_='regular contents')
    if contents is None:
        return []

    # Timestamp and like count belong to the post, not to a single paragraph,
    # so they are looked up once and shared by every paragraph of the post.
    timestamp = None
    post_info = post.find('div', class_='post-infos')
    if post_info is not None:
        date_element = post_info.find('span', class_='relative-date')
        if date_element is not None and 'title' in date_element.attrs:
            timestamp = date_element['title']

    likes = 0
    like_button = contents.find('button', class_=_LIKE_BUTTON_CLASS)
    if like_button is not None:
        # Strip first: surrounding whitespace would make the leading-digits match fail.
        match = re.match(r'^\d+', like_button.text.strip())
        if match:
            likes = int(match.group(0))

    return [
        {
            "post_id": post_id,
            "comment": paragraph.text.strip(),
            "timestamp": timestamp,
            "likes": likes,
        }
        for paragraph in contents.find_all('p')
    ]


def get_statistics_from_website(html, data_df, url):
    """
    Extracts statistical data from the HTML code of a website.

    Each topic-level value is read from a specific element of the page; a value
    whose element is missing is stored as None. Progress and missing elements
    are reported with print().

    Args:
    html (str): The HTML code of the website.
    data_df (pd.DataFrame): The DataFrame to store the extracted data.
    url (str): The URL of the website.

    Returns:
    pd.DataFrame: A new DataFrame consisting of data_df plus one row for this
        URL. data_df itself is not modified.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Best-effort title: the text of the document's <title> element.
    title = None
    title_element = soup.find('title')
    if title_element is not None:
        title = title_element.get_text(strip=True) or None

    number_of_visits = _extract_topic_stat(
        soup, 'secondary views', 'span', 'number',
        "Number of visits", "Number of visits not found in the HTML content.",
        attribute='title',
    )
    created_at = _extract_topic_stat(
        soup, 'created-at', 'div', 'topic-map-post created-at',
        "Created at", "Created-at information not found in the HTML content.",
    )
    replies_count = _extract_topic_stat(
        soup, 'replies', 'span', 'number',
        "Replies", "Replies information not found in the HTML content.",
    )
    last_reply = _extract_topic_stat(
        soup, 'last-reply', 'div', 'topic-map-post last-reply',
        "Last Reply", "Last-reply information not found in the HTML content.",
    )
    users_count = _extract_topic_stat(
        soup, 'secondary users', 'span', 'number',
        "Users", "Users information not found in the HTML content.",
    )
    likes_count = _extract_topic_stat(
        soup, 'secondary likes', 'span', 'number',
        "Likes", "Likes information not found in the HTML content.",
    )
    links_count = _extract_topic_stat(
        soup, 'secondary links', 'span', 'number',
        "Links", "Links information not found in the HTML content.",
    )

    all_posts_comments = []
    for post in soup.find_all('article', class_='boxed onscreen-post'):
        # Best effort: a malformed post is reported and skipped so that it
        # does not abort the whole page.
        try:
            all_posts_comments.extend(_extract_post_comments(post))
        except Exception as e:
            print(f"Error processing post {post.get('id')}: {e}")

    # One row for this URL; the per-paragraph details are stored as a list in a single cell
    new_data = pd.DataFrame({
        'URL': [url],
        'Title': [title],
        'Created At': [created_at],
        'Last Reply': [last_reply],
        'Visits': [number_of_visits],
        'Replies': [replies_count],
        'Users': [users_count],
        'Likes': [likes_count],
        'Links': [links_count],
        'Comments Details': [all_posts_comments]
    })

    # pd.concat builds a new frame, leaving the caller's data_df untouched
    data_df = pd.concat([data_df, new_data], ignore_index=True)

    print(f"Processed URL: {url}\n")
    return data_df


In [None]:

def scrape_links_multiple_pages(url, num_pages=40, scroll_pause=2):
    """
    Scrape links from multiple pages by scrolling down and collecting links.

    Args:
    url (str): The URL of the page to start scraping from.
    num_pages (int): The number of times to scroll to the bottom of the page.
    scroll_pause (float): Seconds to wait after each scroll before checking
        that the page contains anchors.

    Returns:
    list: The href of every anchor present after scrolling. Anchors without
        an href are left out, so every item is a non-empty string.

    Raises:
    selenium.common.exceptions.TimeoutException: If no anchor element is
        present within 10 seconds after a scroll.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        for _ in range(num_pages):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "a")))

        # get_attribute returns None for an anchor that has no href; dropping
        # those here lets callers use string methods on every item.
        hrefs = (anchor.get_attribute("href") for anchor in driver.find_elements(By.TAG_NAME, "a"))
        return [href for href in hrefs if href]
    finally:
        # Close the browser even when the wait above times out
        driver.quit()

# Entry page for the scrolling crawl; change it to scrape a different site
url_to_scrape = "https://keskustelut.inderes.fi"
all_links_multiple_pages = scrape_links_multiple_pages(url=url_to_scrape)

# Show every link that was collected
for link in all_links_multiple_pages:
    print(link)


In [None]:
# Keep only the links under the forum's /t/ path.
# The truthiness check skips None entries: Selenium's get_attribute("href")
# returns None for an anchor without an href, and None has no startswith().
topic_prefix = "https://keskustelut.inderes.fi/t/"
filtered_links = [
    link for link in all_links_multiple_pages
    if link and link.startswith(topic_prefix)
]

# Print the filtered links
for link in filtered_links:
    print(link)


In [None]:
# Reduce every link to its base (everything before the last '/') and keep the
# distinct bases that still lie under the /t/ path.
# NOTE(review): this removes exactly one trailing path segment, whatever it is -
# confirm that this yields the intended thread URL for every link shape.
unique_links = {
    base_link
    for base_link in (link.rsplit('/', 1)[0] for link in filtered_links)
    if base_link.startswith('https://keskustelut.inderes.fi/t/')
}

# Show the distinct base links
for link in unique_links:
    print(link)


In [None]:
# Turn the set into a list so it can be sliced, then cut it in two.
# With an odd number of links the second half gets the extra one.
unique_links = list(unique_links)
midpoint = len(unique_links) // 2
first_half, second_half = unique_links[:midpoint], unique_links[midpoint:]

for link in first_half:
    print(link)

In [None]:
print(len(second_half))

# Empty result table with one column per collected field
data_columns = [
    'URL',
    'Title',
    'Created At',
    'Last Reply',
    'Visits',
    'Replies',
    'Users',
    'Likes',
    'Links',
    'Comments Details',
]
data_df = pd.DataFrame(columns=data_columns)

In [None]:
for link in first_half:
    # Skip links that already have a row in data_df
    if link in data_df['URL'].values:
        continue

    print(link)
    html_code = get_html_code_of_given_url(link)
    data_df = get_statistics_from_website(html_code, data_df, link)
    data_df.info()


In [None]:
# Write the rows collected so far; the row index is not written to the file
data_df.to_csv(path_or_buf='my_datadf5.csv', index=False)

In [None]:
# Start from the rows collected so far and add one row per link of the second
# half. The running result is fed back into each call so that rows accumulate;
# passing data_df every time would keep only the last link's row.
# get_statistics_from_website returns a new frame, so data_df is left unchanged.
# Binding data_df2 before the loop also keeps it defined when second_half is empty.
data_df2 = data_df
for link in second_half:
    # Skip links that already have a row
    if link in data_df2['URL'].values:
        continue

    print(link)
    html_code = get_html_code_of_given_url(link)
    data_df2 = get_statistics_from_website(html_code, data_df2, link)
    data_df2.info()


In [None]:
# Write the second result table; the row index is not written to the file
data_df2.to_csv(path_or_buf='my_datadf6.csv', index=False)