In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re
from datetime import datetime

## Analyse Linkedin data
def login_to_linkedin(driver, username, password):
    driver.get("https://www.linkedin.com/login")
    driver.find_element(By.ID, "username").send_keys(username)
    driver.find_element(By.ID, "password").send_keys(password)
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.feed-shared-update-v2"))
    )

def scrape_posts(driver, account_url, num_posts=100):
    driver.get(account_url + "/posts/")
    posts = []
    seen_posts = set()
    last_height = driver.execute_script("return document.body.scrollHeight")

    while len(posts) < num_posts:
        # Scroll down to load more posts
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for new content to load

        # Extract posts
        elements = driver.find_elements(By.CSS_SELECTOR, "div.feed-shared-update-v2")
        for element in elements:
            if len(posts) >= num_posts:
                break
            try:
                text_element = element.find_element(By.CSS_SELECTOR, "span.break-words")
                text = text_element.text
                if text not in seen_posts:
                    posts.append({"text": text})
                    seen_posts.add(text)
                    print(f"Post found: {text}")  # Logging the post content
            except Exception as e:
                print(f"Error extracting post: {e}")  # Logging the error
                continue

        # Check if we've reached the end of the page
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    return posts

def save_to_csv(posts, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["text"])
        writer.writeheader()
        for post in posts:
            writer.writerow(post)

# Main workflow
username = "william.masquelier93@gmail.com"
password = "6868Alexis"
account_url = "https://www.linkedin.com/company/iluka-resources/"

driver = webdriver.Chrome()  # Make sure you have ChromeDriver installed and in PATH
try:
    login_to_linkedin(driver, username, password)
    posts = scrape_posts(driver, account_url, num_posts=100)
    save_to_csv(posts, "linkedin_posts.csv")
finally:
    driver.quit()

print("Scraping completed. Check linkedin_posts.csv for results.")

Post found: Iluka Resources employees from the Perth office have pedalled for a purpose, raising funds for Starfish Nippers, Steve Waugh Foundation and Give A Feed.

They formed two teams to ride stationary bikes for 20 minutes in the recent 10th Charity Spin Roadshow organised by corporate fitness provider Motivate Ultra.

Participants from companies across nine office buildings within the Perth CBD helped to raise a total of $65,000.

While Iluka’s teams missed out on prizes for total kilometres cycled, they were thrilled to be recognised for their music playlists as well as their creative outfits.
Post found: We've been a proud partner of Foodbank WA since 2020, enabling the Geraldton branch to materially increase its food assistance and related services in Western Australia's Mid West region. In Perth, our people are volunteering in Foodbank's kitchen and warehouse facilities, providing support in the lead-up to Christmas.
Post found: Iluka Resources was proud to be a Silver Sponso

In [12]:
posts

[{'text': 'Iluka Resources employees from the Perth office have pedalled for a purpose, raising funds for Starfish Nippers, Steve Waugh Foundation and Give A Feed.\n\nThey formed two teams to ride stationary bikes for 20 minutes in the recent 10th Charity Spin Roadshow organised by corporate fitness provider Motivate Ultra.\n\nParticipants from companies across nine office buildings within the Perth CBD helped to raise a total of $65,000.\n\nWhile Iluka’s teams missed out on prizes for total kilometres cycled, they were thrilled to be recognised for their music playlists as well as their creative outfits.'},
 {'text': "We've been a proud partner of Foodbank WA since 2020, enabling the Geraldton branch to materially increase its food assistance and related services in Western Australia's Mid West region. In Perth, our people are volunteering in Foodbank's kitchen and warehouse facilities, providing support in the lead-up to Christmas."},
 {'text': 'Iluka Resources was proud to be a Silv

In [15]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re
from datetime import datetime

def login_to_linkedin(driver, username, password):
    driver.get("https://www.linkedin.com/login")
    driver.find_element(By.ID, "username").send_keys(username)
    driver.find_element(By.ID, "password").send_keys(password)
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    
    # Wait for feed to load, indicating successful login
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.feed-shared-update-v2"))
    )
    print("Login successful")

def parse_date(date_text):
    """Convert LinkedIn date format to a standard format"""
    try:
        if "h" in date_text or "m" in date_text:
            # Today's post
            return datetime.now().strftime("%Y-%m-%d")
        elif "d" in date_text:
            # Recent days
            days = int(re.search(r'(\d+)d', date_text).group(1))
            date = datetime.now()
            return (date.replace(day=date.day-days)).strftime("%Y-%m-%d")
        elif "w" in date_text:
            # Recent weeks
            weeks = int(re.search(r'(\d+)w', date_text).group(1))
            date = datetime.now()
            return (date.replace(day=date.day-(weeks*7))).strftime("%Y-%m-%d")
        else:
            # Try to parse specific date formats
            try:
                return datetime.strptime(date_text, "%b %d, %Y").strftime("%Y-%m-%d")
            except:
                return date_text  # Return as is if can't parse
    except Exception as e:
        print(f"Error parsing date '{date_text}': {e}")
        return date_text

def parse_reaction_count(count_text):
    """Convert LinkedIn reaction counts to integers"""
    if not count_text:
        return 0
    
    count_text = count_text.strip()
    if 'K' in count_text:
        return int(float(count_text.replace('K', '')) * 1000)
    if 'M' in count_text:
        return int(float(count_text.replace('M', '')) * 1000000)
    
    try:
        return int(count_text)
    except:
        return 0

def extract_post_details(post_element):
    """Extract all available details from a LinkedIn post with more flexible selectors"""
    post_data = {}
    
    try:
        # Extract text content with multiple selector attempts
        try:
            # Try several possible selectors for post text
            text_selectors = [
                "span.break-words", 
                "div.feed-shared-update-v2__description-wrapper", 
                "div.feed-shared-text", 
                "div.feed-shared-text > span"
            ]
            
            post_text = ""
            for selector in text_selectors:
                text_elements = post_element.find_elements(By.CSS_SELECTOR, selector)
                if text_elements:
                    post_text = text_elements[0].text
                    if post_text.strip():
                        break
            
            post_data["text"] = post_text
        except Exception as e:
            print(f"Error extracting post text: {e}")
            post_data["text"] = ""
        
        # Extract post date
        try:
            date_element = post_element.find_element(By.CSS_SELECTOR, "span.visually-hidden")
            post_date_text = date_element.text.replace("Posted: ", "")
            post_data["date"] = parse_date(post_date_text)
        except Exception as e:
            print(f"Error extracting post date: {e}")
            post_data["date"] = ""
        
        # Extract reaction counts (likes, comments, shares)
        try:
            # Total reactions
            reaction_elements = post_element.find_elements(By.CSS_SELECTOR, "span.social-details-social-counts__reactions-count")
            if reaction_elements:
                post_data["reactions"] = parse_reaction_count(reaction_elements[0].text)
            else:
                post_data["reactions"] = 0
                
            # Comments count
            comment_elements = post_element.find_elements(By.CSS_SELECTOR, ".social-details-social-counts__comments-count")
            if comment_elements:
                post_data["comments_count"] = parse_reaction_count(comment_elements[0].text)
            else:
                post_data["comments_count"] = 0
                
            # Shares count
            share_elements = post_element.find_elements(By.CSS_SELECTOR, ".social-details-social-counts__shares-count")
            if share_elements:
                post_data["shares_count"] = parse_reaction_count(share_elements[0].text)
            else:
                post_data["shares_count"] = 0
        except Exception as e:
            print(f"Error extracting reaction counts: {e}")
            post_data["reactions"] = 0
            post_data["comments_count"] = 0
            post_data["shares_count"] = 0
        
        # Extract author/poster information
        try:
            author_element = post_element.find_element(By.CSS_SELECTOR, ".feed-shared-actor__name")
            post_data["author"] = author_element.text
        except Exception as e:
            print(f"Error extracting author: {e}")
            post_data["author"] = ""
            
        # Extract post type (article, image, video, etc.)
        try:
            if post_element.find_elements(By.CSS_SELECTOR, ".feed-shared-article"):
                post_data["post_type"] = "article"
            elif post_element.find_elements(By.CSS_SELECTOR, ".feed-shared-image"):
                post_data["post_type"] = "image"
            elif post_element.find_elements(By.CSS_SELECTOR, ".feed-shared-video"):
                post_data["post_type"] = "video"
            elif post_element.find_elements(By.CSS_SELECTOR, ".feed-shared-external-video"):
                post_data["post_type"] = "external-video"
            elif post_element.find_elements(By.CSS_SELECTOR, ".feed-shared-poll"):
                post_data["post_type"] = "poll"
            else:
                post_data["post_type"] = "text"
        except Exception as e:
            print(f"Error determining post type: {e}")
            post_data["post_type"] = "unknown"
            
        # Extract URL if it's a shared article
        try:
            url_elements = post_element.find_elements(By.CSS_SELECTOR, ".feed-shared-article__link-container a")
            if url_elements:
                post_data["shared_url"] = url_elements[0].get_attribute("href")
            else:
                post_data["shared_url"] = ""
        except Exception as e:
            print(f"Error extracting shared URL: {e}")
            post_data["shared_url"] = ""
            
        # Extract post ID for future reference
        try:
            post_data["post_id"] = post_element.get_attribute("data-urn") or ""
        except Exception as e:
            print(f"Error extracting post ID: {e}")
            post_data["post_id"] = ""
        
        # Extract post comments if available 
        try:
            # First check if comments are expanded, if not try to expand them
            comments_section = post_element.find_elements(By.CSS_SELECTOR, ".comments-comments-list")
            
            # If comments aren't visible yet but there are some, try to click to expand
            if not comments_section and post_data["comments_count"] > 0:
                try:
                    comments_button = post_element.find_element(By.CSS_SELECTOR, ".social-details-social-counts__comments")
                    driver.execute_script("arguments[0].click();", comments_button)
                    time.sleep(1.5)  # Wait for comments to load
                    comments_section = post_element.find_elements(By.CSS_SELECTOR, ".comments-comments-list")
                except:
                    pass
                    
            if comments_section:
                comments = []
                comment_elements = post_element.find_elements(By.CSS_SELECTOR, ".comments-comment-item")
                for comment in comment_elements[:5]:  # Limit to first 5 comments to avoid excessive scraping
                    try:
                        comment_author = comment.find_element(By.CSS_SELECTOR, ".comments-post-meta__name-text").text
                        comment_text = comment.find_element(By.CSS_SELECTOR, ".comments-comment-item__main-content").text
                        comments.append({
                            "author": comment_author,
                            "text": comment_text
                        })
                    except Exception as e:
                        print(f"Error extracting individual comment: {e}")
                
                post_data["comments"] = comments
            else:
                post_data["comments"] = []
        except Exception as e:
            print(f"Error extracting comments: {e}")
            post_data["comments"] = []
            
        return post_data
        
    except Exception as e:
        print(f"Error processing post: {e}")
        return {"text": "", "error": str(e)}

def scrape_posts(driver, account_url, num_posts=100):
    driver.get(account_url + "/posts/")
    time.sleep(5)  # Give more time for initial page load
    
    posts = []
    seen_post_ids = set()
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0
    max_scroll_attempts = 20  # Increase max scroll attempts
    
    # A more comprehensive selector to catch all post types
    post_selectors = [
        "div.feed-shared-update-v2", 
        "div.occludable-update",
        "div.feed-container-theme div[data-urn]"
    ]
    
    while len(posts) < num_posts and scroll_attempts < max_scroll_attempts:
        # Scroll down to load more posts
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Increased wait time for content to load
        
        # Try finding posts with different selectors
        elements = []
        for selector in post_selectors:
            elements.extend(driver.find_elements(By.CSS_SELECTOR, selector))
        
        # Process each post element
        for element in elements:
            if len(posts) >= num_posts:
                break
                
            # Get unique identifier for the post with more fallbacks
            try:
                # Try several methods to get a unique ID
                post_id = element.get_attribute("data-urn")
                
                if not post_id:
                    post_id = element.get_attribute("data-id")
                
                if not post_id:
                    # Get a small sample of text as ID
                    try:
                        text_elements = element.find_elements(By.CSS_SELECTOR, "span.break-words")
                        if text_elements and text_elements[0].text:
                            post_id = text_elements[0].text[:50]  # First 50 chars
                    except:
                        pass
                        
                if not post_id:
                    # Last resort - use a hash of the element's outer HTML
                    post_id = str(hash(element.get_attribute("outerHTML")[:100]))
                
            except:
                # Skip to next element if we can't ID it
                continue
                
            # Check if we've seen this post before using a more lenient approach
            # Only use the exact ID match if we have a proper ID
            if post_id not in seen_post_ids:
                post_details = extract_post_details(element)
                
                # Only add posts that have some useful content
                if post_details.get("text") and len(post_details.get("text")) > 10:
                    posts.append(post_details)
                    seen_post_ids.add(post_id)
                    print(f"Post #{len(posts)} extracted: {post_details.get('text')[:50]}...")
        
        # Check if we've reached the end of the page
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # If no new content, try clicking "Show more" buttons if they exist
            try:
                show_more_buttons = driver.find_elements(By.XPATH, "//button[contains(., 'Show more')]")
                if show_more_buttons:
                    for button in show_more_buttons:
                        driver.execute_script("arguments[0].click();", button)
                    time.sleep(2)
                    scroll_attempts = 0  # Reset scroll attempts if we found buttons
                else:
                    scroll_attempts += 1
            except:
                scroll_attempts += 1
            
            print(f"Scroll attempt {scroll_attempts}/{max_scroll_attempts}")
            time.sleep(1.5)
        else:
            scroll_attempts = 0
            
        last_height = new_height
        
    return posts

def save_to_csv(posts, filename):
    if not posts:
        print("No posts to save")
        return
        
    # Get all possible fields from all posts
    fieldnames = set()
    for post in posts:
        for key in post.keys():
            if key != "comments":  # Handle comments separately
                fieldnames.add(key)
    
    fieldnames = sorted(list(fieldnames))  # Sort field names for consistent output
    
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames + ["comments"])
        writer.writeheader()
        
        for post in posts:
            # Convert comments list to string representation for CSV
            if "comments" in post and post["comments"]:
                post["comments"] = str(post["comments"])
            writer.writerow(post)
    
    print(f"Saved {len(posts)} posts to {filename}")

def save_to_json(posts, filename):
    import json
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(posts, file, ensure_ascii=False, indent=2)
    print(f"Saved {len(posts)} posts to {filename} in JSON format")

# Main workflow
username = "william.masquelier93@gmail.com"
password = "6868Alexis"
account_url = "https://www.linkedin.com/company/iluka-resources/"

# Configure webdriver with options for better stability
options = webdriver.ChromeOptions()
options.add_argument("--disable-notifications")  # Disable notifications
options.add_argument("--start-maximized")  # Start maximized
options.add_argument("--disable-extensions")  # Disable extensions

driver = webdriver.Chrome(options=options)

try:
    login_to_linkedin(driver, username, password)
    posts = scrape_posts(driver, account_url, num_posts=100)
    
    # Save in both CSV and JSON formats
    save_to_csv(posts, "linkedin_posts.csv")
    save_to_json(posts, "linkedin_posts.json")  # JSON preserves nested data better
    
    print(f"Successfully extracted {len(posts)} posts with detailed information")
    
finally:
    driver.quit()

print("Scraping completed.")

Login successful
Error extracting author: Message: no such element: Unable to locate element: {"method":"css selector","selector":".feed-shared-actor__name"}
  (Session info: chrome=133.0.6943.141); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x000000010d405708 chromedriver + 5969672
1   chromedriver                        0x000000010d3fd32a chromedriver + 5935914
2   chromedriver                        0x000000010ceb9650 chromedriver + 415312
3   chromedriver                        0x000000010cf0ae94 chromedriver + 749204
4   chromedriver                        0x000000010cf0b041 chromedriver + 749633
5   chromedriver                        0x000000010cefe8c6 chromedriver + 698566
6   chromedriver                        0x000000010cf3119d chromedriver + 905629
7   chromedriver                        0x000000010cefe7b8 chromedriver + 6

In [16]:
posts

[{'text': 'Iluka Resources employees from the Perth office have pedalled for a purpose, raising funds for Starfish Nippers, Steve Waugh Foundation and Give A Feed.\n\nThey formed two teams to ride stationary bikes for 20 minutes in the recent 10th Charity Spin Roadshow organised by corporate fitness provider Motivate Ultra.\n\nParticipants from companies across nine office buildings within the Perth CBD helped to raise a total of $65,000.\n\nWhile Iluka’s teams missed out on prizes for total kilometres cycled, they were thrilled to be recognised for their music playlists as well as their creative outfits.',
  'date': 'Iluka Resources',
  'reactions': 270,
  'comments_count': 0,
  'shares_count': 0,
  'author': '',
  'post_type': 'text',
  'shared_url': '',
  'post_id': 'urn:li:activity:7274239859693645825',
  'comments': []},
 {'text': "We've been a proud partner of Foodbank WA since 2020, enabling the Geraldton branch to materially increase its food assistance and related services in 

In [17]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re
import json
from datetime import datetime, timedelta

def login_to_linkedin(driver, username, password):
    driver.get("https://www.linkedin.com/login")
    driver.find_element(By.ID, "username").send_keys(username)
    driver.find_element(By.ID, "password").send_keys(password)
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    
    # Wait for feed to load, indicating successful login
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.feed-shared-update-v2"))
    )
    print("Login successful")

def parse_date(date_text):
    """Convert LinkedIn date format to a standard format"""
    try:
        # Remove any "Posted: " prefix if present
        date_text = date_text.replace("Posted: ", "").strip()
        
        today = datetime.now()
        
        if "h" in date_text:
            # Hours ago (today)
            return today.strftime("%Y-%m-%d")
        elif "m" in date_text:
            # Minutes ago (today)
            return today.strftime("%Y-%m-%d")
        elif "d" in date_text:
            # Recent days
            days_ago = int(re.search(r'(\d+)d', date_text).group(1))
            return (today - timedelta(days=days_ago)).strftime("%Y-%m-%d")
        elif "w" in date_text:
            # Recent weeks
            weeks_ago = int(re.search(r'(\d+)w', date_text).group(1))
            return (today - timedelta(weeks=weeks_ago)).strftime("%Y-%m-%d")
        elif "mo" in date_text:
            # Recent months
            months_ago = int(re.search(r'(\d+)mo', date_text).group(1))
            # Approximating a month as 30 days
            return (today - timedelta(days=months_ago*30)).strftime("%Y-%m-%d")
        else:
            # Try to parse specific date formats
            for fmt in ["%b %d, %Y", "%B %d, %Y", "%d %b %Y"]:
                try:
                    return datetime.strptime(date_text, fmt).strftime("%Y-%m-%d")
                except:
                    continue
            return date_text  # Return as is if can't parse
    except Exception as e:
        print(f"Error parsing date '{date_text}': {e}")
        return ""

def parse_count(count_text):
    """Convert LinkedIn count text to integers"""
    if not count_text:
        return 0
    
    count_text = count_text.strip()
    
    # Remove any non-numeric parts
    count_text = re.sub(r'[^\d.,K]', '', count_text)
    
    if not count_text:
        return 0
    
    if 'K' in count_text:
        return int(float(count_text.replace('K', '')) * 1000)
    if 'k' in count_text:
        return int(float(count_text.replace('k', '')) * 1000)
    if 'M' in count_text:
        return int(float(count_text.replace('M', '')) * 1000000)
    if 'm' in count_text:
        return int(float(count_text.replace('m', '')) * 1000000)
    
    try:
        return int(float(count_text.replace(',', '')))
    except:
        return 0

def get_post_url(post_element, driver):
    """Extract the direct URL to the post"""
    try:
        # Try to find a permalink to the post
        post_links = post_element.find_elements(By.CSS_SELECTOR, "a.app-aware-link[href*='/feed/update/']")
        for link in post_links:
            href = link.get_attribute("href")
            if href and ("/feed/update/" in href or "/posts/" in href):
                return href
        
        # Try another method - click the timestamp/date which usually links to the post
        timestamp_links = post_element.find_elements(By.CSS_SELECTOR, "span.feed-shared-actor__sub-description a")
        if timestamp_links:
            return timestamp_links[0].get_attribute("href")
            
        # If still not found, try any link that might be the post URL
        all_links = post_element.find_elements(By.CSS_SELECTOR, "a[href*='/activity/']")
        for link in all_links:
            href = link.get_attribute("href")
            if "linkedin.com" in href and ("/activity/" in href or "/posts/" in href):
                return href
    except Exception as e:
        print(f"Error extracting post URL: {e}")
    
    return ""

def extract_post_details(post_element, driver):
    """Extract relevant details from a LinkedIn post"""
    post_data = {}
    
    try:
        # Extract text content
        try:
            # Try multiple possible selectors for post text
            text_selectors = [
                "span.break-words", 
                "div.feed-shared-update-v2__description-wrapper", 
                "div.feed-shared-text",
                "div.feed-shared-text > span",
                "div.update-components-text"
            ]
            
            post_text = ""
            for selector in text_selectors:
                text_elements = post_element.find_elements(By.CSS_SELECTOR, selector)
                if text_elements:
                    for elem in text_elements:
                        text = elem.text.strip()
                        if text:
                            post_text = text
                            break
                    if post_text:
                        break
            
            post_data["text"] = post_text
        except Exception as e:
            print(f"Error extracting post text: {e}")
            post_data["text"] = ""
        
        # Extract post date
        try:
            # Try different selectors for date
            date_selectors = [
                "span.feed-shared-actor__sub-description span.visually-hidden",
                "span.feed-shared-actor__sub-description", 
                "time",
                "span.feed-shared-actor__sub-description a"
            ]
            
            post_date = ""
            for selector in date_selectors:
                date_elements = post_element.find_elements(By.CSS_SELECTOR, selector)
                for date_elem in date_elements:
                    date_text = date_elem.text.strip() or date_elem.get_attribute("datetime")
                    if date_text and ("ago" in date_text or "h" in date_text or "d" in date_text or "w" in date_text or "mo" in date_text or any(month in date_text for month in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"])):
                        post_date = parse_date(date_text)
                        break
                if post_date:
                    break

            # If date still not found, try using the aria-label of the timestamp
            if not post_date:
                time_elements = post_element.find_elements(By.CSS_SELECTOR, "time")
                for time_elem in time_elements:
                    aria_label = time_elem.get_attribute("aria-label")
                    if aria_label:
                        post_date = parse_date(aria_label)
                        break
                        
            post_data["date"] = post_date
        except Exception as e:
            print(f"Error extracting post date: {e}")
            post_data["date"] = ""
        
        # Extract reaction counts (likes)
        try:
            # Try different selectors for reaction counts
            reaction_selectors = [
                "span.social-details-social-counts__reactions-count",
                "button.social-details-social-counts__count-value", 
                "span.social-details-social-counts__social-proof-fallback-number",
                "button.reactions-react-button span"
            ]
            
            reactions = 0
            for selector in reaction_selectors:
                reaction_elements = post_element.find_elements(By.CSS_SELECTOR, selector)
                for elem in reaction_elements:
                    try:
                        reaction_text = elem.text.strip() or elem.get_attribute("aria-label")
                        if reaction_text and any(char.isdigit() for char in reaction_text):
                            reactions = parse_count(reaction_text)
                            break
                    except:
                        continue
                if reactions > 0:
                    break
                    
            post_data["reactions"] = reactions
        except Exception as e:
            print(f"Error extracting reactions: {e}")
            post_data["reactions"] = 0
                
        # Extract comments count
        try:
            # Look for comments text that has format like "13 comments"
            comment_elements = post_element.find_elements(By.XPATH, ".//*[contains(text(), 'comment')]")
            comments_count = 0
            
            for elem in comment_elements:
                text = elem.text.strip()
                if "comment" in text.lower():
                    # Extract number before "comment"
                    match = re.search(r'(\d+)\s+comment', text.lower())
                    if match:
                        comments_count = int(match.group(1))
                        break
            
            post_data["comments_count"] = comments_count
        except Exception as e:
            print(f"Error extracting comments count: {e}")
            post_data["comments_count"] = 0
                
        # Extract reposts (shares) count
        try:
            # Look for text containing "repost" or "share"
            share_elements = post_element.find_elements(By.XPATH, ".//*[contains(text(), 'repost') or contains(text(), 'share')]")
            shares_count = 0
            
            for elem in share_elements:
                text = elem.text.strip()
                if "repost" in text.lower() or "share" in text.lower():
                    # Extract number before "repost" or "share"
                    match = re.search(r'(\d+)\s+(repost|share)', text.lower())
                    if match:
                        shares_count = int(match.group(1))
                        break
            
            post_data["reposts_count"] = shares_count
        except Exception as e:
            print(f"Error extracting reposts count: {e}")
            post_data["reposts_count"] = 0
            
        # Extract post URL
        post_data["post_url"] = get_post_url(post_element, driver)
        
        # Extract comments if available and count > 0
        if post_data["comments_count"] > 0:
            try:
                # First check if comments are expanded, if not try to expand them
                comments_section = post_element.find_elements(By.CSS_SELECTOR, ".comments-comments-list")
                
                # If comments aren't visible yet but there are some, try to click to expand
                if not comments_section:
                    try:
                        # Look for "View/show comments" or similar buttons
                        comment_buttons = post_element.find_elements(By.XPATH, 
                            ".//*[contains(text(), 'View comment') or contains(text(), 'Show comment') or contains(text(), 'See comment')]")
                        
                        for btn in comment_buttons:
                            if btn.is_displayed():
                                driver.execute_script("arguments[0].click();", btn)
                                time.sleep(1)  # Wait for comments to load
                                break
                                
                        # Check again for comments section
                        comments_section = post_element.find_elements(By.CSS_SELECTOR, ".comments-comments-list")
                    except:
                        pass
                        
                if comments_section:
                    comments = []
                    comment_elements = post_element.find_elements(By.CSS_SELECTOR, ".comments-comment-item")
                    
                    for comment in comment_elements[:5]:  # Limit to first 5 comments
                        try:
                            comment_author = ""
                            comment_text = ""
                            
                            # Try to get comment author
                            author_elements = comment.find_elements(By.CSS_SELECTOR, ".comments-post-meta__name-text, .actor__name")
                            if author_elements:
                                comment_author = author_elements[0].text.strip()
                            
                            # Try to get comment text
                            text_elements = comment.find_elements(By.CSS_SELECTOR, ".comments-comment-item__main-content, .feed-shared-text")
                            if text_elements:
                                comment_text = text_elements[0].text.strip()
                            
                            if comment_text:
                                comments.append({
                                    "author": comment_author,
                                    "text": comment_text
                                })
                        except Exception as e:
                            print(f"Error extracting individual comment: {e}")
                    
                    post_data["comments"] = comments
                else:
                    post_data["comments"] = []
            except Exception as e:
                print(f"Error extracting comments: {e}")
                post_data["comments"] = []
        else:
            post_data["comments"] = []
            
        return post_data
        
    except Exception as e:
        print(f"Error processing post: {e}")
        return {"text": "", "error": str(e)}

def scrape_posts(driver, account_url, num_posts=100):
    driver.get(account_url + "/posts/")
    time.sleep(5)  # Give page time to load initially
    
    posts = []
    seen_post_texts = set()  # Use post text as unique identifier
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0
    max_scroll_attempts = 20  # As requested
    
    while len(posts) < num_posts and scroll_attempts < max_scroll_attempts:
        # Scroll down to load more posts
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for content to load
        
        # Try to find all post elements with multiple selectors for better coverage
        post_selectors = [
            "div.feed-shared-update-v2", 
            "div.occludable-update",
            "div[data-urn]",
            "li.occludable-update"
        ]
        
        elements = []
        for selector in post_selectors:
            elements.extend(driver.find_elements(By.CSS_SELECTOR, selector))
        
        print(f"Found {len(elements)} post elements")
        
        for element in elements:
            if len(posts) >= num_posts:
                break
                
            # Extract post details
            post_details = extract_post_details(element, driver)
            
            # Skip posts with no text
            if not post_details.get("text"):
                continue
                
            # Skip posts we've already seen (comparing by text content)
            post_text = post_details.get("text", "")[:100]  # Use first 100 chars for comparison
            if post_text in seen_post_texts:
                continue
                
            # Add the post to our results
            posts.append(post_details)
            seen_post_texts.add(post_text)
            print(f"Post #{len(posts)} extracted: {post_details.get('date')} - {post_text[:50]}...")
        
        # Check if we've reached the end of the page
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Try to click "Show more" or similar buttons
            try:
                show_more_buttons = driver.find_elements(By.XPATH, 
                    "//*[contains(text(), 'Show more') or contains(text(), 'Load more') or contains(text(), 'See more')]")
                
                if show_more_buttons:
                    for button in show_more_buttons:
                        if button.is_displayed():
                            driver.execute_script("arguments[0].click();", button)
                            time.sleep(2)
                            scroll_attempts = 0  # Reset counter if we found buttons
                            break
                    else:
                        scroll_attempts += 1
                else:
                    scroll_attempts += 1
            except:
                scroll_attempts += 1
                
            print(f"No new content loaded, attempt {scroll_attempts}/{max_scroll_attempts}")
            time.sleep(2)
        else:
            scroll_attempts = 0  # Reset counter when new content is loaded
            
        last_height = new_height
        
    print(f"Extracted {len(posts)} posts total")
    return posts

def save_to_csv(posts, filename):
    if not posts:
        print("No posts to save")
        return
        
    # Only keep the fields we want
    essential_fields = ["text", "date", "reactions", "comments_count", "reposts_count", "post_url", "comments"]
    
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=essential_fields)
        writer.writeheader()
        
        for post in posts:
            # Only keep essential fields
            filtered_post = {field: post.get(field, "") for field in essential_fields}
            
            # Convert comments list to string
            if "comments" in filtered_post and filtered_post["comments"]:
                filtered_post["comments"] = json.dumps(filtered_post["comments"], ensure_ascii=False)
                
            writer.writerow(filtered_post)
    
    print(f"Saved {len(posts)} posts to {filename}")

def save_to_json(posts, filename):
    # Only keep the fields we want
    essential_fields = ["text", "date", "reactions", "comments_count", "reposts_count", "post_url", "comments"]
    
    filtered_posts = []
    for post in posts:
        filtered_post = {field: post.get(field, "") for field in essential_fields}
        filtered_posts.append(filtered_post)
        
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(filtered_posts, file, ensure_ascii=False, indent=2)
    
    print(f"Saved {len(posts)} posts to {filename} in JSON format")

# Main workflow
username = "william.masquelier93@gmail.com"
password = "6868Alexis"
account_url = "https://www.linkedin.com/company/iluka-resources/"

# Configure webdriver with options for better stability
options = webdriver.ChromeOptions()
options.add_argument("--disable-notifications")  # Disable notifications
options.add_argument("--start-maximized")  # Start maximized
options.add_argument("--disable-extensions")  # Disable extensions
options.add_argument("--disable-popup-blocking")  # Disable popup blocking

driver = webdriver.Chrome(options=options)

try:
    login_to_linkedin(driver, username, password)
    posts = scrape_posts(driver, account_url, num_posts=100)
    
    # Save in both CSV and JSON formats
    save_to_csv(posts, "linkedin_posts.csv")
    save_to_json(posts, "linkedin_posts.json")
    
    print(f"Successfully extracted {len(posts)} posts with detailed information")
    
finally:
    driver.quit()

print("Scraping completed.")

Login successful
Found 34 post elements
Post #1 extracted:  - Iluka Resources employees from the Perth office ha...
Post #2 extracted:  - We've been a proud partner of Foodbank WA since 20...
Post #3 extracted:  - Iluka Resources was proud to be a Silver Sponsor o...
Post #4 extracted:  - Iluka’s partnership with Yamatji Southern Regional...
Post #5 extracted:  - Iluka Resources is hosting an Open Day at our Cata...
Post #6 extracted:  - Iluka’s Jacinth-Ambrosia operation in South Austra...
Post #7 extracted:  - On behalf of Iluka Resources and our South West te...
Post #8 extracted:  - Iluka’s Adelaide-based team members walked, ran an...
Post #9 extracted:  - Iluka Resources is proud to partner with the Clont...
Found 40 post elements
Post #10 extracted:  - Representatives from Iluka’s Perth corporate offic...
Post #11 extracted:  - Iluka Resources is committed to supporting and del...
Post #12 extracted:  - What International Women’s Day means to Corrie-Lyn...
Found 40 post elements

In [18]:
posts

[{'text': 'Iluka Resources employees from the Perth office have pedalled for a purpose, raising funds for Starfish Nippers, Steve Waugh Foundation and Give A Feed.\n\nThey formed two teams to ride stationary bikes for 20 minutes in the recent 10th Charity Spin Roadshow organised by corporate fitness provider Motivate Ultra.\n\nParticipants from companies across nine office buildings within the Perth CBD helped to raise a total of $65,000.\n\nWhile Iluka’s teams missed out on prizes for total kilometres cycled, they were thrilled to be recognised for their music playlists as well as their creative outfits.',
  'date': '',
  'reactions': 270,
  'comments_count': 0,
  'reposts_count': 0,
  'post_url': '',
  'comments': []},
 {'text': "We've been a proud partner of Foodbank WA since 2020, enabling the Geraldton branch to materially increase its food assistance and related services in Western Australia's Mid West region. In Perth, our people are volunteering in Foodbank's kitchen and wareho

In [10]:
import os
import json
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
import html2text
import tiktoken

# For Selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# For OpenAI
from openai import OpenAI

# Set your OpenAI API key
# os.environ['OPENAI_API_KEY'] = 'your-api-key-here'  # Uncomment and add your key if not using .env

# Model pricing
pricing = {
    "gpt-4o-mini": {
        "input": 0.150 / 1_000_000,
        "output": 0.600 / 1_000_000,
    },
    "gpt-4o-2024-08-06": {
        "input": 2.5 / 1_000_000,
        "output": 10 / 1_000_000,
    },
}

def fetch_page_content(url):
    """Fetch the content of a webpage using Selenium with headless Chrome."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    
    try:
        # Use webdriver_manager to handle ChromeDriver installation
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options
        )
        
        print(f"Fetching content from {url}...")
        driver.get(url)
        
        # Wait for page to load and scroll down to load more content
        time.sleep(5)
        
        # Scroll down to ensure dynamic content is loaded
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        
        # Get page source
        html_content = driver.page_source
        
        # Convert to markdown for easier processing
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Convert to markdown
        markdown_converter = html2text.HTML2Text()
        markdown_converter.ignore_links = False
        markdown_content = markdown_converter.handle(str(soup))
        
        return markdown_content
    
    except Exception as e:
        print(f"Error fetching page: {e}")
        return None
    
    finally:
        if 'driver' in locals():
            driver.quit()

def extract_social_media_data(markdown_content, model="gpt-4o-mini"):
    """Extract structured data from the page content using OpenAI."""
    client = OpenAI()
    
    # Define what we want to extract
    system_message = """Extract information from social media posts in the following format:
    
    [
      {
        "Account_Name": "Name of the account that posted",
        "Followers_Count": "Number of followers (just the number)",
        "Post_Time": "When the post was made (e.g., '6mo', '4mo')",
        "Post_Text": "The complete text of the post",
        "Reactions_Count": "Number of reactions/likes (just the number)",
        "Comments_Count": "Number of comments (just the number)",
        "Reposts_Count": "Number of shares/reposts (just the number)"
      },
      // Additional posts...
    ]
    
    Return only the JSON array with no extra text. For missing information, use an empty string or 0."""
    
    # Call the API
    print(f"Extracting structured data using {model}...")
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": markdown_content}
        ],
        response_format={"type": "json_object"}
    )
    
    # Parse the response
    result_text = response.choices[0].message.content
    result_json = json.loads(result_text)
    
    # Convert to DataFrame
    if "posts" in result_json:
        df = pd.DataFrame(result_json["posts"])
    else:
        # If the model didn't use a "posts" key, assume the JSON is the array itself
        first_key = next(iter(result_json.keys()), None)
        if first_key:
            df = pd.DataFrame(result_json[first_key])
        else:
            # If no key structure, assume it's a direct array
            df = pd.DataFrame(result_json)
    
    # Calculate tokens and cost
    encoder = tiktoken.encoding_for_model(model)
    input_tokens = len(encoder.encode(markdown_content))
    output_tokens = len(encoder.encode(result_text))
    cost = (input_tokens * pricing[model]["input"]) + (output_tokens * pricing[model]["output"])
    
    print(f"Input tokens: {input_tokens}")
    print(f"Output tokens: {output_tokens}")
    print(f"Estimated cost: ${cost:.6f}")
    
    return {
        "dataframe": df,
        "json_data": result_json,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "cost": cost
    }

# Main function to run the scraper
def scrape_social_media(url, model="gpt-4o-mini"):
    """Scrape social media data from the provided URL."""
    # Fetch page content
    content = fetch_page_content(url)
    
    if content:
        # Extract structured data
        results = extract_social_media_data(content, model)
        
        # Save results to files
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Save JSON
        with open(f"social_media_data_{timestamp}.json", "w") as f:
            json.dump(results["json_data"], f, indent=2)
        
        # Save CSV
        results["dataframe"].to_csv(f"social_media_data_{timestamp}.csv", index=False)
        
        print(f"Data saved to social_media_data_{timestamp}.json and .csv")
        
        return results
    
    return None

In [8]:
import os
import time
import re
import json
from datetime import datetime
from typing import List, Dict, Type

import pandas as pd
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field, create_model
import html2text
import tiktoken

from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager  # Add this import

from openai import OpenAI

load_dotenv()  # Load environment variables from .env file

# Define the pricing for models
pricing = {
    "gpt-4o-mini": {
        "input": 0.150 / 1_000_000,  # $0.150 per 1M input tokens
        "output": 0.600 / 1_000_000,  # $0.600 per 1M output tokens
    },
    "gpt-4o-2024-08-06": {
        "input": 2.5 / 1_000_000,    # $2.50 per 1M input tokens
        "output": 10 / 1_000_000,    # $10.00 per 1M output tokens
    },
}

def setup_selenium():
    """Set up Selenium WebDriver with appropriate options using webdriver_manager."""
    options = Options()
    
    # Add headless option for server environments
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    
    # Use webdriver_manager to automatically download and manage the appropriate ChromeDriver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )
    return driver

def fetch_html_selenium(url):
    """Fetch HTML content from a URL using Selenium."""
    driver = setup_selenium()
    try:
        driver.get(url)
        
        # Add random delays to mimic human behavior
        time.sleep(5)
        
        # Simulate scrolling
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        
        html = driver.page_source
        return html
    finally:
        driver.quit()

def clean_html(html_content):
    """Clean HTML by removing headers and footers."""
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove headers and footers
    for element in soup.find_all(['header', 'footer']):
        element.decompose()

    return str(soup)

def html_to_markdown_with_readability(html_content):
    """Convert HTML to markdown with improved readability."""
    cleaned_html = clean_html(html_content)
    
    # Convert to markdown
    markdown_converter = html2text.HTML2Text()
    markdown_converter.ignore_links = False
    markdown_content = markdown_converter.handle(cleaned_html)
    
    return markdown_content

def create_dynamic_listing_model(field_names: List[str]) -> Type[BaseModel]:
    """Create a dynamic Pydantic model based on provided fields."""
    field_definitions = {field: (str, ...) for field in field_names}
    return create_model('DynamicListingModel', **field_definitions)

def create_listings_container_model(listing_model: Type[BaseModel]) -> Type[BaseModel]:
    """Create a container model for a list of listing models."""
    return create_model('DynamicListingsContainer', listings=(List[listing_model], ...))

def trim_to_token_limit(text, model, max_tokens=200000):
    """Trim text to stay within token limits."""
    encoder = tiktoken.encoding_for_model(model)
    tokens = encoder.encode(text)
    if len(tokens) > max_tokens:
        trimmed_text = encoder.decode(tokens[:max_tokens])
        return trimmed_text
    return text

def format_data(data, DynamicListingsContainer, model_used="gpt-4o-mini"):
    """Format the extracted data using OpenAI."""
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    system_message = """You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
                        from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text, 
                        with no additional commentary, explanations, or extraneous information.
                        
                        For social media posts, pay special attention to extracting:
                        - Account names and followers count
                        - Post timing (e.g., "6mo", "4mo")
                        - The complete post text (look for "...more" truncation indicators)
                        - Reaction counts (numbers preceding like/reaction icons)
                        - Comment counts (e.g., "1 comment" -> extract "1")
                        - Repost counts (e.g., "5 reposts" -> extract "5")
                        
                        Handle cases where some data might be missing, and try to extract as much as possible from each post.
                        Please process the following text and provide the output in pure JSON format with no words before or after the JSON:"""

    user_message = f"Extract the following information from the provided text:\nPage content:\n\n{data}"

    completion = client.beta.chat.completions.parse(
        model=model_used,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        response_format=DynamicListingsContainer
    )
    return completion.choices[0].message.parsed

def calculate_price(input_text, output_text, model="gpt-4o-mini"):
    """Calculate token usage and price."""
    encoder = tiktoken.encoding_for_model(model)
    
    input_token_count = len(encoder.encode(input_text))
    output_token_count = len(encoder.encode(output_text))
    
    input_cost = input_token_count * pricing[model]["input"]
    output_cost = output_token_count * pricing[model]["output"]
    total_cost = input_cost + output_cost
    
    return input_token_count, output_token_count, total_cost

def scrape_website(url, fields=None, model_used="gpt-4o-mini"):
    """Main function to scrape a website and return structured data."""
    # Generate timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # If fields are not specified, use default social media post fields
    if fields is None:
        fields = ['Account_Name', 'Followers_Count', 'Post_Time', 'Post_Text', 
                 'Reactions_Count', 'Comments_Count', 'Reposts_Count']
    
    # Scrape data
    print(f"Fetching HTML from {url}...")
    raw_html = fetch_html_selenium(url)
    
    print("Converting HTML to markdown...")
    markdown = html_to_markdown_with_readability(raw_html)
    
    # Create the dynamic listing model
    print(f"Creating model for fields: {fields}")
    DynamicListingModel = create_dynamic_listing_model(fields)
    DynamicListingsContainer = create_listings_container_model(DynamicListingModel)
    
    # Enhance the system message to better extract social media elements
    system_message = """You are an intelligent text extraction assistant specializing in social media posts.
    Extract the following information from each post in the provided content:
    
    1. Account_Name: The name of the posting account
    2. Followers_Count: Number of followers (convert text like "38,682 followers" to just "38,682")
    3. Post_Time: When the post was made (e.g., "6mo", "4mo")
    4. Post_Text: The full text of the post
    5. Reactions_Count: Number of reactions/likes (e.g., "178")
    6. Comments_Count: Number of comments (e.g., "1 comment" should be "1")
    7. Reposts_Count: Number of reposts (e.g., "5 reposts" should be "5")
    
    Extract this information for each post and provide output in pure JSON format.
    """
    
    # Format data with the enhanced system message
    print(f"Extracting structured data using {model_used}...")
    formatted_data = format_data(markdown, DynamicListingsContainer, model_used)
    
    # Convert formatted_data to text for token counting
    formatted_data_text = json.dumps(formatted_data.dict())
    
    # Calculate token usage and cost
    input_tokens, output_tokens, total_cost = calculate_price(markdown, formatted_data_text, model=model_used)
    
    # Create DataFrame
    formatted_data_dict = formatted_data.dict()
    data_for_df = next(iter(formatted_data_dict.values()))
    df = pd.DataFrame(data_for_df)
    
    # Print statistics
    print(f"Input token count: {input_tokens}")
    print(f"Output token count: {output_tokens}")
    print(f"Estimated total cost: ${total_cost:.4f}")
    
    return {
        'dataframe': df,
        'formatted_data': formatted_data,
        'markdown': markdown,
        'input_tokens': input_tokens,
        'output_tokens': output_tokens,
        'total_cost': total_cost,
        'timestamp': timestamp
    }

In [11]:
# Social Media Scraper Example

# Install required packages if needed
# !pip install openai selenium webdriver_manager tiktoken bs4 html2text pandas

# Import the simplified scraper functions
# If you're importing from a file, use:
# from scraper import fetch_page_content, extract_social_media_data, scrape_social_media

# Set your OpenAI API key
import os
os.environ['OPENAI_API_KEY'] = 'your-api-key-here'  # Replace with your actual API key

# Target URL - replace with the URL you want to scrape
url = "https://www.linkedin.com/company/iluka-resources/posts/"

# Run the scraper
print("Starting social media scraper...")
results = scrape_social_media(url)

# Display the extracted data
print("\nExtracted social media posts:")
display(results["dataframe"])  # For Jupyter notebooks

# Optional: Show posts with the most reactions
sorted_by_reactions = results["dataframe"].sort_values(by="Reactions_Count", ascending=False)
print("\nPosts sorted by reaction count:")
display(sorted_by_reactions.head())

# Optional: Analyze post text
print("\nPost text analysis:")
for idx, row in results["dataframe"].iterrows():
    print(f"Post {idx+1} length: {len(row['Post_Text'])} characters")
    print(f"Posted: {row['Post_Time']}")
    print(f"Engagement: {row['Reactions_Count']} reactions, {row['Comments_Count']} comments, {row['Reposts_Count']} reposts")
    print("-" * 50)

Starting social media scraper...
Error fetching page: [Errno 8] Exec format error: '/Users/williammasquelier/.wdm/drivers/chromedriver/mac64/133.0.6943.141/chromedriver-mac-x64/THIRD_PARTY_NOTICES.chromedriver'

Extracted social media posts:


TypeError: 'NoneType' object is not subscriptable

In [12]:
# Import required packages
import os
import json
import time
import csv
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
import html2text
import tiktoken

# For Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# For OpenAI
from openai import OpenAI

# Set your OpenAI API key
# os.environ['OPENAI_API_KEY'] = 'your-api-key-here'  # Uncomment and add your key if not using .env

# LinkedIn credentials - replace with your own
LINKEDIN_USERNAME = "your_email@example.com"
LINKEDIN_PASSWORD = "your_password"

# Model pricing
pricing = {
    "gpt-4o-mini": {
        "input": 0.150 / 1_000_000,
        "output": 0.600 / 1_000_000,
    },
    "gpt-4o-2024-08-06": {
        "input": 2.5 / 1_000_000,
        "output": 10 / 1_000_000,
    },
}

def login_to_linkedin(driver, username, password):
    """Log in to LinkedIn using provided credentials."""
    print("Logging in to LinkedIn...")
    driver.get("https://www.linkedin.com/login")
    
    # Enter username and password
    driver.find_element(By.ID, "username").send_keys(username)
    driver.find_element(By.ID, "password").send_keys(password)
    
    # Submit the form
    driver.find_element(By.CSS_SELECTOR, "button[type='submit']").click()
    
    # Wait for the feed to load
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.feed-shared-update-v2"))
        )
        print("Login successful")
    except Exception as e:
        print(f"Warning: Login page transition issue: {e}")
        # Continue anyway as sometimes the wait condition isn't perfect

def fetch_linkedin_content(url, username=LINKEDIN_USERNAME, password=LINKEDIN_PASSWORD, num_posts=10):
    """Fetch LinkedIn posts using your working driver configuration."""
    driver = webdriver.Chrome()  # Using your working driver configuration
    
    try:
        # Login to LinkedIn
        login_to_linkedin(driver, username, password)
        
        print(f"Navigating to {url}")
        driver.get(url)
        
        # Wait for the page to load
        time.sleep(5)
        
        # Scroll to load more content
        print("Scrolling to load more posts...")
        last_height = driver.execute_script("return document.body.scrollHeight")
        
        scroll_count = 0
        max_scrolls = 5  # Adjust based on how many posts you want to collect
        
        while scroll_count < max_scrolls:
            # Scroll down
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)  # Wait for content to load
            
            # Calculate new scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break  # Break if we've reached the bottom
                
            last_height = new_height
            scroll_count += 1
        
        # Get the HTML content
        html_content = driver.page_source
        
        # Convert to markdown for easier processing
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Convert to markdown
        markdown_converter = html2text.HTML2Text()
        markdown_converter.ignore_links = False
        markdown_content = markdown_converter.handle(str(soup))
        
        # Also collect raw post data as a backup
        print("Extracting post elements...")
        raw_posts = []
        post_elements = driver.find_elements(By.CSS_SELECTOR, "div.feed-shared-update-v2")
        
        for element in post_elements[:num_posts]:
            try:
                # Extract basic post data
                post_data = {}
                
                # Get account name
                try:
                    account_element = element.find_element(By.CSS_SELECTOR, "span.feed-shared-actor__name")
                    post_data["Account_Name"] = account_element.text.strip()
                except:
                    post_data["Account_Name"] = ""
                
                # Get post text
                try:
                    text_element = element.find_element(By.CSS_SELECTOR, "span.break-words")
                    post_data["Post_Text"] = text_element.text.strip()
                except:
                    post_data["Post_Text"] = ""
                
                # Get reactions count
                try:
                    reactions_element = element.find_element(By.CSS_SELECTOR, "span.social-details-social-counts__reactions-count")
                    post_data["Reactions_Count"] = reactions_element.text.strip()
                except:
                    post_data["Reactions_Count"] = "0"
                
                # Get comments count
                try:
                    comments_element = element.find_element(By.CSS_SELECTOR, "button.social-details-social-counts__comments")
                    post_data["Comments_Count"] = comments_element.text.strip().split()[0]
                except:
                    post_data["Comments_Count"] = "0"
                
                # Get post time
                try:
                    time_element = element.find_element(By.CSS_SELECTOR, "span.feed-shared-actor__sub-description")
                    post_data["Post_Time"] = time_element.text.strip()
                except:
                    post_data["Post_Time"] = ""
                
                # Add to raw posts
                raw_posts.append(post_data)
                print(f"Extracted post: {post_data['Post_Text'][:50]}...")
                
            except Exception as e:
                print(f"Error extracting post: {e}")
        
        return {
            "markdown_content": markdown_content,
            "raw_posts": raw_posts
        }
        
    except Exception as e:
        print(f"Error during LinkedIn scraping: {e}")
        return None
        
    finally:
        print("Closing browser...")
        driver.quit()

def extract_social_media_data(page_content, raw_posts=None, model="gpt-4o-mini"):
    """Extract structured data from the page content using OpenAI."""
    client = OpenAI()
    
    # Combine AI extraction with directly extracted data
    if raw_posts and len(raw_posts) > 0:
        print(f"Using {len(raw_posts)} directly extracted posts...")
        
        # Convert to DataFrame
        df = pd.DataFrame(raw_posts)
        
        # Clean up data formats
        for col in ["Reactions_Count", "Comments_Count"]:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col].str.replace(',', '').str.replace('K', '000'), errors='coerce').fillna(0).astype(int)
        
        # Add missing columns if needed
        if "Followers_Count" not in df.columns:
            df["Followers_Count"] = ""
        if "Reposts_Count" not in df.columns:
            df["Reposts_Count"] = 0
            
        return {
            "dataframe": df,
            "json_data": {"posts": raw_posts},
            "input_tokens": 0,
            "output_tokens": 0,
            "cost": 0
        }
    
    # If direct extraction failed or is incomplete, use OpenAI
    print(f"Extracting structured data using {model}...")
    
    # Define what we want to extract
    system_message = """Extract information from LinkedIn posts in the following format:
    
    {
      "posts": [
        {
          "Account_Name": "Name of the account that posted",
          "Followers_Count": "Number of followers (just the number)",
          "Post_Time": "When the post was made (e.g., '6mo', '4mo')",
          "Post_Text": "The complete text of the post",
          "Reactions_Count": "Number of reactions/likes (just the number)",
          "Comments_Count": "Number of comments (just the number)",
          "Reposts_Count": "Number of shares/reposts (just the number)"
        },
        // Additional posts...
      ]
    }
    
    Return only the JSON with no extra text. For missing information, use an empty string or 0."""
    
    # Call the API
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": page_content}
        ],
        response_format={"type": "json_object"}
    )
    
    # Parse the response
    result_text = response.choices[0].message.content
    result_json = json.loads(result_text)
    
    # Convert to DataFrame
    if "posts" in result_json:
        df = pd.DataFrame(result_json["posts"])
    else:
        # If the model didn't use a "posts" key, assume the JSON is the array itself
        first_key = next(iter(result_json.keys()), None)
        if first_key:
            df = pd.DataFrame(result_json[first_key])
        else:
            # If no key structure, assume it's a direct array
            df = pd.DataFrame(result_json)
    
    # Calculate tokens and cost
    encoder = tiktoken.encoding_for_model(model)
    input_tokens = len(encoder.encode(page_content))
    output_tokens = len(encoder.encode(result_text))
    cost = (input_tokens * pricing[model]["input"]) + (output_tokens * pricing[model]["output"])
    
    print(f"Input tokens: {input_tokens}")
    print(f"Output tokens: {output_tokens}")
    print(f"Estimated cost: ${cost:.6f}")
    
    return {
        "dataframe": df,
        "json_data": result_json,
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "cost": cost
    }

# Main function to run the scraper
def scrape_linkedin_posts(url, username=LINKEDIN_USERNAME, password=LINKEDIN_PASSWORD, model="gpt-4o-mini"):
    """Scrape LinkedIn posts from the provided URL."""
    # Fetch page content
    content_data = fetch_linkedin_content(url, username, password)
    
    if content_data:
        # Extract structured data (first try with directly extracted posts)
        results = extract_social_media_data(
            content_data["markdown_content"], 
            content_data["raw_posts"], 
            model
        )
        
        # Save results to files
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Save JSON
        with open(f"linkedin_data_{timestamp}.json", "w", encoding='utf-8') as f:
            json.dump(results["json_data"], f, indent=2)
        
        # Save CSV
        results["dataframe"].to_csv(f"linkedin_data_{timestamp}.csv", index=False, encoding='utf-8')
        
        print(f"Data saved to linkedin_data_{timestamp}.json and .csv")
        
        return results
    
    return None

In [13]:
# LinkedIn Posts Scraper in Jupyter Notebook

# Install required packages if needed
# !pip install selenium beautifulsoup4 html2text openai pandas tiktoken

# Set your LinkedIn credentials
LINKEDIN_USERNAME = "william.masquelier93@gmail.com"  # Your LinkedIn email
LINKEDIN_PASSWORD = "6868Alexis"  # Your LinkedIn password


# Import the scraper functions (you can copy-paste the entire integrated-scraper code here)
# OR if you saved the code as a module:
# from linkedin_scraper import scrape_linkedin_posts, fetch_linkedin_content, extract_social_media_data

# URL to scrape
url = "https://www.linkedin.com/company/iluka-resources/posts/?feedView=all"

# Run the scraper
print("Starting LinkedIn scraper...")
results = scrape_linkedin_posts(url, LINKEDIN_USERNAME, LINKEDIN_PASSWORD)

# Display the extracted data
if results:
    print("\nExtracted LinkedIn posts:")
    display(results["dataframe"])  # Jupyter display function
    
    # Show statistics
    print(f"\nTotal posts extracted: {len(results['dataframe'])}")
    
    # If we used the OpenAI extraction (not direct)
    if results["input_tokens"] > 0:
        print(f"OpenAI tokens used: {results['input_tokens']} input, {results['output_tokens']} output")
        print(f"Estimated OpenAI cost: ${results['cost']:.6f}")
    
    # Optional: Show posts with most engagement
    if "Reactions_Count" in results["dataframe"].columns:
        sorted_posts = results["dataframe"].sort_values(by="Reactions_Count", ascending=False)
        print("\nTop posts by engagement:")
        display(sorted_posts.head(3))
    
    # Analyze post content (example analysis)
    print("\nPost text analysis:")
    post_lengths = results["dataframe"]["Post_Text"].str.len()
    print(f"Average post length: {post_lengths.mean():.1f} characters")
    print(f"Longest post: {post_lengths.max()} characters")
    print(f"Shortest post: {post_lengths.min()} characters")
else:
    print("Failed to extract LinkedIn posts. Check your credentials and URL.")

Starting LinkedIn scraper...


Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


Logging in to LinkedIn...
Login successful
Navigating to https://www.linkedin.com/company/iluka-resources/posts/?feedView=all
Scrolling to load more posts...
Extracting post elements...
Extracted post: Iluka Resources employees from the Perth office ha...
Extracted post: We've been a proud partner of Foodbank WA since 20...
Extracted post: Iluka Resources was proud to be a Silver Sponsor o...
Extracted post: Iluka’s partnership with Yamatji Southern Regional...
Extracted post: Iluka’s Jacinth-Ambrosia operation in South Austra...
Extracted post: On behalf of Iluka Resources and our South West te...
Extracted post: Iluka’s Adelaide-based team members walked, ran an...
Extracted post: Iluka Resources is proud to partner with the Clont...
Extracted post: Representatives from Iluka’s Perth corporate offic...
Extracted post: Iluka Resources is committed to supporting and del...
Closing browser...
Using 10 directly extracted posts...
Data saved to linkedin_data_20250227_134602.json and .csv


Unnamed: 0,Account_Name,Post_Text,Reactions_Count,Comments_Count,Post_Time,Followers_Count,Reposts_Count
0,,Iluka Resources employees from the Perth offic...,270,0,,,0
1,,We've been a proud partner of Foodbank WA sinc...,47,0,,,0
2,,Iluka Resources was proud to be a Silver Spons...,210,0,,,0
3,,Iluka’s partnership with Yamatji Southern Regi...,123,0,,,0
4,,Iluka’s Jacinth-Ambrosia operation in South Au...,146,0,,,0
5,,On behalf of Iluka Resources and our South Wes...,90,0,,,0
6,,"Iluka’s Adelaide-based team members walked, ra...",84,0,,,0
7,,Iluka Resources is proud to partner with the C...,243,0,,,0
8,,Representatives from Iluka’s Perth corporate o...,315,0,,,0
9,,Iluka Resources is committed to supporting and...,180,0,,,0



Total posts extracted: 10

Top posts by engagement:


Unnamed: 0,Account_Name,Post_Text,Reactions_Count,Comments_Count,Post_Time,Followers_Count,Reposts_Count
8,,Representatives from Iluka’s Perth corporate o...,315,0,,,0
0,,Iluka Resources employees from the Perth offic...,270,0,,,0
7,,Iluka Resources is proud to partner with the C...,243,0,,,0



Post text analysis:
Average post length: 447.9 characters
Longest post: 846 characters
Shortest post: 232 characters
