<a href="https://colab.research.google.com/github/Sidhtang/HOME..LLC/blob/main/linkdein_crwaler_anlysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install linkedin-api

Collecting linkedin-api
  Downloading linkedin_api-2.3.0-py3-none-any.whl.metadata (14 kB)
Collecting lxml<6.0.0,>=5.3.0 (from linkedin-api)
  Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading linkedin_api-2.3.0-py3-none-any.whl (26 kB)
Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lxml, linkedin-api
  Attempting uninstall: lxml
    Found existing installation: lxml 4.9.4
    Uninstalling lxml-4.9.4:
      Successfully uninstalled lxml-4.9.4
Successfully installed linkedin-api-2.3.0 lxml-5.3.0


In [None]:
import random
import re
import time
from datetime import datetime
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class LinkedInScraper:
    """Selenium-driven scraper that logs in to LinkedIn and collects profile data.

    A headless Chrome session is started on construction. ``process_profiles``
    is the main entry point: it logs in, visits every profile URL, and closes
    the browser when it is done.
    """

    # First number in a human-formatted count, with an optional K/M suffix
    # attached directly to the digits ("1,234", "500+", "1.5K", "2M").
    # The suffix must end at a word boundary so that "12 mutual" or "3min"
    # is not read as 12 million / 3 million.
    _COUNT_RE = re.compile(r"(\d[\d,]*(?:\.\d+)?)([KkMm]\b)?")
    _SUFFIX_MULTIPLIERS = {"k": 1000, "m": 1000000}

    def __init__(self, email: str, password: str):
        """
        Initialize LinkedIn scraper with login credentials
        Args:
            email (str): LinkedIn login email
            password (str): LinkedIn login password
        """
        self.email = email
        self.password = password
        self.driver = None
        self.setup_driver()

    def setup_driver(self) -> None:
        """Create a headless Chrome driver and store it on ``self.driver``."""
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run in headless mode
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-notifications')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')

        # Pick one of the predefined user agents at random for this session
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        ]
        chrome_options.add_argument(f'user-agent={random.choice(user_agents)}')

        self.driver = webdriver.Chrome(options=chrome_options)

    def close(self) -> None:
        """Quit the browser session if one is open; safe to call repeatedly."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the reference so a later call can start a fresh session
                # instead of talking to a driver that has already quit.
                self.driver = None

    def login(self) -> bool:
        """
        Log in to LinkedIn with the stored credentials
        Returns:
            bool: True when the login form was submitted and the browser is no
                longer on a login/checkpoint page, False otherwise (including
                any exception raised while driving the form).
        """
        try:
            self.driver.get('https://www.linkedin.com/login')
            time.sleep(random.uniform(2, 4))  # Random delay

            # Enter email
            email_elem = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "username"))
            )
            email_elem.send_keys(self.email)

            # Enter password
            password_elem = self.driver.find_element(By.ID, "password")
            password_elem.send_keys(self.password)

            # Click login button
            login_button = self.driver.find_element(By.CSS_SELECTOR, "[type='submit']")
            login_button.click()

            time.sleep(random.uniform(3, 5))  # Wait for login to complete

            # Submitting the form is not proof of success.
            # NOTE(review): assumes a rejected or challenged login leaves the
            # browser on a /login or /checkpoint URL - confirm against current
            # LinkedIn behaviour.
            current_url = self.driver.current_url
            if "/login" in current_url or "/checkpoint" in current_url:
                print(f"Login failed: still on {current_url}")
                return False

            return True
        except Exception as e:
            print(f"Login failed: {str(e)}")
            return False

    def get_profile_data(self, profile_url: str) -> Dict[str, Any]:
        """
        Scrape profile data from LinkedIn
        Args:
            profile_url (str): LinkedIn profile URL
        Returns:
            Dict[str, Any]: Dictionary containing profile data, or an empty
                dict when the page times out or any other error occurs
        """
        try:
            self.driver.get(profile_url)
            time.sleep(random.uniform(2, 4))  # Random delay between requests

            # Wait for main content to load
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pv-top-card"))
            )

            # Snapshot the profile page before _get_recent_posts() clicks away
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            # Extract basic profile information
            profile_data = {
                'url': profile_url,
                'name': self._safe_extract(soup, "h1.text-heading-xlarge"),
                'headline': self._safe_extract(soup, "div.text-body-medium"),
                'company': self._safe_extract(soup, "span.pv-text-details__right-panel-item-text"),
                'location': self._safe_extract(soup, "span.text-body-small.inline"),
                'about': self._safe_extract(soup, "div.pv-shared-text-with-see-more"),
                'followers': self._extract_followers(soup),
                'connections': self._extract_connections(soup),
                'posts': self._get_recent_posts()
            }

            return profile_data

        except TimeoutException:
            print(f"Timeout while loading profile: {profile_url}")
            return {}
        except Exception as e:
            print(f"Error scraping profile {profile_url}: {str(e)}")
            return {}

    def _safe_extract(self, soup: "BeautifulSoup", selector: str) -> str:
        """Return the stripped text of the first CSS match, or "" if none/error."""
        try:
            element = soup.select_one(selector)
            return element.get_text(strip=True) if element else ""
        except Exception:
            return ""

    @classmethod
    def _parse_count(cls, text: str) -> int:
        """
        Parse the first human-formatted count found in ``text``
        Args:
            text (str): Text such as "1,234 followers", "500+ connections"
                or "1.5K"
        Returns:
            int: The parsed count (thousands separators removed, K/M suffix
                expanded), or 0 when ``text`` is empty or holds no number
        """
        match = cls._COUNT_RE.search(text or "")
        if match is None:
            return 0
        number = float(match.group(1).replace(",", ""))
        suffix = (match.group(2) or "").lower()
        # round() guards against float artefacts such as 2.3 * 1000
        return int(round(number * cls._SUFFIX_MULTIPLIERS.get(suffix, 1)))

    def _extract_count(self, soup: "BeautifulSoup", keyword: str) -> int:
        """Parse the count from the first text node containing ``keyword``.

        The keyword match is case-insensitive. Returns 0 when no text node
        matches or the node holds no number.
        """
        try:
            # `string=` is the current name of the argument bs4 used to call `text=`
            node = soup.find(string=lambda s: bool(s) and keyword in s.lower())
        except (AttributeError, TypeError):
            return 0
        return self._parse_count(str(node)) if node else 0

    def _extract_followers(self, soup: "BeautifulSoup") -> int:
        """Extract number of followers (0 if it cannot be determined)"""
        return self._extract_count(soup, 'followers')

    def _extract_connections(self, soup: "BeautifulSoup") -> int:
        """Extract number of connections (0 if it cannot be determined)"""
        return self._extract_count(soup, 'connections')

    def _get_recent_posts(self) -> List[Dict[str, Any]]:
        """
        Collect data for up to five posts from the profile's activity page
        Returns:
            List[Dict[str, Any]]: One dict per post with 'timestamp', 'text',
                'likes' and 'comments'; empty if the posts link is missing or
                any error occurs while loading the posts
        """
        posts = []
        try:
            # Click "Posts" tab if it exists
            posts_tab = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='recent-activity/shares/']"))
            )
            posts_tab.click()
            time.sleep(random.uniform(2, 3))

            # Get post elements
            post_elements = self.driver.find_elements(By.CSS_SELECTOR, "div.feed-shared-update-v2")

            for post in post_elements[:5]:  # Only the first five matched elements
                try:
                    post_data = {
                        'timestamp': self._safe_find_element(post, "span.feed-shared-actor__sub-description"),
                        'text': self._safe_find_element(post, "div.feed-shared-text"),
                        'likes': self._extract_reaction_count(post, "likes"),
                        'comments': self._extract_reaction_count(post, "comments")
                    }
                    posts.append(post_data)
                except Exception:
                    # Best effort: skip a post that cannot be read
                    continue

        except Exception as e:
            print(f"Error getting posts: {str(e)}")

        return posts

    def _safe_find_element(self, element, selector: str) -> str:
        """Return the stripped text of a child element, or "" if it is absent"""
        try:
            return element.find_element(By.CSS_SELECTOR, selector).text.strip()
        except NoSuchElementException:
            return ""

    def _extract_reaction_count(self, post_element, reaction_type: str) -> int:
        """
        Extract reaction count (likes or comments) from post
        Args:
            post_element: Selenium element for a single post
            reaction_type (str): "likes" selects the reactions counter; any
                other value selects the comments counter
        Returns:
            int: Parsed count, or 0 if the counter is missing or unreadable
        """
        if reaction_type == "likes":
            selector = "button.social-details-social-counts__reactions-count"
        else:
            selector = "button.social-details-social-counts__comments-count"
        try:
            element = post_element.find_element(By.CSS_SELECTOR, selector)
            return self._parse_count(element.text.strip())
        except Exception:
            # Best effort: a missing or stale counter counts as zero
            return 0

    def process_profiles(self, profile_urls: List[str]) -> pd.DataFrame:
        """
        Process multiple LinkedIn profiles
        Args:
            profile_urls (List[str]): List of LinkedIn profile URLs
        Returns:
            pd.DataFrame: DataFrame containing profile data (one row per
                profile that returned data)
        Raises:
            RuntimeError: If the login step fails
        """
        all_data = []

        try:
            # A previous call closes the browser; start a new one if needed
            if self.driver is None:
                self.setup_driver()

            if not self.login():
                raise RuntimeError("Failed to login to LinkedIn")

            for url in profile_urls:
                try:
                    print(f"Processing profile: {url}")
                    profile_data = self.get_profile_data(url)
                    if profile_data:
                        all_data.append(profile_data)

                    # Random delay between profiles
                    time.sleep(random.uniform(3, 7))

                except Exception as e:
                    print(f"Error processing profile {url}: {str(e)}")
                    continue

        finally:
            self.close()

        return pd.DataFrame(all_data)

def analyze_profiles(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Summarize scraped profile data
    Args:
        df (pd.DataFrame): One row per profile; the 'location', 'followers',
            'connections' and 'company' columns are read, plus 'posts' when
            that column is present
    Returns:
        Dict[str, Any]: Empty dict for an empty frame; otherwise profile
            totals and distributions, with post engagement statistics added
            when at least one post was found
    """
    if df.empty:
        return {}

    summary: Dict[str, Any] = {}
    summary['total_profiles'] = len(df)
    summary['locations'] = df['location'].value_counts().to_dict()
    summary['avg_followers'] = df['followers'].mean()
    summary['avg_connections'] = df['connections'].mean()
    summary['companies'] = df['company'].value_counts().to_dict()

    # Without a posts column there is nothing more to compute
    if 'posts' not in df.columns:
        return summary

    # Flatten every profile's post list into one list of engagement records
    engagement = [
        {'likes': entry.get('likes', 0), 'comments': entry.get('comments', 0)}
        for profile_posts in df['posts'] if profile_posts
        for entry in profile_posts
    ]

    if engagement:
        engagement_df = pd.DataFrame(engagement)
        likes = engagement_df['likes']
        comments = engagement_df['comments']
        summary['avg_post_likes'] = likes.mean()
        summary['avg_post_comments'] = comments.mean()
        summary['max_post_likes'] = likes.max()
        summary['max_post_comments'] = comments.max()

    return summary

def main(urls: List[str], email: str, password: str):
    """
    Scrape the given profiles, analyze them and print a report
    Args:
        urls (List[str]): List of LinkedIn profile URLs
        email (str): LinkedIn login email
        password (str): LinkedIn login password

    Any exception raised along the way is caught and reported on stdout.
    """
    try:
        # Scrape every profile into a DataFrame
        df = LinkedInScraper(email, password).process_profiles(urls)

        if df.empty:
            print("No data was collected. Please check your login credentials and URLs.")
            return

        analysis = analyze_profiles(df)

        print("\nAnalysis Results:")
        print(f"Total Profiles Analyzed: {analysis['total_profiles']}")

        # The two distribution sections share one layout
        distribution_sections = (
            ("\nLocation Distribution:", 'locations'),
            ("\nCompany Distribution:", 'companies'),
        )
        for heading, key in distribution_sections:
            print(heading)
            for label, count in analysis[key].items():
                print(f"  {label}: {count}")

        print("\nEngagement Metrics:")
        print(f"  Average Followers: {analysis['avg_followers']:.2f}")
        print(f"  Average Connections: {analysis['avg_connections']:.2f}")

        # Post statistics exist only when at least one post was collected
        if 'avg_post_likes' not in analysis:
            return

        print("\nPost Engagement:")
        print(f"  Average Likes per Post: {analysis['avg_post_likes']:.2f}")
        print(f"  Average Comments per Post: {analysis['avg_post_comments']:.2f}")
        print(f"  Highest Likes on a Post: {analysis['max_post_likes']}")
        print(f"  Highest Comments on a Post: {analysis['max_post_comments']}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    # Example usage: fill in the profile URLs and credentials below, then run.
    # With an empty URL list, main() still starts the browser and attempts a
    # login before reporting that no data was collected.
    linkedin_urls = [
        # Your list of LinkedIn URLs here
    ]
    # Placeholder credentials, passed unchanged to main().
    # NOTE(review): avoid committing real credentials to a shared notebook.
    linkedin_email = "your_email@example.com"
    linkedin_password = "your_password"

    main(linkedin_urls, linkedin_email, linkedin_password)