# Import

In [2]:
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm 
import re

# Function

In [3]:
def _extract_topic_index(url):
    """Return the numeric topic ID found after '/topic/' in ``url``, or None if there is none."""
    match = re.search(r'/topic/(\d+)', url)
    return match.group(1) if match else None


def _parse_profile_header(soup):
    """Extract the profile header fields from a parsed profile page.

    Every key is always present in the result, so callers get the same shape
    whether or not the page contained the corresponding element.

    Args:
        soup: BeautifulSoup tree of the profile page.

    Returns:
        dict with keys:
            'user_name'   -- text of the <h3> in the title block, or None.
            'user_avatar' -- ``src`` of the avatar <img>, or None.
            'user_bio'    -- list of the <span> elements found in the bio block
                             (Tag objects, not text); empty list if no bio block.
            'user_desc'   -- text of the 'profile-desc' paragraph, or None.
            'follower'    -- digits following 'กำลังติดตาม', as a string; '0' if absent.
            'following'   -- digits following a standalone 'ติดตาม', as a string; '0' if absent.
    """
    profile = {
        'user_name': None,
        'user_avatar': None,
        'user_bio': [],
        'user_desc': None,
        'follower': '0',
        'following': '0',
    }

    title_block = soup.find('div', class_='b-block-title')
    name_tag = title_block.find('h3') if title_block else None
    if name_tag:
        profile['user_name'] = name_tag.get_text(strip=True)

    content = soup.find('div', class_='b-block-content')
    if not content:
        return profile

    avatar = content.find('div', class_='big_avatar')
    avatar_img = avatar.find('img') if avatar else None
    if avatar_img:
        profile['user_avatar'] = avatar_img.get('src')

    # The bio block is optional; without this guard a profile with no bio
    # raised AttributeError on None.
    bio = content.find('div', class_='profile-bio small-txt-fixed')
    if bio:
        profile['user_bio'] = bio.find_all('span')
        desc = bio.find('p', class_='profile-desc')
        if desc:
            profile['user_desc'] = desc.get_text(strip=True)

    stats = content.find('div', class_='profile-stat')
    if stats:
        # get_text(strip=True) joins label and number with no separator,
        # which is what the patterns below rely on.
        stats_text = stats.get_text(strip=True)
        # 'ติดตาม' is a suffix of 'กำลังติดตาม'; the negative lookbehind stops the
        # "following" pattern from matching inside the "follower" label.
        following = re.search(r'(?<!กำลัง)ติดตาม(\d+)', stats_text)
        follower = re.search(r'กำลังติดตาม(\d+)', stats_text)
        # NOTE(review): 'กำลังติดตาม' reads as "is following", yet it feeds the
        # 'follower' key. Mapping kept as originally written -- confirm against
        # the live page before relying on these two labels.
        if follower:
            profile['follower'] = follower.group(1)
        if following:
            profile['following'] = following.group(1)

    return profile


def _parse_post_item(post):
    """Parse one 'post-item' element.

    Returns:
        (key, record) where ``key`` is the topic ID taken from the post URL
        (or the URL itself when no ID can be extracted) and ``record`` is a
        dict with 'title', 'url', 'owner', 'timestamp', 'comment_count',
        'tag' and 'tag_url'. Returns None when the post has no title link.
    """
    title_div = post.find('div', class_='post-item-title')
    link = title_div.find('a') if title_div else None
    post_url = link.get('href') if link else None
    if not post_url:
        return None

    title = link.get_text(strip=True)

    # Author name and timestamp both live inside the 'post-item-by' element.
    owner_text = None
    timestamp_text = None
    owner_tag = post.find('div', class_='post-item-by')
    if owner_tag:
        owner = owner_tag.find('span', class_='by-name')
        owner_text = owner.get_text(strip=True) if owner else None

        timestamp = owner_tag.find('span', class_='timestamp')
        abbr = timestamp.find('abbr', class_='timeago') if timestamp else None
        timestamp_text = abbr.get('data-utime') if abbr else None

    # The comment counter is the status element whose title mentions 'ความคิดเห็น'.
    comment_status = post.find(
        'div',
        class_='post-item-status-i',
        title=lambda x: 'ความคิดเห็น' in x if x else False,
    )
    comment_text = comment_status.get_text(strip=True) if comment_status else '0'

    # Only the first tag link is recorded; the footer itself may be missing.
    tag_title = None
    tag_url = None
    footer = post.find('div', class_='post-item-footer')
    tag_list = footer.find('div', class_='post-item-taglist') if footer else None
    tag_element = tag_list.find('a', class_='tag-title') if tag_list else None
    if tag_element:
        tag_title = tag_element.get('data-tag')
        tag_url = tag_element.get('href')

    # Fall back to the URL so posts without a '/topic/<id>' URL do not all
    # overwrite each other under a None key.
    key = _extract_topic_index(post_url) or post_url

    record = {
        'title': title,
        'url': post_url,
        'owner': owner_text,
        'timestamp': timestamp_text,
        'comment_count': comment_text,
        'tag': tag_title,
        'tag_url': tag_url,
    }
    return key, record


def _parse_profile_feed(soup):
    """Collect every post listed inside 'post-list-wrapper', keyed by topic ID."""
    feed = {}
    wrapper = soup.find('div', class_='post-list-wrapper')
    if not wrapper:
        return feed

    post_items = wrapper.find_all('div', class_='post-item')
    # tqdm gives a progress bar while the posts are parsed.
    for post in tqdm(post_items, desc="Processing posts"):
        parsed = _parse_post_item(post)
        if parsed is not None:
            key, record = parsed
            feed[key] = record
    return feed


def scrape_pantip_profile(user_id, wait_seconds=0.5):
    """Scrape a Pantip user profile page and the topics listed on it.

    The page is loaded in headless Chrome via Selenium, then the resulting
    HTML is parsed with BeautifulSoup.

    Args:
        user_id: Profile ID interpolated into the profile URL.
        wait_seconds: Seconds to sleep after loading the page before the HTML
            is read. Defaults to 0.5; raise it if the topic list comes back empty.

    Returns:
        dict with the profile header fields ('user_name', 'user_avatar',
        'user_bio', 'user_desc', 'follower', 'following') plus
        'profile_feed', a dict mapping topic ID to a record with 'title',
        'url', 'owner', 'timestamp', 'comment_count', 'tag' and 'tag_url'.
        Header fields that are not found keep their defaults (None, [] or '0').
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        url = f'https://pantip.com/profile/{user_id}#topics'
        driver.get(url)
        time.sleep(wait_seconds)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so no Chrome process is left behind.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    user_profile_information = _parse_profile_header(soup)
    user_profile_information['profile_feed'] = _parse_profile_feed(soup)
    return user_profile_information

In [4]:
# The profile URL is no longer hard-coded here: scrape_pantip_profile builds it from user_id.
# url = 'https://pantip.com/profile/6905372#topics'

In [5]:
# A single profile ID for now; swap in a list and loop over it to scrape several users.
user_id = "6905372"

In [6]:
# Scrape the profile page for the configured user (launches headless Chrome).
user_profile_information = scrape_pantip_profile(user_id=user_id)

Processing posts:   0%|          | 0/19 [00:00<?, ?it/s]

In [9]:
# Look up one scraped topic by its ID (the keys of 'profile_feed' are topic IDs).
# The ID is hard-coded, so this raises KeyError if that topic is not in the scraped feed.
user_profile_information['profile_feed']['42783101']

{'title': 'เปิด 7 เหตุผลที่ผู้ชมรอดูละคร "หวานรักต้องห้าม" พ.ศ. 2567',
 'url': 'https://pantip.com/topic/42783101',
 'owner': 'สมาชิกหมายเลข 6905372',
 'timestamp': '06/15/2024 22:12:48',
 'comment_count': '33',
 'tag': '3 HD (BEC)',
 'tag_url': '/tag/3_HD_(BEC)'}

In [10]:
# Print every scraped topic: a "Topic ID" header line followed by its record.
for topic_id, topic in user_profile_information['profile_feed'].items():
    print(f"Topic ID: {topic_id}")
    print(topic)
    print("\n")

Topic ID: 42783101
{'title': 'เปิด 7 เหตุผลที่ผู้ชมรอดูละคร "หวานรักต้องห้าม" พ.ศ. 2567', 'url': 'https://pantip.com/topic/42783101', 'owner': 'สมาชิกหมายเลข 6905372', 'timestamp': '06/15/2024 22:12:48', 'comment_count': '33', 'tag': '3 HD (BEC)', 'tag_url': '/tag/3_HD_(BEC)'}


Topic ID: 42743716
{'title': 'ติเพื่อก่อ "จนกว่าจะได้รักกัน" ขอส่งสารไปถึงช่อง 3 จากแฟนช่องคนนี้', 'url': 'https://pantip.com/topic/42743716', 'owner': 'สมาชิกหมายเลข 6905372', 'timestamp': '05/29/2024 16:57:57', 'comment_count': '45', 'tag': '3 HD (BEC)', 'tag_url': '/tag/3_HD_(BEC)'}


Topic ID: 42688621
{'title': 'ขวัญฤทัย ปลุกความคึกคักให้ละครไทยสุดสัปดาห์', 'url': 'https://pantip.com/topic/42688621', 'owner': 'สมาชิกหมายเลข 6905372', 'timestamp': '05/04/2024 21:53:13', 'comment_count': '22', 'tag': '3 HD (BEC)', 'tag_url': '/tag/3_HD_(BEC)'}


Topic ID: 42521503
{'title': 'NAT เล่นท่ายาก แต่คะแนนนำโด่ง 10 10 10', 'url': 'https://pantip.com/topic/42521503', 'owner': 'สมาชิกหมายเลข 6905372', 'timestamp': '02