In [1]:
import html
import http.client
import requests

import time
import json
import re
from functools import reduce
import os
import pathlib
from argparse import ArgumentParser

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [9]:
def findAndExtract(soupElement, tag = 'div', **kwargs):
  element = soupElement.find(tag, **kwargs)
  return element.contents[0].strip()

def requestTitle(ID, verbose = True):
  URL = f"https://pantip.com/topic/{ID}"
  page = requests.get(URL)
  
  pageSoup = BeautifulSoup(page.content, 'html.parser')

  titleSoup = pageSoup.find(class_='main-post-inner')

  posterName = findAndExtract(titleSoup, 'a', class_='display-post-name owner')
  title = findAndExtract(titleSoup, 'h2', class_='display-post-title')
  content = findAndExtract(titleSoup, 'div', class_='display-post-story')
  if(verbose):
    print(f'  ├─ Post title : {title}')
  
  return (content, posterName)

In [10]:
def cleanHTML(string):
  return html.unescape(string.strip()).replace('<br />', '')

def getComment(ID, page = 1):
  conn = http.client.HTTPSConnection("pantip.com")
  payload = ""

  headers = {
      'X-Requested-With': "XMLHttpRequest",
      'Referer': f"https://pantip.com/topic/{ID}"
      }

  conn.request("GET", f"/forum/topic/render_comments?tid={ID}&page={page}&param=page{page}&parent={page}", payload, headers)

  res = conn.getresponse()
  data = res.read()
  response = json.loads(data)
  return response

def requestComments(ID, verbose = True):
  response = getComment(ID)

  amount = response['count']
  pages = response['paging']['ed']['max']
  commentList = []
  for comment in response['comments']:
    commentList.append((cleanHTML(comment['message']), cleanHTML(comment['user']['name'])))
  
  for i in range(1, pages):
    response = getComment(ID, i + 1)
    for comment in response['comments']:
      commentList.append((cleanHTML(comment['message']), cleanHTML(comment['user']['name'])))

  if(verbose):
    print(f'  ├─ {amount} comments')
    
  return commentList

In [11]:
def queryPostAsDataframe(url, verbose = True):
  postID = re.findall('[0-9]+', url)[-1]
  title = requestTitle(postID, verbose)
  comments = requestComments(postID, verbose)

  comments.insert(0, title)
  fr = np.array(comments)
  df = pd.DataFrame(fr, columns=['Message', 'Poster'])

  return df

In [12]:
def scrapFile(path, verbose = True, toCSV = False, outputDir = "./results"):
  if(pathlib.Path(path).suffix == '.csv'):
    src = pd.read_csv(path, header=None)
  else:
    src = pd.read_excel(path, header=None)
  
  totalLinks = len(src)
  dfs = []
  for i in src.index:
    url = src[0][i]
    if(verbose):
      print(f'Scanning ({i+1}/{totalLinks}) {url}...')
    timeStart = time.time()

    df = queryPostAsDataframe(url, verbose)
    if(toCSV):
      fname = f'{outputDir}/{i}.csv'
      if(not os.path.exists(outputDir)):
        os.makedirs(outputDir)
      
      df.to_csv(fname, index=False)
      if(verbose):
        elapsed = time.time() - timeStart
        print(f'  ╘══ ✓ Scrap complete in {elapsed:.2f}s, csv saved at {fname}')
    dfs.append(df)
  
  print(f'Scrap {totalLinks} links completed')
  return dfs

In [13]:
dfs = scrapFile('./pantip urls.xlsx', toCSV=True)

Scanning (1/5) https://pantip.com/topic/41506843...
  ├─ Post title : พนักงานที่ใจเย็น   คุมสติได้  หาทางแก้ปัญหาให้ลูกค้าที่ใจร้อนได้อย่างนุ่มนวล  เขาทำได้โดยธรรมชาติหรือต้องฝืนครับ
  ├─ 36 comments
  ╘══ ✓ Scrap complete in 3.78s, csv saved at ./results/0.csv
Scanning (2/5) https://pantip.com/topic/41497035...
  ├─ Post title : เรียนจบมายังไม่ถึงปีแต่ที่บ้านก็เร่งให้หางานทำ
  ├─ 149 comments
  ╘══ ✓ Scrap complete in 4.41s, csv saved at ./results/1.csv
Scanning (3/5) https://pantip.com/topic/41505149...
  ├─ Post title : ฉันไปเรียนขับรถยนต์มา แต่มันทำให้ฉันรู้สึกแย่
  ├─ 88 comments
  ╘══ ✓ Scrap complete in 1.75s, csv saved at ./results/2.csv
Scanning (4/5) https://pantip.com/topic/41505076...
  ├─ Post title : เเม่ไม่ยอมปล่อยลูกตามมาอยู่หอด้วย​ งงมาก
  ├─ 101 comments
  ╘══ ✓ Scrap complete in 0.64s, csv saved at ./results/3.csv
Scanning (5/5) https://pantip.com/topic/41507095...
  ├─ Post title : คบกับแฟนมา 3 เดือน พึ่งรู้ว่าแฟนเป็นโรคซึมเศร้า ควรจะไปต่อหรือพอแค่นี้
  ├─ 128 comme

In [14]:
# CLI usage
if(__name__ == '__main__'):
    parser = ArgumentParser()
    parser.add_argument('-i', help='Path to xlsx or csv file')
    parser.add_argument('-o', help='Output directory')
    parser.add_argument('--verbose', help='Print progress',
                        action="store_true")
    
    args = parser.parse_args()
    if args.i == None:
        print('Please enter input file')
        exit()
    ipath = args.i
    opath = './results' if args.o == None else args.o
    verbose = args.verbost != None
    scrapFile(ipath, verbose = verbose, to_csv = True, output_folder = opath)

usage: ipykernel_launcher.py [-h] [-i I] [-o O] [--verbose]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"8da83690-f327-4e88-a17c-cfe061d1563b" --shell=9002 --transport="tcp" --iopub=9004 --f=c:\Users\tanas\AppData\Roaming\jupyter\runtime\kernel-v2-17152Ne0n7h5f07Ov.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
