# Import

In [1]:
import requests
import pandas as pd
import json
from tqdm.notebook import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle

# Single Search

In [2]:
import requests

# Headers sent with the search request; they imitate a desktop Chrome session on pantip.com.
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9',
    'content-type': 'application/json;charset=UTF-8',
    'origin': 'https://pantip.com',
    'ptauthorize': 'Basic dGVzdGVyOnRlc3Rlcg==',
    'referer': 'https://pantip.com/',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
}

# JSON payload for the search/query endpoint.
json_data = {
    'keyword': 'Passive Income', # Search by keyword
    'limit': 10,
    'type': 'all',
    'show_btn_search': False,
    # 'room_search': None,
}

# A timeout is passed explicitly: without one, requests waits indefinitely
# when the server stops answering and the cell never finishes.
response = requests.post(
    'https://pantip.com/api/search-service/search/query',
    headers=headers,
    json=json_data,
    timeout=30,
)

In [3]:
# Display the response object; its repr shows the HTTP status code.
response

<Response [200]>

In [4]:
# Decode the response body as JSON and display it.
response.json()

{'success': True,
 'data': [{'title': 'passive income',
   'type': 'member',
   'type_th': 'สมาชิก',
   'sub_type': None,
   'avatar': 'https://ptcdn.info/search/icon/member.png',
   'slug': None,
   'url': 'https://pantip.com/profile/368650',
   'forum_tag': None,
   'timestamp': None,
   'id': '368650'}]}

# Function

In [1]:
# Headers passed to fetch_page / fetch_all_pages below; the values imitate a
# desktop Chrome browser on macOS.
# NOTE(review): this rebinds the `headers` name already assigned in the
# "Single Search" cell, so whichever cell ran last wins.
# NOTE(review): 'ptauthorize' is a hard-coded HTTP Basic value (base64 of
# "tester:tester") — presumably the site's public client credential; confirm
# before sharing the notebook.
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'en-US,en;q=0.9',
    'content-type': 'application/json;charset=UTF-8',
    'origin': 'https://pantip.com',
    'priority': 'u=1, i',
    'ptauthorize': 'Basic dGVzdGVyOnRlc3Rlcg==',
    'referer': 'https://pantip.com/search?q=cat',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
}

def fetch_page(i, keyword, headers, timeout=30):
    """Request one page of search results for a keyword.

    Parameters
    ----------
    i : int
        Page number, sent as the ``page`` field of the JSON payload.
    keyword : str
        Search term, sent as the ``keyword`` field of the JSON payload.
    headers : dict
        HTTP headers forwarded to ``requests.post``.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.post`` (default 30).
        Without it a stalled connection would block the calling thread
        indefinitely.

    Returns
    -------
    tuple
        ``(i, payload)`` with the decoded JSON body when the server answers
        with status 200, otherwise ``(i, None)``.

    Notes
    -----
    Exceptions raised by ``requests.post`` (including timeouts) or by
    ``response.json()`` are not caught here; they propagate to the caller.
    """
    json_data = {
        'keyword': keyword,
        'page': i,
        'type': 'all',
        # NOTE(review): sent as the string 'true', not a boolean — confirm
        # this is what the endpoint expects.
        'show_btn_search': 'true',
        'room_search': None,
    }

    response = requests.post(
        'https://pantip.com/api/search-service/search/getresult',
        headers=headers,
        json=json_data,
        timeout=timeout,
    )

    if response.status_code == 200:
        return i, response.json()
    return i, None

# Fetch all pages in parallel with a ThreadPoolExecutor; the number of workers is configurable.
def fetch_all_pages(keyword, page_number, headers, num_workers, delay=0.1):
    """Fetch pages ``0..page_number`` of search results for one keyword.

    Every page is submitted to a thread pool as a ``fetch_page`` call and the
    results are collected as the calls finish, in completion order.

    Parameters
    ----------
    keyword : str
        Search term passed to ``fetch_page``.
    page_number : int
        Highest page index to request; ``page_number + 1`` pages are submitted.
    headers : dict
        HTTP headers passed to ``fetch_page``.
    num_workers : int
        Maximum number of worker threads in the pool.
    delay : float, optional
        Seconds to sleep after handling each finished page (default 0.1).
        All requests are submitted up front, so this paces the handling of
        results rather than the requests themselves.

    Returns
    -------
    dict
        Maps each successfully fetched page index to its decoded response.
        When at least one page was stored, the key ``'search_keyword'`` holds
        ``keyword``. Pages that failed or were cancelled are absent.
    """
    results_dict = {}

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(fetch_page, page, keyword, headers): page
            for page in range(page_number + 1)
        }

        for future in tqdm(as_completed(futures), total=len(futures)):
            # Look the page up from the future itself: when the call raised,
            # nothing can be unpacked from its result.
            page = futures[future]

            # Futures cancelled after an empty page are still yielded by
            # as_completed; there is nothing to collect from them.
            if future.cancelled():
                continue

            try:
                _, data = future.result()
            except Exception as e:
                print(f"Failed to process page {page}. Error: {e}")
            else:
                if data:
                    print(f"Page {page} processed successfully.")
                    results_dict[page] = data
                    # Add keyword as a key in the dictionary called 'search_keyword'
                    results_dict['search_keyword'] = keyword

                    if 'data' in data and not data['data']:
                        print(f"Page {page} has no data. Cancelling the pages after it.")
                        # Results arrive out of order, so leaving the loop here
                        # would drop lower pages that are still in flight.
                        # Instead, cancel only the higher pages that have not
                        # started yet and keep collecting the rest.
                        for pending, pending_page in futures.items():
                            if pending_page > page:
                                pending.cancel()
                else:
                    print(f"Failed to process page {page}.")

            # Short pause between handled results, to go easy on the server.
            time.sleep(delay)

    return results_dict

# Ingest

In [4]:
# Read the keyword table from CSV; its 'word' column is used in the next cell.
# NOTE(review): 'your_file.csv' looks like a placeholder — point it at the real file before running.
early_df = pd.read_csv('your_file.csv')

In [5]:
# Search terms: the 'word' column converted to a plain Python list.
keywords = early_df['word'].tolist()

In [6]:
# The keywords are kept in a list; each one is passed to fetch_all_pages below
# to collect its search results (from which the topic ids are taken).
keywords

['Early retire',
 'เกษียณก่อนอายุ',
 'เออรี่รีไทร์',
 'เกษียณอายุน้อย',
 'ลาออกจากงาน',
 'เกษียณก่อนวัย']

In [7]:
# Number of keywords to query.
len(keywords)

6

In [8]:
# First keyword, used for the first fetch below.
keywords[0]

'Early retire'

In [42]:
# Parameters
page_number = 37  # highest page index to request; pages 0..37 are submitted (38 requests)
num_workers = 10  # maximum number of worker threads

# Fetch all pages for the first keyword
results_1 = fetch_all_pages(keywords[0], page_number, headers, num_workers)

  0%|          | 0/38 [00:00<?, ?it/s]

Page 0 processed successfully.
Page 2 processed successfully.
Page 1 processed successfully.
Page 4 processed successfully.
Page 5 processed successfully.
Page 6 processed successfully.
Page 3 processed successfully.
Page 8 processed successfully.
Page 10 processed successfully.
Page 7 processed successfully.
Page 9 processed successfully.
Page 11 processed successfully.
Page 15 processed successfully.
Page 14 processed successfully.
Page 19 processed successfully.
Page 17 processed successfully.
Page 13 processed successfully.
Page 18 processed successfully.
Page 16 processed successfully.
Page 21 processed successfully.
Page 12 processed successfully.
Page 23 processed successfully.
Page 22 processed successfully.
Page 20 processed successfully.
Page 27 processed successfully.
Page 24 processed successfully.
Page 26 processed successfully.
Page 25 processed successfully.
Page 28 processed successfully.
Page 29 processed successfully.
Page 32 processed successfully.
Page 30 processed 

In [43]:
# Parameters
page_number = 26  # highest page index to request; pages 0..26 are submitted (27 requests)
num_workers = 10  # maximum number of worker threads


# Fetch all pages for the second keyword
results_2 = fetch_all_pages(keywords[1], page_number, headers, num_workers)

  0%|          | 0/27 [00:00<?, ?it/s]

Page 0 processed successfully.
Page 1 processed successfully.
Page 4 processed successfully.
Page 6 processed successfully.
Page 2 processed successfully.
Page 3 processed successfully.
Page 8 processed successfully.
Page 7 processed successfully.
Page 10 processed successfully.
Page 9 processed successfully.
Page 18 processed successfully.
Page 17 processed successfully.
Page 13 processed successfully.
Page 14 processed successfully.
Page 16 processed successfully.
Page 5 processed successfully.
Page 15 processed successfully.
Page 12 processed successfully.
Page 11 processed successfully.
Page 20 processed successfully.
Page 21 processed successfully.
Page 23 processed successfully.
Page 24 processed successfully.
Page 26 processed successfully.
Page 25 processed successfully.
Page 19 processed successfully.
Page 22 processed successfully.


In [44]:
# Parameters
page_number = 18  # highest page index to request; pages 0..18 are submitted (19 requests)
num_workers = 10  # maximum number of worker threads

# Fetch all pages for the third keyword
results_3 = fetch_all_pages(keywords[2], page_number, headers, num_workers)

  0%|          | 0/19 [00:00<?, ?it/s]

Page 0 processed successfully.
Page 1 processed successfully.
Page 8 processed successfully.
Page 4 processed successfully.
Page 7 processed successfully.
Page 5 processed successfully.
Page 3 processed successfully.
Page 6 processed successfully.
Page 2 processed successfully.
Page 9 processed successfully.
Page 11 processed successfully.
Page 12 processed successfully.
Page 10 processed successfully.
Page 13 processed successfully.
Page 14 processed successfully.
Page 15 processed successfully.
Page 17 processed successfully.
Page 18 processed successfully.
Page 16 processed successfully.


In [35]:
# Parameters
page_number = 4  # highest page index to request; pages 0..4 are submitted (5 requests)
num_workers = 10  # maximum number of worker threads

# Fetch all pages for the fourth keyword
results_4 = fetch_all_pages(keywords[3], page_number, headers, num_workers)

  0%|          | 0/5 [00:00<?, ?it/s]

Page 0 processed successfully.
Page 1 processed successfully.
Page 4 processed successfully.
Page 2 processed successfully.
Page 3 processed successfully.


In [13]:
# Parameters — set in this cell so the fetch does not depend on whatever value
# another cell left behind. 1000 matches the recorded run of this cell
# (progress-bar total 1001 = page_number + 1); num_workers follows the other cells.
page_number = 1000
num_workers = 10

# Fetch all pages for the fifth keyword
results_5 = fetch_all_pages(keywords[4], page_number, headers, num_workers)

  0%|          | 0/1001 [00:00<?, ?it/s]

Page 0 processed successfully.
Page 4 processed successfully.
Page 6 processed successfully.
Page 1 processed successfully.
Page 7 processed successfully.
Page 2 processed successfully.
Page 8 processed successfully.
Page 5 processed successfully.
Page 10 processed successfully.
Page 9 processed successfully.
Page 3 processed successfully.
Page 16 processed successfully.
Page 14 processed successfully.
Page 13 processed successfully.
Page 11 processed successfully.
Page 20 processed successfully.
Page 12 processed successfully.
Page 17 processed successfully.
Page 18 processed successfully.
Page 19 processed successfully.
Page 15 processed successfully.
Page 21 processed successfully.
Page 23 processed successfully.
Page 25 processed successfully.
Page 29 processed successfully.
Page 28 processed successfully.
Page 27 processed successfully.
Page 30 processed successfully.
Page 22 processed successfully.
Page 26 processed successfully.
Page 24 processed successfully.
Page 32 processed 

In [37]:
# Parameters
page_number = 15  # highest page index to request; pages 0..15 are submitted (16 requests)
num_workers = 10  # maximum number of worker threads

# Fetch all pages for the sixth keyword
results_6 = fetch_all_pages(keywords[5], page_number, headers, num_workers)

  0%|          | 0/16 [00:00<?, ?it/s]

Page 0 processed successfully.
Page 3 processed successfully.
Page 1 processed successfully.
Page 10 processed successfully.
Page 6 processed successfully.
Page 5 processed successfully.
Page 4 processed successfully.
Page 9 processed successfully.
Page 8 processed successfully.
Page 7 processed successfully.
Page 2 processed successfully.
Page 11 processed successfully.
Page 14 processed successfully.
Page 15 processed successfully.
Page 12 processed successfully.
Page 13 processed successfully.


In [55]:
# Collect the six per-keyword result dicts in one list, in the same order as `keywords`.
all_dict_1 = [results_1, results_2, results_3, results_4, results_5, results_6]

In [56]:
# Combine all the results into a single dictionary keyed by each result's
# position in all_dict_1 (0 for the first keyword, 1 for the second, ...).
# dict(enumerate(...)) builds the same mapping as an index loop without
# leaving loop variables behind in the notebook namespace.
combine_dict_1 = dict(enumerate(all_dict_1))

In [58]:
import pickle

# Destination file for the combined results.
save_path = 'your_path_to_save' # Don't forget to change the name of the file

# Serialize the combined dictionary in binary mode; the `with` block closes the file.
# NOTE: only load this file back with pickle if it comes from a trusted source —
# unpickling can execute arbitrary code.
with open(save_path, 'wb') as f:
    pickle.dump(combine_dict_1, f)