In [1]:
import joblib
import os
import re
from bs4 import BeautifulSoup
from threading import Lock
import pandas as pd
from time import sleep
import requests
import json
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

# Shared lock so that the worker threads do not interleave their progress output.
print_lock = Lock()

In [2]:
def split_series(series, n):
    """Cut ``series`` into ``n`` consecutive slices of near-equal length.

    The first ``len(series) % n`` slices receive one extra element, so slice
    lengths differ by at most one and the original order is preserved.

    Args:
        series: Any object supporting ``len()`` and slicing.
        n: Number of slices to produce.

    Returns:
        A list holding the ``n`` slices, in order.
    """
    base_size, leftover = divmod(len(series), n)

    # Length of every slice: the leading `leftover` slices are one longer.
    sizes = [base_size + 1 if idx < leftover else base_size for idx in range(n)]

    chunks = []
    offset = 0
    for size in sizes:
        chunks.append(series[offset:offset + size])
        offset += size

    return chunks


def _fetch_weibo_page(url, header, max_tries=3, retry_wait=20, timeout=30):
    """Download one searchProfile result page and return its posts.

    Args:
        url: Fully built request URL for a single result page.
        header: Mapping passed to ``requests.get`` as the HTTP headers.
        max_tries: Number of connection attempts before giving up.
        retry_wait: Seconds to sleep after each failed attempt.
        timeout: Seconds ``requests`` may wait on the server, so that a
            stalled connection cannot block its worker thread forever.

    Returns:
        A list with one dict per post, keyed by '用户名' (screen name),
        '发表时间' (creation time string) and '发表内容' (raw text with runs
        of three zero-width spaces removed).

    Raises:
        ValueError: If every attempt failed, the status code is not 200, or
            the body is not valid JSON.
        KeyError: If the JSON payload lacks the expected fields.
    """
    response = None
    for _ in range(max_tries):
        try:
            response = requests.get(url, headers=header, timeout=timeout)
            break
        except requests.RequestException:
            # Connection problems and timeouts are treated as transient.
            sleep(retry_wait)

    # `response is None` means no attempt succeeded; never fall through and
    # re-parse a response left over from an earlier page.
    if response is None:
        raise ValueError("....")

    if response.status_code != 200:
        raise ValueError("....")

    data = json.loads(response.content.decode('utf-8'))

    return [{'用户名': blog['user']['screen_name'],
             '发表时间': blog['created_at'],
             '发表内容': blog['text_raw'].replace('\u200b\u200b\u200b', '')}
            for blog in data['data']['list']]


def scrape_data(uid, time_range, header):
    """Collect all posts of one user for every day in ``time_range``.

    Each day is requested page by page. A day either contributes all of its
    pages to the global ``weibo`` list or, if any request or parsing step
    fails, contributes nothing and is recorded in the global ``failed`` list.
    After every day the global ``finished`` counter is increased and a
    one-line progress report is printed.

    Args:
        uid: Weibo user id placed in the ``uid`` query parameter.
        time_range: Iterable of objects exposing ``.timestamp()``, one per day.
        header: HTTP headers used for every request made by this call.

    Returns:
        None. Results are delivered through the globals described above.
    """
    global weibo
    global failed
    global tasklength
    global finished

    constraint = '&hasori=1&hasret=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1'
    # Another page is requested only while the last one held exactly this many posts.
    full_page = 20

    for time_ in time_range:
        try:
            # NOTE(review): the window is [timestamp, timestamp + 86400]; for naive
            # pandas Timestamps `.timestamp()` is presumably UTC-based - confirm
            # that this matches the intended calendar day.
            day_start = int(time_.timestamp())
            temp = []
            page = 1
            while True:
                url = f'https://weibo.com/ajax/statuses/searchProfile?uid={uid}&page={page}&starttime={day_start}&endtime={day_start + 86400}{constraint}'
                this_page = _fetch_weibo_page(url, header)
                temp += this_page
                if len(this_page) != full_page:
                    break
                page += 1
                sleep(5)  # pause between consecutive pages of the same day

        except Exception:
            # Best effort: remember the day and carry on with the next one.
            # The shared counter is updated under the lock because several
            # worker threads run this function concurrently.
            with print_lock:
                finished += 1
                failed.append(time_)
                print(f'\033[92mFinished\033[0m : {finished}/{tasklength} \033[91mFailed\033[0m : {len(failed)}', end='\r')
            sleep(10)

        else:
            with print_lock:
                weibo += temp
                finished += 1
                print(f'\033[92mFinished\033[0m : {finished}/{tasklength} \033[91mFailed\033[0m : {len(failed)}', end='\r')
            sleep(5)

    return None


def get_weibo_parallel(uid, splitted, headers_list):
    """Run ``scrape_data`` in parallel, one worker thread per header set.

    Worker ``i`` receives ``splitted[i]`` together with ``headers_list[i]``.
    The call blocks until every worker has finished, re-raises any exception
    a worker let escape, and finally prints the closing progress line.

    Args:
        uid: Weibo user id forwarded to every worker.
        splitted: Indexable collection of per-worker day ranges.
        headers_list: List of HTTP header sets; its length fixes the number
            of worker threads.

    Returns:
        None.
    """
    global weibo, failed, finished, tasklength

    worker_total = len(headers_list)

    with ThreadPoolExecutor(max_workers=worker_total) as pool:
        pending = []
        for idx in range(worker_total):
            job = pool.submit(scrape_data, uid, splitted[idx], headers_list[idx])
            pending.append(job)

        # result() surfaces exceptions raised inside a worker, if any.
        for done in as_completed(pending):
            done.result()

    summary = f'\033[92mFinished\033[0m : {finished}/{tasklength} \033[91mFailed\033[0m : {len(failed)}'
    print(summary)

    return None

新浪微博搜索用户微博的源url为：
    
    https://weibo.com/ajax/statuses/searchProfile
    
query 参数如下（starttime 和 endtime 需为以秒为单位的 Unix 时间戳）：

```url
?uid={uid}&page=1&starttime={starttime}&endtime={endtime}&hasori=1&hasret=1&hastext=1&haspic=1&hasvideo=1&hasmusic=1
```
---

新浪微博搜索关键词返回的html在这个url下：

    https://s.weibo.com/weibo
    
query参数有：

```url
?q={关键词}&category=4&suball=1&timescope=custom%3A{2024-01-31}%3A{2024-01-31}&Refer=g&page=1
```

**账号 UID**


- 东方财富网 ： 1801487174

- 财联社APP ： 2868676035

## 单个用户

In [3]:
# Load every saved request-header object found in ./headers/.
# get_weibo_parallel() starts one worker thread per entry of this list.
headers = []

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code - keep only trusted files in this directory.
# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# header-to-partition assignment stable between runs.
for file in sorted(os.listdir('./headers/')):
    header_path = os.path.join('./headers/', file)
    if not os.path.isfile(header_path):
        # Skip sub-directories (e.g. a .ipynb_checkpoints folder).
        continue
    headers.append(joblib.load(header_path))

In [4]:
# Shared accumulators that scrape_data() updates through `global`:
# `weibo` receives one dict per post, `failed` the days that could not be fetched.
weibo = []
failed = []

# Target account and the date range to collect (both end dates are included).
UID = 2813700891
user_name = '微博基金'
begin_time = '2024-01-21'
end_time = '2024-01-31'
# One entry per calendar day; every day is one scraping task.
full_time_range = pd.date_range(begin_time, end_time)

# Give each loaded header set its own consecutive slice of days.
splitted = split_series(full_time_range, len(headers))

# Progress counters shown in the status line printed by the worker threads.
tasklength = len(full_time_range)
finished = 0

# collecting
start = datetime.now()
get_weibo_parallel(UID, splitted, headers)
end = datetime.now()
# Elapsed wall-clock time, followed by 60 spaces of padding - presumably to
# blank out leftovers of the '\r'-terminated progress line.
print(end-start, ' '.ljust(60))

# Turn the list of post dicts into a table.
# Note: `weibo` changes type here (list -> DataFrame).
weibo = pd.DataFrame(weibo)
# Disabled: parse the creation time, sort by it and save to ./data/<user_name>.csv.
# weibo['发表时间'] = pd.to_datetime(weibo['发表时间'], format="%a %b %d %H:%M:%S %z %Y")
# weibo.sort_values(by='发表时间').to_csv(f'./data/{user_name}.csv', index=False)

[92mFinished[0m : 11/11 [91mFailed[0m : 0
0:00:44.524156                                                             


In [5]:
# Show the collected posts; as the cell's last expression the DataFrame gets its rich display.
weibo

Unnamed: 0,用户名,发表时间,发表内容
0,微博基金,Sun Jan 28 16:30:17 +0800 2024,#基金##基金[超话]# 大消息来了\n\nhttp://t.cn/A6j9N6N9
1,微博基金,Mon Jan 22 19:38:25 +0800 2024,#周小平 A股不涨比涨要安全##基金[超话]##基金# 你觉得不涨是安全的么？具体详见->h...
2,微博基金,Mon Jan 22 19:31:16 +0800 2024,[浪]基金大赛如火如荼进行中，萎靡市场下，仍有牛人收益超13%。你觉得你的收益能跑赢他们么？...
3,微博基金,Mon Jan 22 19:14:38 +0800 2024,此雪球非彼雪球//@北漂民工的日常:#申万回应江疏影雪球爆仓来到公司# 已经记不清这是雪球第...
4,微博基金,Mon Jan 22 18:51:36 +0800 2024,你好，基民们[泪][泪][泪]
...,...,...,...
104,微博基金,Fri Jan 26 14:00:14 +0800 2024,#基金##a股# 大盘2900问题不大了，现在到了加仓的时候了么？
105,微博基金,Fri Jan 26 13:34:12 +0800 2024,#A股##基金# 【#基金两年亏掉1.7万亿#，谁来为基民负责？】国内有100多家公募基金公...
106,微博基金,Fri Jan 26 12:00:59 +0800 2024,#基金# 微博基金今日实时热搜榜（截至20240126 12:00） 点击查看详情>> ht...
107,微博基金,Fri Jan 26 10:01:27 +0800 2024,#基金#【银行板块异动拉升 #重庆银行涨停#】 银行板块异动拉升，$重庆银行 sh60196...
