In [5]:
# 程序功能: 按关键字爬取微博清单
import os
import re  # 正则表达式提取文本
from jsonpath import jsonpath  # 解析json数据
import requests  # 发送请求
import pandas as pd  # 存取csv文件
import datetime  # 转换时间用

# 请求头
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Mobile Safari/537.36",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
}


def trans_time(v_str):
    """转换GMT时间为标准格式"""
    GMT_FORMAT = '%a %b %d %H:%M:%S +0800 %Y'
    timeArray = datetime.datetime.strptime(v_str, GMT_FORMAT)
    ret_time = timeArray.strftime("%Y-%m-%d %H:%M:%S")
    return ret_time


def getLongText(v_id):
    """爬取长微博全文"""
    url = 'https://m.weibo.cn/statuses/extend?id=' + str(v_id)
    r = requests.get(url, headers=headers)
    json_data = r.json()
    long_text = json_data['data']['longTextContent']
    # 微博内容-正则表达式数据清洗
    dr = re.compile(r'<[^>]+>', re.S)
    long_text2 = dr.sub('', long_text)
    # print(long_text2)
    return long_text2


def get_weibo_list(v_keyword, v_max_page):
    """
    爬取微博内容列表
    :param v_keyword: 搜索关键字
    :param v_max_page: 爬取前几页
    :return: None
    """
    for page in range(2, v_max_page + 1):
        print('===开始爬取第{}页微博==='.format(page))
        # 请求地址
        url = 'https://m.weibo.cn/api/container/getIndex'
        # 请求参数
        params = {
            "containerid": "100103type=1&q={}".format(v_keyword),
            "page_type": "searchall",
            "page": page
        }
        # 发送请求
        r = requests.get(url, headers=headers, params=params)
        print(r.status_code)
        # pprint(r.json())
        # 解析json数据
        cards = r.json()["data"]["cards"]
        print(len(cards))
        region_name_list = []
        status_city_list = []
        status_province_list = []
        status_country_list = []
        for card in cards:
            # 发布于
            try:
                region_name = card['card_group'][0]['mblog']['region_name']
                region_name_list.append(region_name)
            except:
                region_name_list.append('')
            # ip属地_城市
            try:
                status_city = card['card_group'][0]['mblog']['status_city']
                status_city_list.append(status_city)
            except:
                status_city_list.append('')
            # ip属地_省份
            try:
                status_province = card['card_group'][0]['mblog']['status_province']
                status_province_list.append(status_province)
            except:
                status_province_list.append('')
            #ip属地_国家
            try:
                status_country = card['card_group'][0]['mblog']['status_country']
                status_country_list.append(status_country)
            except:
                status_country_list.append('')
        # 微博内容
        text_list = jsonpath(cards, '$..mblog.text')
        # 微博内容-正则表达式数据清洗
        dr = re.compile(r'<[^>]+>', re.S)
        text2_list = []
        print('text_list is:')
        # print(text_list)
        if not text_list:  # 如果未获取到微博内容，进入下一轮循环
            continue
        if type(text_list) == list and len(text_list) > 0:
            for text in text_list:
                text2 = dr.sub('', text)  # 正则表达式提取微博内容
                # print(text2)
                text2_list.append(text2)
        # 微博创建时间
        time_list = jsonpath(cards, '$..mblog.created_at')
        time_list = [trans_time(v_str=i) for i in time_list]
        # 微博作者
        author_list = jsonpath(cards, '$..mblog.user.screen_name')
        # 微博id
        id_list = jsonpath(cards, '$..mblog.id')
        # 判断是否存在全文
        isLongText_list = jsonpath(cards, '$..mblog.isLongText')
        idx = 0
        for i in isLongText_list:
            if i == True:
                long_text = getLongText(v_id=id_list[idx])
                text2_list[idx] = long_text
            idx += 1
        # 转发数
        reposts_count_list = jsonpath(cards, '$..mblog.reposts_count')
        # 评论数
        comments_count_list = jsonpath(cards, '$..mblog.comments_count')
        # 点赞数
        attitudes_count_list = jsonpath(cards, '$..mblog.attitudes_count')
        # 把列表数据保存成DataFrame数据
        print('id_list:',len(id_list))
        print(len(time_list))
        print('region_name_list:',len(region_name_list))
        print(len(status_city_list))
        print(len(status_province_list))
        print(len(status_country_list))

        df = pd.DataFrame(
            {
                '页码': [page] * len(id_list),
                '微博id': id_list,
                '微博作者': author_list,
                '发布时间': time_list,
                '微博内容': text2_list,
                '转发数': reposts_count_list,
                '评论数': comments_count_list,
                '点赞数': attitudes_count_list,
                '发布于': region_name_list,
                'ip属地_城市': status_city_list,
                'ip属地_省份': status_province_list,
                'ip属地_国家': status_country_list,
            }
        )
        # 表头
        if os.path.exists(v_weibo_file):
            header = None
        else:
            header = ['页码', '微博id', '微博作者', '发布时间', '微博内容', '转发数', '评论数', '点赞数', '发布于','ip属地_城市','ip属地_省份','ip属地_国家']  # csv文件头
        # 保存到csv文件
        df.to_csv(v_weibo_file, mode='a+', index=False, header=header, encoding='utf_8_sig')
        print('csv保存成功:{}'.format(v_weibo_file))


if __name__ == '__main__':
    # 爬取前几页
    max_search_page = 1000  # 爬前n页
    # 爬取关键字
    search_keyword = '消费降级'
    # 保存文件名
    v_weibo_file = '微博清单_{}_前{}页.csv'.format(search_keyword, max_search_page)
    # 如果csv文件存在，先删除之
    if os.path.exists(v_weibo_file):
        os.remove(v_weibo_file)
        print('微博清单存在，已删除: {}'.format(v_weibo_file))
    # 调用爬取微博函数
    get_weibo_list(v_keyword=search_keyword, v_max_page=max_search_page)
    # 数据清洗-去重
    df = pd.read_csv(v_weibo_file)
    # 删除重复数据
    df.drop_duplicates(subset=['微博id'], inplace=True, keep='first')
    # 再次保存csv文件
    df.to_csv(v_weibo_file, index=False, encoding='utf_8_sig')
    print('数据清洗完成')


===开始爬取第2页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第3页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第4页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第5页微博===
200
9
text_list is:
id_list: 9
9
region_name_list: 9
9
9
9
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第6页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第7页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第8页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第9页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第10页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取

200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第77页微博===
200
9
text_list is:
id_list: 9
9
region_name_list: 9
9
9
9
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第78页微博===
200
10
text_list is:
id_list: 10
10
region_name_list: 10
10
10
10
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第79页微博===
200
6
text_list is:
id_list: 6
6
region_name_list: 6
6
6
6
csv保存成功:微博清单_消费降级_前1000页.csv
===开始爬取第80页微博===
200
0
text_list is:
===开始爬取第81页微博===
200
0
text_list is:
===开始爬取第82页微博===
200
0
text_list is:
===开始爬取第83页微博===
200
0
text_list is:
===开始爬取第84页微博===
200
0
text_list is:
===开始爬取第85页微博===
200
0
text_list is:
===开始爬取第86页微博===
200
0
text_list is:
===开始爬取第87页微博===
200
0
text_list is:
===开始爬取第88页微博===
200
0
text_list is:
===开始爬取第89页微博===
200
0
text_list is:
===开始爬取第90页微博===
200
0
text_list is:
===开始爬取第91页微博===
200
0
text_list is:
===开始爬取第92页微博===
200
0
text_list is:
===开始爬取第93页微博===
200
0
text_list is:
===开始爬取第94页微博===
200
0
text_list is:
===开始爬取第95页微博===
200
0
text_

200
0
text_list is:
===开始爬取第286页微博===
200
0
text_list is:
===开始爬取第287页微博===
200
0
text_list is:
===开始爬取第288页微博===
200
0
text_list is:
===开始爬取第289页微博===
200
0
text_list is:
===开始爬取第290页微博===
200
0
text_list is:
===开始爬取第291页微博===
200
0
text_list is:
===开始爬取第292页微博===
200
0
text_list is:
===开始爬取第293页微博===
200
0
text_list is:
===开始爬取第294页微博===
200
0
text_list is:
===开始爬取第295页微博===
200
0
text_list is:
===开始爬取第296页微博===
200
0
text_list is:
===开始爬取第297页微博===
200
0
text_list is:
===开始爬取第298页微博===
200
0
text_list is:
===开始爬取第299页微博===
200
0
text_list is:
===开始爬取第300页微博===
200
0
text_list is:
===开始爬取第301页微博===
200
0
text_list is:
===开始爬取第302页微博===
200
0
text_list is:
===开始爬取第303页微博===
200
0
text_list is:
===开始爬取第304页微博===
200
0
text_list is:
===开始爬取第305页微博===
200
0
text_list is:
===开始爬取第306页微博===
200
0
text_list is:
===开始爬取第307页微博===
200
0
text_list is:
===开始爬取第308页微博===
200
0
text_list is:
===开始爬取第309页微博===
200
0
text_list is:
===开始爬取第310页微博===
200
0
text_list is:
===开始爬取第311页微博===
200
0
text_l

200
0
text_list is:
===开始爬取第502页微博===
200
0
text_list is:
===开始爬取第503页微博===
200
0
text_list is:
===开始爬取第504页微博===
200
0
text_list is:
===开始爬取第505页微博===
200
0
text_list is:
===开始爬取第506页微博===
200
0
text_list is:
===开始爬取第507页微博===
200
0
text_list is:
===开始爬取第508页微博===
200
0
text_list is:
===开始爬取第509页微博===
200
0
text_list is:
===开始爬取第510页微博===
200
0
text_list is:
===开始爬取第511页微博===
200
0
text_list is:
===开始爬取第512页微博===
200
0
text_list is:
===开始爬取第513页微博===
200
0
text_list is:
===开始爬取第514页微博===
200
0
text_list is:
===开始爬取第515页微博===
200
0
text_list is:
===开始爬取第516页微博===
200
0
text_list is:
===开始爬取第517页微博===
200
0
text_list is:
===开始爬取第518页微博===
200
0
text_list is:
===开始爬取第519页微博===
200
0
text_list is:
===开始爬取第520页微博===
200
0
text_list is:
===开始爬取第521页微博===
200
0
text_list is:
===开始爬取第522页微博===
200
0
text_list is:
===开始爬取第523页微博===
200
0
text_list is:
===开始爬取第524页微博===
200
0
text_list is:
===开始爬取第525页微博===
200
0
text_list is:
===开始爬取第526页微博===
200
0
text_list is:
===开始爬取第527页微博===
200
0
text_l

200
0
text_list is:
===开始爬取第718页微博===
200
0
text_list is:
===开始爬取第719页微博===
200
0
text_list is:
===开始爬取第720页微博===
200
0
text_list is:
===开始爬取第721页微博===
200
0
text_list is:
===开始爬取第722页微博===
200
0
text_list is:
===开始爬取第723页微博===
200
0
text_list is:
===开始爬取第724页微博===
200
0
text_list is:
===开始爬取第725页微博===
200
0
text_list is:
===开始爬取第726页微博===
200
0
text_list is:
===开始爬取第727页微博===
200
0
text_list is:
===开始爬取第728页微博===
200
0
text_list is:
===开始爬取第729页微博===
200
0
text_list is:
===开始爬取第730页微博===
200
0
text_list is:
===开始爬取第731页微博===
200
0
text_list is:
===开始爬取第732页微博===
200
0
text_list is:
===开始爬取第733页微博===
200
0
text_list is:
===开始爬取第734页微博===
200
0
text_list is:
===开始爬取第735页微博===
200
0
text_list is:
===开始爬取第736页微博===
200
0
text_list is:
===开始爬取第737页微博===
200
0
text_list is:
===开始爬取第738页微博===
200
0
text_list is:
===开始爬取第739页微博===
200
0
text_list is:
===开始爬取第740页微博===
200
0
text_list is:
===开始爬取第741页微博===
200
0
text_list is:
===开始爬取第742页微博===
200
0
text_list is:
===开始爬取第743页微博===
200
0
text_l

200
0
text_list is:
===开始爬取第935页微博===
200
0
text_list is:
===开始爬取第936页微博===
200
0
text_list is:
===开始爬取第937页微博===
200
0
text_list is:
===开始爬取第938页微博===
200
0
text_list is:
===开始爬取第939页微博===
200
0
text_list is:
===开始爬取第940页微博===
200
0
text_list is:
===开始爬取第941页微博===
200
0
text_list is:
===开始爬取第942页微博===
200
0
text_list is:
===开始爬取第943页微博===
200
0
text_list is:
===开始爬取第944页微博===
200
0
text_list is:
===开始爬取第945页微博===
200
0
text_list is:
===开始爬取第946页微博===
200
0
text_list is:
===开始爬取第947页微博===
200
0
text_list is:
===开始爬取第948页微博===
200
0
text_list is:
===开始爬取第949页微博===
200
0
text_list is:
===开始爬取第950页微博===
200
0
text_list is:
===开始爬取第951页微博===
200
0
text_list is:
===开始爬取第952页微博===
200
0
text_list is:
===开始爬取第953页微博===
200
0
text_list is:
===开始爬取第954页微博===
200
0
text_list is:
===开始爬取第955页微博===
200
0
text_list is:
===开始爬取第956页微博===
200
0
text_list is:
===开始爬取第957页微博===
200
0
text_list is:
===开始爬取第958页微博===
200
0
text_list is:
===开始爬取第959页微博===
200
0
text_list is:
===开始爬取第960页微博===
200
0
text_l