<a href="https://colab.research.google.com/github/NikkiYng/Ex1/blob/main/Scrape_weibo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Scrape tweets from Weibo**
Adapted from https://github.com/dataabc/weibo-search/tree/master/weibo

In [None]:
from google.colab import files
import pandas as pd
# Clone the github repository
! git clone https://github.com/dataabc/weibo-search.git
! pip install scrapy
%cd weibo-search
! pip install -r requirements.txt
import scrapy
import warnings
warnings.filterwarnings("ignore", category=scrapy.exceptions.ScrapyDeprecationWarning)

Cloning into 'weibo-search'...
remote: Enumerating objects: 505, done.[K
remote: Counting objects: 100% (186/186), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 505 (delta 164), reused 159 (delta 159), pack-reused 319[K
Receiving objects: 100% (505/505), 82.11 KiB | 1.71 MiB/s, done.
Resolving deltas: 100% (291/291), done.
/content/weibo-search/weibo-search/weibo-search/weibo-search


In [None]:
# Check the directory: search the current working directory for the project's
# settings.py and items.py to confirm where the files to be edited are located.
!find . -name settings.py
!find . -name items.py

./weibo/settings.py
./weibo/items.py


In [None]:
# Update the item list
%%writefile ./weibo/items.py

import scrapy

class WeiboItem(scrapy.Item):
    # define the fields for your item here like:
    id = scrapy.Field()
    bid = scrapy.Field()
    user_id = scrapy.Field()
    screen_name = scrapy.Field()
    text = scrapy.Field()
    article_url = scrapy.Field()
    location = scrapy.Field()
    at_users = scrapy.Field()
    topics = scrapy.Field()
    reposts_count = scrapy.Field()
    comments_count = scrapy.Field()
    attitudes_count = scrapy.Field()
    created_at = scrapy.Field()
    source = scrapy.Field()
    pics = scrapy.Field()
    video_url = scrapy.Field()
    retweet_id = scrapy.Field()
    ip = scrapy.Field()

Overwriting ./weibo/items.py


In [None]:
# Update pipelines
%%writefile ./weibo/pipelines.py
# -*- coding: utf-8 -*-
import copy
import csv
import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

# Project settings, loaded once when this module is imported. MongoPipeline and
# MysqlPipeline read their connection parameters from this object.
settings = get_project_settings()

class CsvPipeline(object):
    """Append every scraped post to ``结果文件/<keyword>/<keyword>.csv``.

    The file is created (with a header row) the first time a post for a
    keyword is written; later posts are appended. Paths are relative to the
    current working directory of the crawl.
    """

    # (item field, CSV column title) pairs in output order. Header and data
    # rows are both derived from this single table, so a value can never end
    # up under the wrong column regardless of the order in which the spider
    # filled in the item.
    _COLUMNS = (
        ('id', 'id'),
        ('bid', 'bid'),
        ('user_id', 'user_id'),
        ('screen_name', '用户昵称'),
        ('text', '微博正文'),
        ('article_url', '头条文章url'),
        ('location', '发布位置'),
        ('at_users', '艾特用户'),
        ('topics', '话题'),
        ('reposts_count', '转发数'),
        ('comments_count', '评论数'),
        ('attitudes_count', '点赞数'),
        ('created_at', '发布时间'),
        ('source', '发布工具'),
        ('pics', '微博图片url'),
        ('video_url', '微博视频url'),
        ('retweet_id', 'retweet_id'),
        ('ip', 'ip'),
    )

    def process_item(self, item, spider):
        """Write ``item['weibo']`` as one CSV row and pass the item on.

        Args:
            item: mapping with a ``'keyword'`` entry (used for the directory
                and file name) and a ``'weibo'`` entry holding the post.
            spider: the running spider (unused).

        Returns:
            The unchanged ``item`` so that later pipelines can process it.
        """
        # Nothing to write for an empty item; check before touching its keys.
        if not item:
            return item

        keyword = item['keyword']
        base_dir = os.path.join('结果文件', keyword)
        os.makedirs(base_dir, exist_ok=True)
        file_path = os.path.join(base_dir, keyword + '.csv')

        # Print the absolute path of the file
        print(f"Saving CSV file to: {os.path.abspath(file_path)}")

        # The header is only needed when this call creates the file.
        is_first_write = not os.path.isfile(file_path)
        weibo = item['weibo']
        # utf-8-sig adds a BOM so spreadsheet tools detect the encoding.
        with open(file_path, 'a', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            if is_first_write:
                writer.writerow([title for _, title in self._COLUMNS])
            # Fields that were never set are written as empty cells.
            writer.writerow(
                [weibo.get(field, '') for field, _ in self._COLUMNS])
        return item


class MyImagesPipeline(ImagesPipeline):
    """Download the pictures attached to a post.

    Files are stored as ``结果文件/<keyword>/images/<weibo id>[-<n>]<suffix>``
    (relative to the images store); the ``-<n>`` index is only added when the
    post does not have exactly one picture.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per picture URL in ``item['weibo']['pics']``.

        Each request carries the item and a file-name suffix (``sign``) in its
        ``meta`` so that ``file_path`` can build the target name.
        """
        pics = item['weibo']['pics']
        if len(pics) == 1:
            # A single picture needs no index in its file name.
            yield scrapy.Request(pics[0], meta={'item': item, 'sign': ''})
        else:
            for index, image_url in enumerate(pics):
                yield scrapy.Request(image_url,
                                     meta={
                                         'item': item,
                                         'sign': '-' + str(index)
                                     })

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the picture fetched by ``request``.

        ``item`` is accepted because current Scrapy versions pass it as a
        keyword-only argument to ``file_path``; the item stored in
        ``request.meta`` is what is actually used, so behaviour is the same
        whether or not Scrapy supplies it.
        """
        image_url = request.url
        weibo_item = request.meta['item']
        sign = request.meta['sign']
        base_dir = os.path.join('结果文件', weibo_item['keyword'], 'images')
        os.makedirs(base_dir, exist_ok=True)
        # Everything from the last '.' of the URL is reused as the extension.
        image_suffix = image_url[image_url.rfind('.'):]
        return os.path.join(base_dir,
                            weibo_item['weibo']['id'] + sign + image_suffix)


class MyVideoPipeline(FilesPipeline):
    """Download the video attached to a post, if any.

    Files are stored as ``结果文件/<keyword>/videos/<weibo id>.mp4`` (relative
    to the files store).
    """

    def get_media_requests(self, item, info):
        """Yield a download request when ``item['weibo']['video_url']`` is set."""
        video_url = item['weibo']['video_url']
        if video_url:
            # The item travels in meta so file_path can name the file.
            yield scrapy.Request(video_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the video fetched by ``request``.

        ``item`` is accepted because current Scrapy versions pass it as a
        keyword-only argument to ``file_path``; the item stored in
        ``request.meta`` is what is actually used.
        """
        weibo_item = request.meta['item']
        base_dir = os.path.join('结果文件', weibo_item['keyword'], 'videos')
        os.makedirs(base_dir, exist_ok=True)
        return os.path.join(base_dir, weibo_item['weibo']['id'] + '.mp4')


class MongoPipeline(object):
    """Upsert scraped posts into the MongoDB collection ``weibo.weibo``.

    Problems are reported through flags on the spider instead of exceptions:
    ``spider.pymongo_error`` when pymongo is not installed and
    ``spider.mongo_error`` when the server cannot be reached.
    """

    def open_spider(self, spider):
        """Connect to the server named by the ``MONGO_URI`` setting."""
        try:
            from pymongo import MongoClient
            self.client = MongoClient(settings.get('MONGO_URI'))
            self.db = self.client['weibo']
            self.collection = self.db['weibo']
        except ModuleNotFoundError:
            spider.pymongo_error = True

    def process_item(self, item, spider):
        """Insert the post, or update it when its ``id`` is already stored.

        Returns:
            The unchanged ``item``. Returning it is required so that pipelines
            running after this one still receive the item.
        """
        try:
            from pymongo.errors import ServerSelectionTimeoutError
        except ImportError:
            # pymongo is missing: flag it and let the item continue.
            spider.pymongo_error = True
            return item

        collection = getattr(self, 'collection', None)
        if collection is None:
            # open_spider never established a connection.
            return item

        # Work on a private copy: insert_one may add keys to the document it
        # is given, and the item must reach later pipelines untouched.
        weibo = dict(copy.deepcopy(item)['weibo'])
        try:
            if not collection.find_one({'id': weibo['id']}):
                collection.insert_one(weibo)
            else:
                collection.update_one({'id': weibo['id']}, {'$set': weibo})
        except ServerSelectionTimeoutError:
            spider.mongo_error = True
        return item

    def close_spider(self, spider):
        """Close the client if one was opened."""
        try:
            self.client.close()
        except AttributeError:
            # open_spider failed before a client was created.
            pass


class MysqlPipeline(object):
    """Store scraped posts in the MySQL table ``weibo``.

    Connection parameters come from the ``MYSQL_*`` settings. Problems while
    connecting are reported through flags on the spider
    (``spider.pymysql_error`` / ``spider.mysql_error``) instead of exceptions.
    """

    def create_database(self, mysql_config):
        """Create the MySQL database if it does not exist yet.

        Args:
            mysql_config: keyword arguments for ``pymysql.connect`` (without
                a database selected).
        """
        import pymysql
        sql = """CREATE DATABASE IF NOT EXISTS %s DEFAULT
            CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci""" % settings.get(
            'MYSQL_DATABASE', 'weibo')
        db = pymysql.connect(**mysql_config)
        try:
            cursor = db.cursor()
            cursor.execute(sql)
        finally:
            # Release the bootstrap connection even when the statement fails.
            db.close()

    def create_table(self):
        """Create the ``weibo`` table if it does not exist yet.

        The column list has to contain every key of the stored post, because
        ``process_item`` builds its INSERT from the post's keys. Note that an
        already existing table is left as it is.
        """
        sql = """
                CREATE TABLE IF NOT EXISTS weibo (
                id varchar(20) NOT NULL,
                bid varchar(12) NOT NULL,
                user_id varchar(20),
                screen_name varchar(30),
                text varchar(2000),
                article_url varchar(100),
                topics varchar(200),
                at_users varchar(1000),
                pics varchar(3000),
                video_url varchar(1000),
                location varchar(100),
                created_at DATETIME,
                source varchar(30),
                attitudes_count INT,
                comments_count INT,
                reposts_count INT,
                retweet_id varchar(20),
                ip varchar(100),
                PRIMARY KEY (id)
                ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4"""
        self.cursor.execute(sql)

    def open_spider(self, spider):
        """Connect to MySQL and make sure database and table exist."""
        try:
            import pymysql
            mysql_config = {
                'host': settings.get('MYSQL_HOST', 'localhost'),
                'port': settings.get('MYSQL_PORT', 3306),
                'user': settings.get('MYSQL_USER', 'root'),
                'password': settings.get('MYSQL_PASSWORD', '123456'),
                'charset': 'utf8mb4'
            }
            self.create_database(mysql_config)
            mysql_config['db'] = settings.get('MYSQL_DATABASE', 'weibo')
            self.db = pymysql.connect(**mysql_config)
            self.cursor = self.db.cursor()
            self.create_table()
        except ImportError:
            spider.pymysql_error = True
        except pymysql.OperationalError:
            spider.mysql_error = True

    def process_item(self, item, spider):
        """Insert the post into the ``weibo`` table and pass the item on.

        A failing statement is rolled back and otherwise ignored (best
        effort). When no connection was established the item is passed on
        untouched.

        Returns:
            The unchanged ``item``.
        """
        db = getattr(self, 'db', None)
        cursor = getattr(self, 'cursor', None)
        if db is None or cursor is None:
            # open_spider could not connect; there is nothing to write to.
            return item

        data = dict(item['weibo'])
        # The picture URLs are stored as one comma-separated string.
        data['pics'] = ','.join(data['pics'])
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = """INSERT INTO {table}({keys}) VALUES ({values}) ON
                     DUPLICATE KEY UPDATE""".format(table='weibo',
                                                    keys=keys,
                                                    values=values)
        # NOTE(review): "key = key" assigns each column to itself, so a row
        # that already exists keeps its old values -- confirm this is intended.
        update = ','.join([" {key} = {key}".format(key=key) for key in data])
        sql += update
        try:
            cursor.execute(sql, tuple(data.values()))
            db.commit()
        except Exception:
            db.rollback()
        return item

    def close_spider(self, spider):
        """Close the connection if one was opened (best effort)."""
        db = getattr(self, 'db', None)
        if db is None:
            return
        try:
            db.close()
        except Exception:
            pass


class DuplicatesPipeline(object):
    """Drop posts whose id was already seen earlier in this crawl."""

    def __init__(self):
        # Ids of every post that has passed through this pipeline so far.
        self.ids_seen = set()

    def process_item(self, item, spider):
        """Return ``item`` the first time its post id shows up.

        Raises:
            DropItem: when a post with the same id was processed before.
        """
        weibo_id = item['weibo']['id']
        if weibo_id not in self.ids_seen:
            self.ids_seen.add(weibo_id)
            return item
        raise DropItem("过滤重复微博: %s" % item)


Overwriting ./weibo/pipelines.py


In [None]:
# Update the settings
%%writefile ./weibo/settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'weibo'
SPIDER_MODULES = ['weibo.spiders']
NEWSPIDER_MODULE = 'weibo.spiders'
COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False
# Only errors are logged.
LOG_LEVEL = 'ERROR'


# Time to wait after finishing one page before requesting the next one.
# (The original comment states a default of 10 seconds; the value set here is 5.)
DOWNLOAD_DELAY = 5
# Headers added to the requests, including the Weibo login cookie.
DEFAULT_REQUEST_HEADERS = {
    'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7',
    # NOTE(review): this looks like a real session cookie committed to a shared
    # notebook -- treat it as a leaked credential and replace it with your own
    # cookie before running.
    'cookie': 'SUB=_2A25Iu6MbDeRhGeFN41AU8S3JyDWIHXVruLrTrDV6PUJbktANLRX6kW1NQ9mv_DFBIDpTroGQ3zPyon0jqQpKIgC8; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhnHG8Y_sC9S7JaSwY.Qyci5JpX5KzhUgL.FoM01hzfeKefe0.2dJLoI7f2Ugf7dsLV9c9LUJHE; SSOLoginState=1707070283; ALF=1709662283; _T_WM=0f39824fa71ba05027ffb18593e03fd8'
}
# Enabled pipelines: duplicate filtering (300) and CSV export (301). The
# image, video, MongoDB and MySQL pipelines in pipelines.py are not enabled.
ITEM_PIPELINES = {
    'weibo.pipelines.DuplicatesPipeline': 300,
    'weibo.pipelines.CsvPipeline': 301
}


# Search parameters read by the spider.
# NOTE(review): the meaning of the numeric codes (WEIBO_TYPE, CONTAIN_TYPE,
# FURTHER_THRESHOLD) is defined by the upstream weibo-search project and is
# not visible in this notebook -- check its README before changing them.
KEYWORD_LIST = ['单身即地狱']
WEIBO_TYPE = 1
CONTAIN_TYPE = 0
REGION = ['全部']
START_DATE = '2023-12-30'
END_DATE = '2024-02-01'
FURTHER_THRESHOLD = 46
# Storage path for image files
IMAGES_STORE = './'
# Storage path for video files
FILES_STORE = './'

Overwriting ./weibo/settings.py


In [None]:
# Run the code: start the spider named "search" from the current directory.
# `-s JOBDIR=crawls/search` sets Scrapy's JOBDIR setting for this run.
# NOTE(review): Scrapy keeps crawl state in that directory, so a later run
# presumably resumes from it; remove crawls/search for a fresh crawl -- confirm
# against the Scrapy "Jobs: pausing and resuming crawls" documentation.
! scrapy crawl search -s JOBDIR=crawls/search


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 'screen_name': 'Blingbling大',
 'source': '微博视频号',
 'text': '#单身即地狱##恋综#女神们颜值与身材同时在线展开游戏大比拼胜利者能前往天堂岛愉快度假LBlingbling大的微博视频',
 'topics': '单身即地狱,恋综',
 'user_id': '6101001140',
 'video_url': 'http://f.video.weibocdn.com/o0/tD3XIQZBlx08c4zMLM8801041202fUns0E010.mp4?label=mp4_720p&template=1024x576.25.0&ori=0&ps=1BVp4ysnknHVZu&Expires=1707155456&ssig=QrzYDHRvoe&KID=unistore,video'}
Saving CSV file to: /content/weibo-search/weibo-search/weibo-search/weibo-search/结果文件/单身即地狱/单身即地狱.csv
{'article_url': '',
 'at_users': '',
 'attitudes_count': '2',
 'bid': 'NBxELmqZf',
 'comments_count': '0',
 'created_at': '2024-01-12 21:44',
 'id': '4989475155346957',
 'ip': '江苏',
 'location': '',
 'pics': '',
 'reposts_count': '0',
 'retweet_id': '',
 'screen_name': '有洛',
 'source': '单身即地狱3超话',
 'text': '单身即地狱3深蹲是什么洗屁屁游戏笑死',
 'topics': '',
 'user_id': '3892017456',
 'video_url': ''}
Saving CSV file to: /content/weibo-search/weibo-search/weibo-sear

In [None]:
# Check results
# CsvPipeline writes to 结果文件/<keyword>/<keyword>.csv relative to the
# directory the crawl was started from, which is the notebook's current working
# directory. A relative path therefore finds the file no matter where (or how
# deeply nested) the repository was cloned, unlike a hard-coded absolute path.
df = pd.read_csv('结果文件/单身即地狱/单身即地狱.csv')
# Keep only the post text column ('微博正文').
text_df = df[['微博正文']]
print(text_df)

# Save DataFrame to a CSV file
text_df.to_csv('weibo.csv', index=False)

# Download the file to the local computer
files.download('weibo.csv')


                                                  微博正文
0                          单身即地狱3第五期我真的笑喜不愧是韩女#单身即地狱3#
1                          前天一口气看了七期#单身即地狱3#什么时候更新啊着急看
2                                       看单身即地狱3好喜欢巧克力哥
3    1.1看了最新的单身即地狱吃了酸豆角炒肉好久没吃米饭了明天开始工作加油吃了褪黑素准备睡觉效果...
4                             一向不怎麼喜歡戀綜但單身即地獄3作為喜劇看挺不錯
..                                                 ...
925  #单身即地狱3#太抓马了哈哈哈哈哈第一次看韩综本来是陪老范看的结果也看的入迷绝对的情景喜剧官...
926  看了这么多恋综没有一个真的很讨厌的女主单身即地狱3的女一你做到了看完感觉被霸凌了从此想躲避韩...
927  #单身即地狱3#无聊找来看的，发现尹厦情果敢有主见有话直说不惯着藏着掖着真的太绝太洒了，很爱...
928  #单身即地狱3#太爽《单身即地狱3》女嘉宾好团结很少见到竞争向恋综有感情交叉的女嘉宾们不搞雌...
929  1⃣️尹大福每次去店里吹完毛就显得gang大一只，很蓬松很fat哈哈哈哈哈哈，但其实是一只小...

[930 rows x 1 columns]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>