In [158]:
import os
import pandas as pd
from rich.progress import track
from tqdm import tqdm
from utils.__init__ import *
import logging
import warnings
import time

In [2]:
logging.basicConfig(level=logging.INFO)
warnings.filterwarnings("ignore", category=FutureWarning)

In [72]:
def merge_csv_files(input_folder, output_file):
    """
    将指定文件夹中的所有 CSV 文件读入并合并保存为一个新的 CSV 文件。
    
    参数:
    input_folder (str): 包含 CSV 文件的文件夹路径
    output_file (str): 合并后的输出 CSV 文件路径
    """
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]
    
    # 检查是否存在 CSV 文件
    if not csv_files:
        print(f"没有找到 CSV 文件在文件夹: {input_folder}")
        return
    
    dataframes = []
    
    # 逐个读取 CSV 文件并添加到列表中
    for file in csv_files:
        file_path = os.path.join(input_folder, file)
        df = pd.read_csv(file_path)
        dataframes.append(df)
    
    # 如果输出文件已存在，将其加入合并列表中
    # if os.path.exists(output_file):
    #     exist_df = pd.read_csv(output_file)
    #     dataframes.append(exist_df)

    # 合并所有 DataFrame
    # combined_df = pd.concat(dataframes, ignore_index=True)
    # combined_df.reset_index(drop=True)
    # combined_df = combined_df.drop_duplicates()
    combined_df = pd.concat(dataframes).reset_index(drop=True).drop_duplicates()
    combined_df = combined_df.apply(lambda col: col.fillna(0).astype(int) if col.dtype=='float64' else col)
    if "Unnamed: 0" in combined_df.columns:
        combined_df = combined_df.drop(columns=["Unnamed: 0"])
    print(f"There are {len(combined_df)} comments in total.")
    # 保存合并后的 DataFrame 到新的 CSV 文件
    combined_df.to_csv(output_file, index=False, encoding="utf_8_sig")
    # print(f"所有 CSV 文件已合并并保存为 {output_file}")
    return len(combined_df)

### 爬虫信息解析类，用于获取微博、一级评论、二级评论

In [17]:
class WBParser:
    def __init__(self, cookie, folder="WBData"):
        self.cookie = cookie
        # 创建目录结构
        self._create_directories(folder)
        self.main_body_filepath = f"./{folder}/demo.csv"
        self.comments_level_1_filename = f"./{folder}/demo_comments_one.csv"
        self.comments_level_2_filename = f"./{folder}/demo_comments_two.csv"
        self.comments_level_1_dirpath = f"./{folder}/Comments_level_1/"
        self.comments_level_2_dirpath = f"./{folder}/Comments_level_2/"
        # logging.info(f"Successfully created root path: {folder}!")
        logging.info(f"Start parsing topic: {folder}!")

    def _create_directories(self, folder):
        """Helper function to create necessary directories."""
        os.makedirs(f"./{folder}", exist_ok=True)
        os.makedirs(f"./{folder}/Comments_level_1", exist_ok=True)
        os.makedirs(f"./{folder}/Comments_level_2", exist_ok=True)

    def get_main_body(self, q, kind):
        """Fetch main body and save it to CSV."""
        try:
            data = get_all_main_body(q, kind, self.cookie)
            data = data.reset_index(drop=True).astype(str).drop_duplicates()
            data.to_csv(self.main_body_filepath, encoding="utf_8_sig")
            logging.info(f"Main body data saved to {self.main_body_filepath}.")
        except Exception as e:
            logging.error(f"Error fetching main body: {e}")
        

    def get_comments_level_one(self):
        """Fetch level 1 comments and save them to CSV and folder."""
        data_list = []
        if not os.path.exists(self.main_body_filepath):
            logging.error("Main body not found, please fetch it first!")
            return

        main_body = pd.read_csv(self.main_body_filepath, index_col=0)
        logging.info(f"Found {main_body.shape[0]} main body entries, starting level 1 comments parsing...")

        try:
            with tqdm(total=len(range(main_body.shape[0])), desc="Parsing main body") as pbar:
                for ix in range(main_body.shape[0]):
                    # pbar.set_description(f"Processing {main_body.iloc[ix]["mid"]}")
                # for ix in tqdm(range(main_body.shape[0]), desc="Parsing main body"):
                    uid = int(float(main_body.iloc[ix]["uid"]))
                    mid = int(float(main_body.iloc[ix]["mid"]))
                    final_file_path = f"{self.comments_level_1_dirpath}{uid}_{mid}.csv"
                    # print(main_body.iloc[ix]["mid"], 1)

                    is_exist = self._check_existing_file(final_file_path)
                    pbar.set_description(f"Processing {main_body.iloc[ix]["mid"]}, exists={is_exist}")
                    # 如果文件已经存在且不为空，跳过
                    if not is_exist:
                        # print(main_body.iloc[ix]["mid"], 2)
                        data = get_all_level_one(uid=uid, mid=mid, cookie=self.cookie)
                        # print(main_body.iloc[ix]["mid"], 3)
                        data.drop_duplicates(inplace=True)
                        data.to_csv(final_file_path, encoding="utf_8_sig")
                        # print(main_body.iloc[ix]["mid"], 4)
                        data_list.append(data)
                    pbar.update(1)

            logging.info(f"Completed parsing all {main_body.shape[0]} main body entries for level 1 comments.")
        except Exception as e:
            logging.error(f"Error during level 1 comments parsing: {e}")

        self._save_combined_data(data_list, self.comments_level_1_filename)

    def get_comments_level_two(self):
        """Fetch level 2 comments and save them to CSV and folder."""
        data_list = []
        comments_level_1_data = self._load_level_1_comments()
        if comments_level_1_data is None:
            logging.error("Level 1 comments not found, please fetch them first!")
            return

        logging.info(f"Found {comments_level_1_data.shape[0]} level 1 comments, starting level 2 parsing...")

        try:
            # for ix in tqdm(range(comments_level_1_data.shape[0]), desc="Parsing level 1 comments"):
            with tqdm(total=comments_level_1_data.shape[0], desc="解析一级评论") as pbar:
                for ix in range(comments_level_1_data.shape[0]):
                    pbar.set_description(f"processing {comments_level_1_data.iloc[ix]['mid']}")
                    main_body_uid = int(float(comments_level_1_data.iloc[ix]["main_body_uid"]))
                    mid = int(float(comments_level_1_data.iloc[ix]["mid"]))
                    final_file_path = f"{self.comments_level_2_dirpath}{main_body_uid}_{mid}.csv"
                    # 如果文件已经存在且不为空，跳过
                    if not self._check_existing_file(final_file_path):
                        data = get_all_level_two(uid=main_body_uid, mid=mid, cookie=self.cookie)
                        data.drop_duplicates(inplace=True)
                        data.to_csv(final_file_path, encoding="utf_8_sig")
                        data_list.append(data)
                    pbar.update(1)

            logging.info(f"Completed parsing all {comments_level_1_data.shape[0]} level 1 comments for level 2 comments.")
        except Exception as e:
            logging.error(f"Error during level 2 comments parsing: {e}")

        self._save_combined_data(data_list, self.comments_level_2_filename)

    def _load_level_1_comments(self):
        """Load level 1 comments from the CSV file or directory."""
        try:
            if os.path.exists(self.comments_level_1_filename):
                return pd.read_csv(self.comments_level_1_filename, index_col=0)
            else:
                file_list = [self.comments_level_1_dirpath + item for item in os.listdir(self.comments_level_1_dirpath) if item.endswith('.csv')]
                if file_list:
                    return pd.concat([pd.read_csv(file) for file in file_list]).reset_index(drop=True).astype(str).drop_duplicates()
                else:
                    return None
        except Exception as e:
            logging.error(f"Error loading level 1 comments: {e}")
            return None

    def _check_existing_file(self, filepath):
        """Check if a CSV file exists and is not empty."""
        if os.path.exists(filepath):
            length = pd.read_csv(filepath).shape[0]
            if length > 0:
                # logging.info(f"Skipping existing file: {filepath}, already has {length} entries.")
                return True
        return False

    def _save_combined_data(self, data_list, filename):
        """Save combined data from the list to a CSV file."""
        if data_list:
            data = pd.concat(data_list).reset_index(drop=True).astype(str).drop_duplicates()
            data.to_csv(filename, encoding='utf_8_sig')
            logging.info(f"Data successfully saved to {filename}.")
        else:
            logging.warning(f"No data to save to {filename}.")

    


### 对指定主题进行爬虫

In [5]:
themes = [
    "#胖猫姐姐刘某操控舆论#",
    "#胖猫事件带给我们的反思才刚刚开始#",
    "#胖猫案细节公布#",
    "#胖猫事件#",
    "#警方通报胖猫事件调查情况#",
    "#胖猫谭竹#"
]
kind = "综合"  # 综合，实时，热门，高级
cookie = ""

In [36]:
for q in themes:
    wbparser = WBParser(cookie, q)
    # 获取主题内容
    print("Main body")
    wbparser.get_main_body(q, kind)
    # 获取一级评论
    print("Level 1 comments")
    wbparser.get_comments_level_one()
    # 获取二级评论
    print("Level 2 comments")
    wbparser.get_comments_level_two()
    
    print("1 minutes break")
    for i in tqdm(range(60)):
        time.sleep(1)

INFO:root:Successfully created root path: #胖猫姐姐刘某操控舆论#!


Main body


INFO:root:话题：#胖猫姐姐刘某操控舆论#，类型：综合，解析成功，一共有42页，准备开始解析...
解析中...: 100%|██████████| 41/41 [00:20<00:00,  2.00it/s]
INFO:root:话题：#胖猫姐姐刘某操控舆论#，类型：综合，一共有42页，已经解析完毕！
INFO:root:Main body data saved to ./#胖猫姐姐刘某操控舆论#/demo.csv.
INFO:root:Found 414 main body entries, starting level 1 comments parsing...


Level 1 comments


Parsing main body: 100%|██████████| 414/414 [41:22<00:00,  6.00s/it]   
INFO:root:Completed parsing all 414 main body entries for level 1 comments.
INFO:root:Data successfully saved to ./#胖猫姐姐刘某操控舆论#/demo_comments_one.csv.
INFO:root:Found 6663 level 1 comments, starting level 2 parsing...


Level 2 comments


Parsing level 1 comments:   0%|          | 2/6663 [16:34<920:23:29, 497.43s/it] 
ERROR:root:Error during level 2 comments parsing: 解析页面失败，请检查你的cookie是否正确！
INFO:root:Data successfully saved to ./#胖猫姐姐刘某操控舆论#/demo_comments_two.csv.


1 minutes break


100%|██████████| 60/60 [01:00<00:00,  1.00s/it]
INFO:root:Successfully created root path: #胖猫事件带给我们的反思才刚刚开始#!


Main body


INFO:root:话题：#胖猫事件带给我们的反思才刚刚开始#，类型：综合，解析成功，一共有 8页，准备开始解析...
解析中...: 100%|██████████| 7/7 [00:03<00:00,  2.20it/s]
INFO:root:话题：#胖猫事件带给我们的反思才刚刚开始#，类型：综合，一共有 8页，已经解析完毕！
INFO:root:Main body data saved to ./#胖猫事件带给我们的反思才刚刚开始#/demo.csv.
INFO:root:Found 78 main body entries, starting level 1 comments parsing...


Level 1 comments


Parsing main body: 100%|██████████| 78/78 [00:47<00:00,  1.64it/s]
INFO:root:Completed parsing all 78 main body entries for level 1 comments.
INFO:root:Data successfully saved to ./#胖猫事件带给我们的反思才刚刚开始#/demo_comments_one.csv.
INFO:root:Found 415 level 1 comments, starting level 2 parsing...


Level 2 comments


Parsing level 1 comments: 100%|██████████| 415/415 [02:53<00:00,  2.39it/s]
INFO:root:Completed parsing all 415 level 1 comments for level 2 comments.
INFO:root:Data successfully saved to ./#胖猫事件带给我们的反思才刚刚开始#/demo_comments_two.csv.


1 minutes break


100%|██████████| 60/60 [01:00<00:00,  1.00s/it]
INFO:root:Successfully created root path: #胖猫案细节公布#!


Main body


INFO:root:话题：#胖猫案细节公布#，类型：综合，解析成功，一共有35页，准备开始解析...
解析中...: 100%|██████████| 34/34 [00:23<00:00,  1.46it/s]
INFO:root:话题：#胖猫案细节公布#，类型：综合，一共有35页，已经解析完毕！
INFO:root:Main body data saved to ./#胖猫案细节公布#/demo.csv.
INFO:root:Found 339 main body entries, starting level 1 comments parsing...


Level 1 comments


Parsing main body: 100%|██████████| 339/339 [19:49<00:00,  3.51s/it]   
INFO:root:Completed parsing all 339 main body entries for level 1 comments.
INFO:root:Data successfully saved to ./#胖猫案细节公布#/demo_comments_one.csv.
INFO:root:Found 5311 level 1 comments, starting level 2 parsing...


Level 2 comments


Parsing level 1 comments:   0%|          | 1/5311 [16:01<1418:06:05, 961.42s/it]
ERROR:root:Error during level 2 comments parsing: 解析页面失败，请检查你的cookie是否正确！
INFO:root:Data successfully saved to ./#胖猫案细节公布#/demo_comments_two.csv.


1 minutes break


100%|██████████| 60/60 [01:00<00:00,  1.00s/it]
INFO:root:Successfully created root path: #胖猫事件#!


Main body


INFO:root:话题：#胖猫事件#，类型：综合，解析成功，一共有47页，准备开始解析...
解析中...: 100%|██████████| 46/46 [00:22<00:00,  2.01it/s]
INFO:root:话题：#胖猫事件#，类型：综合，一共有47页，已经解析完毕！
INFO:root:Main body data saved to ./#胖猫事件#/demo.csv.
INFO:root:Found 370 main body entries, starting level 1 comments parsing...


Level 1 comments


Parsing main body:   1%|          | 4/370 [15:11<23:09:57, 227.86s/it]
ERROR:root:Error during level 1 comments parsing: 解析页面失败，请检查你的cookie是否正确！
INFO:root:Data successfully saved to ./#胖猫事件#/demo_comments_one.csv.
INFO:root:Found 953 level 1 comments, starting level 2 parsing...


Level 2 comments


Parsing level 1 comments:  44%|████▎     | 415/953 [03:44<04:51,  1.84it/s]
ERROR:root:Error during level 2 comments parsing: 解析页面失败，请检查你的cookie是否正确！
INFO:root:Data successfully saved to ./#胖猫事件#/demo_comments_two.csv.


1 minutes break


100%|██████████| 60/60 [01:00<00:00,  1.00s/it]
INFO:root:Successfully created root path: #警方通报胖猫事件调查情况#!


Main body


INFO:root:话题：#警方通报胖猫事件调查情况#，类型：综合，解析成功，一共有38页，准备开始解析...
解析中...: 100%|██████████| 37/37 [00:20<00:00,  1.79it/s]
INFO:root:话题：#警方通报胖猫事件调查情况#，类型：综合，一共有38页，已经解析完毕！
INFO:root:Main body data saved to ./#警方通报胖猫事件调查情况#/demo.csv.
INFO:root:Found 347 main body entries, starting level 1 comments parsing...


Level 1 comments


Parsing main body: 100%|██████████| 347/347 [14:56<00:00,  2.58s/it]  
INFO:root:Completed parsing all 347 main body entries for level 1 comments.
INFO:root:Data successfully saved to ./#警方通报胖猫事件调查情况#/demo_comments_one.csv.
INFO:root:Found 3694 level 1 comments, starting level 2 parsing...


Level 2 comments


Parsing level 1 comments:  51%|█████▏    | 1896/3694 [1:37:37<1:32:35,  3.09s/it]   
ERROR:root:Error during level 2 comments parsing: 解析页面失败，请检查你的cookie是否正确！
INFO:root:Data successfully saved to ./#警方通报胖猫事件调查情况#/demo_comments_two.csv.


1 minutes break


100%|██████████| 60/60 [01:00<00:00,  1.00s/it]
INFO:root:Successfully created root path: #胖猫谭竹#!


Main body


INFO:root:话题：#胖猫谭竹#，类型：综合，解析成功，一共有44页，准备开始解析...
解析中...: 100%|██████████| 43/43 [00:19<00:00,  2.25it/s]
INFO:root:话题：#胖猫谭竹#，类型：综合，一共有44页，已经解析完毕！
INFO:root:Main body data saved to ./#胖猫谭竹#/demo.csv.
INFO:root:Found 412 main body entries, starting level 1 comments parsing...


Level 1 comments


Parsing main body:   0%|          | 2/412 [02:09<7:23:10, 64.86s/it]
ERROR:root:Error during level 1 comments parsing: 解析页面失败，请检查你的cookie是否正确！
INFO:root:Data successfully saved to ./#胖猫谭竹#/demo_comments_one.csv.
INFO:root:Found 142 level 1 comments, starting level 2 parsing...


Level 2 comments


Parsing level 1 comments:  16%|█▌        | 23/142 [04:34<23:42, 11.95s/it]
ERROR:root:Error during level 2 comments parsing: 解析页面失败，请检查你的cookie是否正确！
INFO:root:Data successfully saved to ./#胖猫谭竹#/demo_comments_two.csv.


1 minutes break


100%|██████████| 60/60 [01:00<00:00,  1.00s/it]


### 统计每个话题一级和二级评论
因为分了好几次爬虫，每个一级和二级评论会分开保存，但是合并的保存不一定完整

In [74]:
cur_dir = os.getcwd()
# cur_dir = "saved2"
# print(cur_dir)
comments_1 = 0 
comments_2 = 0
for dir in os.listdir(cur_dir):
    # print(dir)
    if dir.startswith('#'):
        print(dir)
        print("Comment level 1:")
        comment1_path = os.path.join(cur_dir, dir, "Comments_level_1")
        comment1_out = os.path.join(cur_dir, dir, "1_demo_comments.csv")
        comments_1 += merge_csv_files(comment1_path, comment1_out)
        try:
            print("Comment level 2:")
            comment2_path = os.path.join(cur_dir, dir, "Comments_level_2")
            comment2_out = os.path.join(cur_dir, dir, "2_demo_comments.csv")
            comments_2 += merge_csv_files(comment2_path, comment2_out)
        except:
            print("No level 2 comments")

print(f"level 1 comments: {comments_1}")
print(f"level 2 comments: {comments_2}")


#警方通报胖猫事件调查情况#
Comment level 1:
There are 3694 comments in total.
Comment level 2:
There are 5571 comments in total.
#胖猫谭竹#
Comment level 1:
There are 7328 comments in total.
Comment level 2:
没有找到 CSV 文件在文件夹: /home1/zhangxiao/projects/crawl/saved2/#胖猫谭竹#/Comments_level_2
No level 2 comments
#胖猫姐姐刘某操控舆论#
Comment level 1:
There are 6663 comments in total.
Comment level 2:
There are 3180 comments in total.
#胖猫事件#
Comment level 1:
There are 8255 comments in total.
Comment level 2:
There are 411 comments in total.
#胖猫事件带给我们的反思才刚刚开始#
Comment level 1:
There are 415 comments in total.
Comment level 2:
There are 134 comments in total.
#胖猫案细节公布#
Comment level 1:
There are 5311 comments in total.
Comment level 2:
There are 442 comments in total.
level 1 comments: 31666
level 2 comments: 9738


In [84]:
# 将以及评论合并到一起
df = pd.read_excel("/home1/zhangxiao/projects/crawl/final/comment_level_1.xlsx")
# print("unnamed" in df)
print(df.columns[0])
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])
df[:2]

Unnamed: 0


Unnamed: 0,main_body_mid,main_body_uid,发布时间,处理内容,评论地点,mid,回复数量,点赞数量,原生内容,uid,...,用户地理位置,用户性别,用户粉丝数量,用户关注数量,用户全部微博,用户累计评论,用户累计转发,用户累计获赞,用户转评赞,用户认证信息
0,5032806996771137,7513711425,24年05月11日 11:31,小三姑你跳河我会伤心的,来自贵州,5032807607304440,2,0,小三姑你跳河我会伤心的,7914578798,...,贵州,m,116,398,112,696,7,586,1289,
1,5032806996771137,7513711425,24年05月11日 11:30,"你没事吧<img alt=""[二哈]"" title=""[二哈]"" src=""https://...",来自上海,5032807418824048,0,0,你没事吧[二哈],6530018710,...,海外,m,653,48,287,477,53,1172,1702,


### 将获取的csv 结果按话题保存到excel中
为了重新测试爬虫，备份了一次，将现有数据和备份数据进行合并然后进行保存

In [120]:
root_path = "/home1/zhangxiao/projects/crawl"

def process_comments_for_theme(save_paths, theme, level):
    """
    处理指定话题下的评论数据，并返回合并后的 DataFrame 和该话题的统计数据。
    
    参数:
    - save_paths: 保存文件路径列表。
    - theme: 当前话题。
    - level: 评论的层级（0为demo，1和2为一级、二级评论）。
    
    返回:
    - merged_df: 合并后的 DataFrame。
    - total_items: 合并后的评论条数。
    """
    data_list = []
    seperate_items = []

    for s in save_paths:
        comment_csv = os.path.join(s, theme, f"{'demo' if level == 0 else f'{level}_demo_comments'}.csv")
        try:
            df = pd.read_csv(comment_csv)
            # print(f"{s.split('/')[-1]}: {len(df)} items for theme: {theme}")
            data_list.append(df)
            seperate_items.append(len(df))
        except:
            print(f"No such csv file: {comment_csv}")

    

    if data_list:
        # 合并并去重
        merged_df = pd.concat(data_list).reset_index(drop=True).astype(str)
        if "Unnamed: 0" in merged_df.columns:
            merged_df = merged_df.drop(columns="Unnamed: 0")
        merged_df = merged_df.drop_duplicates()
        total_items = len(merged_df)
        return merged_df, seperate_items, total_items
    else:
        print(f"No data to merge for theme: {theme}")
        return pd.DataFrame(),None, 0

In [147]:
def get_all_comments(level,output_path=None):
    """
    根据指定的评论层级，处理所有话题的评论数据，并保存统计信息到一个 DataFrame 中。
    
    参数:
    - level: 评论层级，0为demo，1和2为一级、二级评论。
    
    返回:
    - stats_df: 统计信息的 DataFrame。
    """
    save_paths = [os.path.join(root_path, "saved"), os.path.join(root_path, "saved2")]
    themes = [
        '#胖猫姐姐刘某操控舆论#', 
        '#胖猫事件带给我们的反思才刚刚开始#', 
        '#胖猫案细节公布#', 
        '#胖猫事件#', 
        '#警方通报胖猫事件调查情况#', 
        '#胖猫谭竹#'
    ]

    # 初始化用于保存统计数据的 DataFrame
    # stats_df = pd.DataFrame(columns=['Theme', 'Total_Comments'])
    total_number = 0
    print(f"Processing {'demo' if level == 0 else f'level {level}'} comments")
    
    # 仅供观赏，dataframe 保存统计结果
    stats_dict = dict()
    
    for theme in themes:
        # print(f"Processing theme: {theme}")
        merged_df, sep_items, total_items = process_comments_for_theme(save_paths, theme, level)
        stats_dict[theme] = {
            'total_num': total_items
        }
        for i in range(len(sep_items)):
            stats_dict[theme][f'num items{i+1}'] = sep_items[i]
        # print(theme, sep_items, total_items)
        total_number += total_items
        if output_path is not None:
            merged_df.to_excel(output_path)
        # 将统计结果保存到 DataFrame 中
        # stats_df = stats_df({'Theme': theme, 'Total_Comments': total_items}, ignore_index=True)
    
    stats_df = pd.DataFrame(stats_dict).T
    stats_df = stats_df.fillna(0).astype(int)
    # print(stats_df)
    return total_number, stats_df

In [148]:
# 示例调用，处理 demo 评论，保存统计信息到 DataFrame 中
# 如果需要保存，在get_all_comments中添加output_path 就好了
num_demo, demo_df = get_all_comments(0)
print(f"There are totally {num_demo} demos.")
num_com1, com1_df = get_all_comments(1)
print(f"There are totally {num_com1} level 1 comments.")
num_com2, com2_df = get_all_comments(2)
print(f"There are totally {num_com2} level 2 comments.")
# print(stats_df)

Processing demo comments
There are totally 2205 demos.
Processing level 1 comments
There are totally 39075 level 1 comments.
Processing level 2 comments
No such csv file: /home1/zhangxiao/projects/crawl/saved2/#胖猫谭竹#/2_demo_comments.csv
There are totally 19086 level 2 comments.


In [149]:
demo_df

Unnamed: 0,total_num,num items1,num items2
#胖猫姐姐刘某操控舆论#,421,414,414
#胖猫事件带给我们的反思才刚刚开始#,79,79,78
#胖猫案细节公布#,339,339,339
#胖猫事件#,386,369,365
#警方通报胖猫事件调查情况#,469,244,347
#胖猫谭竹#,511,230,414


In [150]:
com1_df

Unnamed: 0,total_num,num items1,num items2
#胖猫姐姐刘某操控舆论#,9033,7296,6663
#胖猫事件带给我们的反思才刚刚开始#,491,416,415
#胖猫案细节公布#,6277,5293,5311
#胖猫事件#,8448,494,8255
#警方通报胖猫事件调查情况#,5138,1535,3694
#胖猫谭竹#,9688,2629,7328


In [151]:
com2_df

Unnamed: 0,total_num,num items1,num items2
#胖猫姐姐刘某操控舆论#,5075,1895,3180
#胖猫事件带给我们的反思才刚刚开始#,145,134,134
#胖猫案细节公布#,1985,1912,442
#胖猫事件#,499,348,411
#警方通报胖猫事件调查情况#,7985,2563,5571
#胖猫谭竹#,3397,3397,0


### 将每个话题的数据统计到同一excel文件下


In [155]:
final_path = os.getcwd() + "/final"
comment_1_list = []
comment_2_list = []
demo_list = []

for file in os.listdir(final_path):
    if file.endswith(".xlsx"):
        df = pd.read_excel(os.path.join(final_path, file))
        if file.endswith("1.xlsx"):
            comment_1_list.append(df)
        elif file.endswith("2.xlsx"):
            comment_2_list.append(df)
        elif file.endswith("demo.xlsx"):
            demo_list.append(df)

merged_df1 = pd.concat(comment_1_list).reset_index(drop=True).astype(str).drop(columns="Unnamed: 0").drop_duplicates()
merged_df2 = pd.concat(comment_2_list).reset_index(drop=True).astype(str).drop(columns="Unnamed: 0").drop_duplicates()
merged_demo = pd.concat(demo_list).reset_index(drop=True).astype(str).drop(columns="Unnamed: 0").drop_duplicates()

print(f"There are totally {len(merged_df1)} level 1 comments.")
print(f"There are totally {len(merged_df2)} level 2 comments.")
print(f"There are totally {len(merged_demo)} demos.")

There are totally 38102 level 1 comments.
There are totally 19086 level 2 comments.
There are totally 2807 demos.


In [156]:
merged_df1[:10]

Unnamed: 0,main_body_mid,main_body_uid,发布时间,处理内容,评论地点,mid,回复数量,点赞数量,原生内容,uid,...,用户地理位置,用户性别,用户粉丝数量,用户关注数量,用户全部微博,用户累计评论,用户累计转发,用户累计获赞,用户转评赞,用户认证信息
0,5032806996771137,7513711425,24年05月11日 11:31,小三姑你跳河我会伤心的,来自贵州,5032807607304440,2,0,小三姑你跳河我会伤心的,7914578798,...,贵州,m,116,398,112,696,7,586,1289,
1,5032806996771137,7513711425,24年05月11日 11:30,"你没事吧<img alt=""[二哈]"" title=""[二哈]"" src=""https://...",来自上海,5032807418824048,0,0,你没事吧[二哈],6530018710,...,海外,m,653,48,287,477,53,1172,1702,
2,5031558065161616,6013715279,24年05月08日 00:56,干脆直接单身吧这些男的,来自江苏,5031560537703260,0,1,干脆直接单身吧这些男的,7442435940,...,其他,m,0,74,7,0,0,17,17,
3,5031367307690922,2400903283,24年05月08日 06:36,转行情感博主了,来自加拿大,5031646275043522,3,1,转行情感博主了,2487758140,...,海外 加拿大,m,262,746,1289,630,198,182,1010,
4,5031367307690922,2400903283,24年05月07日 12:56,真的，我最想说的就是，爱人先爱己，你的人生不是只有另一半，生命很重要也很脆弱，希望大家都可以...,来自河南,5031379446269610,0,2,真的，我最想说的就是，爱人先爱己，你的人生不是只有另一半，生命很重要也很脆弱，希望大家都可以...,6977404515,...,其他,m,13,831,801,89,5,746,840,
5,5031367307690922,2400903283,24年05月07日 12:47,"你真的说的很好<img alt=""[打call]"" title=""[打call]"" src=...",来自湖北,5031377187901862,0,0,你真的说的很好[打call],6424201018,...,其他,m,1,8,9,4,0,3,7,
6,5031367307690922,2400903283,24年05月07日 12:46,说得好,来自河南,5031376889839837,0,0,说得好,7597954267,...,其他,f,1,2,0,0,0,0,0,
7,5032806801212580,7512842757,24年05月11日 11:33,支持谭竹,来自江苏,5032808093320739,6,52,支持谭竹,7292530205,...,北京,m,41,77,2644,18,3,92,113,
8,5032806801212580,7512842757,24年05月11日 20:05,"有点幽默了，左上角的照片为什么不发原图，还要给他美颜呢<img alt=""[允悲]"" tit...",来自河南,5032936841674883,6,6,有点幽默了，左上角的照片为什么不发原图，还要给他美颜呢[允悲][允悲]，不会是觉得太稠拿不出...,7906565460,...,其他,f,8,93,33,13,26,12,51,
9,5032806801212580,7512842757,24年05月11日 11:31,怎么天天都是他,来自江苏,5032807472827574,2,10,怎么天天都是他,3107567797,...,广东 汕头,m,8,93,4203,14,1,40,55,


In [157]:
c1_excel = os.path.join(final_path, "all_together", "comment_level_1.xlsx")
merged_df1.to_excel(c1_excel)
print(f"Successfully saved level 1 comments in {c1_excel}")

c2_excel = os.path.join(final_path, "all_together", "comment_level_2.xlsx")
merged_df1.to_excel(c2_excel)
print(f"Successfully saved level 2 comments in {c2_excel}")

demo_excel = os.path.join(final_path, "all_together", "demo.xlsx")
merged_df1.to_excel(demo_excel)
print(f"Successfully saved demo in {demo_excel}")


Successfully saved level 1 comments in /home1/zhangxiao/projects/crawl/final/all_together/comment_level_1.xlsx
Successfully saved level 2 comments in /home1/zhangxiao/projects/crawl/final/all_together/comment_level_2.xlsx
Successfully saved demo in /home1/zhangxiao/projects/crawl/final/all_together/demo.xlsx
