In [1]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.edge.options import Options as EdgeOptions

import datetime
import pandas as pd

In [2]:
# Configure Edge to run without a visible window (headless mode).
options = EdgeOptions()
options.add_argument("--headless")
# Start the WebDriver; the driver location comes from EdgeChromiumDriverManager().install().
driver = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()), options=options)
# NOTE(review): no visible cell calls driver.quit(); close the browser once scraping is done.

# Take a single timestamp so `today` and `yesterday` cannot straddle midnight
# (two separate now() calls could land on different calendar days).
now = datetime.datetime.now()
today = now.strftime("%m-%d")
yesterday = (now - datetime.timedelta(days=1)).strftime("%m-%d")

# Read `code` as text: numeric inference would strip leading zeros.
# NOTE(review): presumably `code` holds the zero-padded codes seen in the scraped
# URLs (e.g. 002466) - confirm against data/sichuan.csv.
companies = pd.read_csv("data/sichuan.csv", dtype={"code": str})

In [3]:
# Expose the website, name and code columns as standalone Series.
urls, names, codes = companies["website"], companies["name"], companies["code"]

In [4]:
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd

In [12]:
# Placeholder stored when a page yields no usable category.
UNKNOWN_CATEGORY = "未知"
category_list = []

# Visit each URL and record the text of the first link inside div.xgbk.
# Iterating the Series directly avoids label-based urls[i] lookups, which only
# work while the index is the default 0..n-1 range.
for url in urls:
    print(url)
    driver.get(url)

    # Exactly one entry is appended per URL, so the list stays aligned with `urls`.
    text = UNKNOWN_CATEGORY
    try:
        div = driver.find_element(By.CSS_SELECTOR, "div.xgbk")
        soup = BeautifulSoup(div.get_attribute('outerHTML'), 'html.parser')
        first_link = soup.find("a")
        if first_link is None:
            print(f"  no <a> inside div.xgbk; using {UNKNOWN_CATEGORY}")
        else:
            # strip=True trims surrounding spaces and newlines. An empty label
            # falls back to the placeholder: an empty string would come back as
            # a missing value once the saved CSV is re-read and grouped.
            text = first_link.get_text(strip=True) or UNKNOWN_CATEGORY
            print(text)
    except Exception as exc:
        # Best-effort scraping: keep going, but say why this page was skipped
        # instead of swallowing the error silently.
        print(f"  category lookup failed ({type(exc).__name__}); using {UNKNOWN_CATEGORY}")
    category_list.append(text)
    

https://quote.eastmoney.com/unify/r/0.002466
能源金属
https://quote.eastmoney.com/unify/r/0.000598
公用事业
https://quote.eastmoney.com/unify/r/1.601838
银行
https://quote.eastmoney.com/unify/r/0.002312
化肥行业
https://quote.eastmoney.com/unify/r/0.300841
生物制品
https://quote.eastmoney.com/unify/r/1.600603
物流行业
https://quote.eastmoney.com/unify/r/0.000876
农牧饲渔
https://quote.eastmoney.com/unify/r/1.601208
塑料制品
https://quote.eastmoney.com/unify/r/0.000568
酿酒行业
https://quote.eastmoney.com/unify/r/0.000155
电力行业
https://quote.eastmoney.com/unify/r/1.600039
工程建设
https://quote.eastmoney.com/unify/r/0.000858
酿酒行业
https://quote.eastmoney.com/unify/r/0.002697
商业百货
https://quote.eastmoney.com/unify/r/1.688709
半导体
https://quote.eastmoney.com/unify/r/0.000688
有色金属
https://quote.eastmoney.com/unify/r/0.002497
化学制品
https://quote.eastmoney.com/unify/r/0.301050
半导体
https://quote.eastmoney.com/unify/r/1.600793
造纸印刷
https://quote.eastmoney.com/unify/r/1.688302
生物制品
https://quote.eastmoney.com/unify/r/1.688283
专用设备
http

In [13]:
# Attach the scraped labels as a new column. The scraping loop appends exactly one
# entry per URL, so category_list lines up with the rows of `companies`.
companies["category"] = category_list

In [16]:
# Write the table to disk, leaving out the DataFrame index.
companies.to_csv(
    path_or_buf="data/sichuan_new.csv",
    index=False,
)

In [19]:
unique_categories = set(category_list)
# Save the de-duplicated categories to a text file, one per line.
file_path = "data/categories.txt"  # output file name

# A set of strings has no stable iteration order across interpreter runs, so
# sort before writing to keep the file reproducible and diff-friendly.
with open(file_path, "w", encoding="utf-8") as file:
    for category in sorted(unique_categories):
        file.write(category + "\n")

print(f"Unique categories have been saved to {file_path}")

Unique categories have been saved to data/categories.txt


In [2]:
import os

# Read the CSV file.
file_path = "data/sichuan_new.csv"  # replace with your own file path
# Keep `code` as text: numeric inference would strip leading zeros.
# NOTE(review): presumably `code` holds zero-padded codes like those in the
# scraped URLs (e.g. 002466) - confirm against the data.
df = pd.read_csv(file_path, dtype={"code": str})

# Group rows by the category column.
grouped = df.groupby("category")

# Write every group to its own CSV file, named after the category.
output_dir = "output_categories/"  # output directory
os.makedirs(output_dir, exist_ok=True)  # create the directory if it is missing

for category, group in grouped:
    output_file = os.path.join(output_dir, f"{category}.csv")
    group.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved {category} group to {output_file}")

Saved 专用设备 group to output_categories/专用设备.csv
Saved 中药 group to output_categories/中药.csv
Saved 互联网服务 group to output_categories/互联网服务.csv
Saved 仪器仪表 group to output_categories/仪器仪表.csv
Saved 光伏设备 group to output_categories/光伏设备.csv
Saved 光学光电子 group to output_categories/光学光电子.csv
Saved 公用事业 group to output_categories/公用事业.csv
Saved 农牧饲渔 group to output_categories/农牧饲渔.csv
Saved 农药兽药 group to output_categories/农药兽药.csv
Saved 包装材料 group to output_categories/包装材料.csv
Saved 化学制品 group to output_categories/化学制品.csv
Saved 化学制药 group to output_categories/化学制药.csv
Saved 化学原料 group to output_categories/化学原料.csv
Saved 化肥行业 group to output_categories/化肥行业.csv
Saved 医疗器械 group to output_categories/医疗器械.csv
Saved 医疗服务 group to output_categories/医疗服务.csv
Saved 医药商业 group to output_categories/医药商业.csv
Saved 半导体 group to output_categories/半导体.csv
Saved 商业百货 group to output_categories/商业百货.csv
Saved 塑料制品 group to output_categories/塑料制品.csv
Saved 家用轻工 group to output_categories/家用轻工.csv
Saved 家电行业 grou

In [6]:
# Stack all groups into one table, with an empty row after each group.
blank_row = pd.DataFrame([{}])  # one row, no columns -> an all-empty CSV line
pieces = []
for category, group in grouped:
    # Cast to object first: concatenating with the column-less separator row
    # introduces missing values, which would otherwise upcast integer columns
    # to float64 and write them as e.g. 123.0.
    pieces.append(group.astype(object))
    pieces.append(blank_row)

# Concatenate once; calling pd.concat inside the loop re-copies the accumulated
# frame on every iteration. An empty groupby still yields an empty frame.
output_df = pd.concat(pieces) if pieces else pd.DataFrame()

# Save to a new CSV file.
output_file = "data/sichuan_grouped.csv"
output_df.to_csv(output_file, index=False, encoding="utf-8")

print(f"Grouped data saved to {output_file}")

Grouped data saved to data/sichuan_grouped.csv
