# 爬虫

## 基于requests的单线程爬虫

In [4]:
from crawler.news import NewsCrawler
crawler = NewsCrawler()

In [None]:
crawler.crawl()

In [None]:
crawler.save_data('data/cn')

In [None]:
import json
import jieba
from tqdm import tqdm

with open('data/cn/washed.json', 'r') as f:
    data = json.load(f)

# averange length of news
total = 0
for news in tqdm(data):
    total += len(jieba.lcut(news['content']))
print(total/len(data))

In [None]:
import json
from tqdm import tqdm

with open('data/en/washed.json', 'r') as f:
    data = json.load(f)

# averange length of news
total = 0
for news in tqdm(data):
    total += len(news['content'].split())
print(total/len(data))

## 基于Scrapy的并发爬虫

Scrapy 是一个非常强大的 Python 框架，用于 Web 爬虫和数据抓取。它可以轻松地爬取网站上的数据，并将其存储在所需的格式中（如 CSV、JSON 或数据库）。

首先在项目根目录下创建名为`news_crawler`的Scrapy爬虫项目

```bash
scrapy startproject news_crawler
```

生成一个爬虫模板，稍后按本实验的需求修改：

```bash
scrapy genspider example quotes.toscrape.com
```

### 爬取中文数据
通过Scrapy框架运行爬虫：

```bash
cd news_crawler
scrapy crawl news_spider -s CLOSESPIDER_ITEMCOUNT=10000 -s OUTPUT_DIR="../../data/cn" -a language="cn" -a start_keyword="1"
```

### 爬取英文数据
通过Scrapy框架运行爬虫，只是更换语言即可：

```bash
cd news_crawler
scrapy crawl news_spider -s CLOSESPIDER_ITEMCOUNT=20000 -s OUTPUT_DIR="../../data/en" -a language="en" -a start_keyword="1"
```

# 数据处理

## 去除乱码

In [None]:
import json
import re
from tqdm import tqdm

def wash_cn(text: str) -> str:
    text = re.sub(r'\s+', ' ', text.strip())
    # 替换中文标点为标准形式
    text = re.sub(r'[！!]', '！', text)  # 统一感叹号
    text = re.sub(r'[。]', '。', text)    # 统一句号
    text = re.sub(r'[，,]', '，', text)   # 统一逗号
    text = re.sub(r'[\u3000]', ' ', text) # 去掉中文全角空格
    text = re.sub(r'[“”]', '', text)    # 去除引号
    text = re.sub(r'[^，。？！：；“”‘’\u4e00-\u9fa50-9]', '', text)  # 保留中文字符、标点和阿拉伯数字
    return text

with open('data/cn/data.json', 'r') as f:
    data = json.load(f)
    
for news in tqdm(data):
    news['content'] = wash_cn(news['content'])
    
with open('data/cn/washed.json', 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

```
2024-09-25 12:52:45 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8349774,
 'downloader/request_count': 23042,
 'downloader/request_method_count/GET': 23042,
 'downloader/response_bytes': 74200410,
 'downloader/response_count': 23042,
 'downloader/response_status_count/200': 23042,
 'dupefilter/filtered': 2200,
 'elapsed_time_seconds': 267.455448,
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2024, 9, 25, 4, 52, 45, 644444, tzinfo=datetime.timezone.utc),
 'httpcompression/response_bytes': 217377512,
 'httpcompression/response_count': 22651,
 'item_scraped_count': 20025,
 'log_count/DEBUG': 43073,
 'log_count/INFO': 23444,
 'memusage/max': 168902656,
 'memusage/startup': 74584064,
 'offsite/domains': 1,
 'offsite/filtered': 1,
 'request_depth_max': 47,
 'response_received_count': 23042,
 'scheduler/dequeued': 23042,
 'scheduler/dequeued/memory': 23042,
 'scheduler/enqueued': 27585,
 'scheduler/enqueued/memory': 27585,
 'start_time': datetime.datetime(2024, 9, 25, 4, 48, 18, 188996, tzinfo=datetime.timezone.utc)}
2024-09-25 12:52:45 [scrapy.core.engine] INFO: Spider closed (closespider_itemcount)
```

In [None]:

import json
from tqdm import tqdm

def wash_en(text: str) -> str:
    # 去除多余的空格和换行符
    text = re.sub(r'\s+', ' ', text.strip())
    # 替换英文标点为标准形式
    text = re.sub(r'[!]', '!', text)      # 统一感叹号
    text = re.sub(r'[.]', '.', text)      # 统一句号
    text = re.sub(r'[\,]', ',', text)      # 统一逗号
    text = re.sub(r'[;]', ';', text)      # 统一分号
    text = re.sub(r'["]', '', text)    # 去除引号
    text = re.sub(r'[^a-zA-Z0-9\s.,;!?]', '', text)  # 保留英文字符、标点和阿拉伯数字
    return text

with open('data/en/data.json', 'r') as f:
    data = json.load(f)
    
for news in tqdm(data):
    news['content'] = wash_en(news['content'])
    
with open('data/en/washed.json', 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

## 分词

### 中文分词
使用jieba分词，全部文本储存在`data/cn/tokenized.txt`中。

In [None]:
import json
import re
import jieba
from tqdm import tqdm

def tokenize_cn(news: str) -> list[list[str]]:
    sentences = re.split(r'[。！？]', news)
    sentences = [s.strip() for s in sentences if s.strip()]
    return [jieba.lcut(s.strip()) for s in sentences]

sentences: list[list[str]] = []

with open('data/cn/washed.json', 'r') as f:
    data = json.load(f)
    
for news in tqdm(data):
    sentences.extend(tokenize_cn(news['content']))
    
with open('data/cn/tokenized.txt', 'w') as f:
    sentences = [' '.join(sentence) + '\n' for sentence in sentences]
    f.writelines(sentences)

### 英文分词

In [None]:
import json
import re
from tqdm import tqdm

def tokenize_en(news: str) -> list[list[str]]:
    sentences = re.split(r'[.?!]', news)
    sentences = [s.strip().replace(',', ',') for s in sentences if s.strip()]
    return [s.split() for s in sentences]

sentences: list[list[str]] = []

with open('data/en/washed.json', 'r') as f:
    data = json.load(f)
    
for news in tqdm(data):
    sentences.extend(tokenize_cn(news['content']))
    
with open('data/en/tokenized.txt', 'w') as f:
    sentences = [' '.join(sentence) + '\n' for sentence in sentences]
    f.writelines(sentences)

# 提取日期

In [None]:
import re
from tqdm import tqdm

def extract_date(text: str) -> list[str]:
    # 使用非捕获组 (?:) 避免捕获分隔符，如 "年", "月", "日"
    patterns = [
        r'\d{4}(?:年|-|/)\d{1,2}(?:月|-|/)\d{1,2}(?:日)?',  # 完整日期：YYYY-MM-DD
        r'\d{4}(?:年|-|/)\d{1,2}(?:月)?',                    # 年月：YYYY-MM
        r'\d{1,2}(?:月|-|/)\d{1,2}(?:日)?'                   # 月日：MM-DD
    ]
    
    dates = []
    for pattern in patterns:
        matches = re.findall(pattern, text)
        dates.extend(matches)
    return dates

with open('data/cn/data.json', 'r') as f:
    data = json.load(f)
    tokenized_results = tokenize_cn(data)
    
dates = []
for news in tqdm(data):
    dates.extend(extract_date(news['content']))

with open('data/cn/dates.json', 'w') as f:
    json.dump(dates, f, ensure_ascii=False, indent=4)

# 验证Chef定律

In [None]:
# Import necessary modules
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Load the tokenized JSON files for Chinese and English words
cn_file_path = 'data/cn/tokenized.txt'
en_file_path = 'data/en/tokenized.txt'

with open(cn_file_path, 'r', encoding='utf-8') as cn_file:
    cn_words = cn_file.read().split()

with open(en_file_path, 'r', encoding='utf-8') as en_file:
    en_words = en_file.read().split()

# Count the frequency of each word
cn_word_freq = Counter(cn_words)
en_word_freq = Counter(en_words)

# Sort the word frequencies in descending order
sorted_cn_freq = sorted(cn_word_freq.items(), key=lambda x: x[1], reverse=True)
sorted_en_freq = sorted(en_word_freq.items(), key=lambda x: x[1], reverse=True)

# Get the rank (position) and frequency for Chinese and English words
cn_ranks = np.arange(1, len(sorted_cn_freq) + 1)
cn_frequencies = np.array([freq for word, freq in sorted_cn_freq])

en_ranks = np.arange(1, len(sorted_en_freq) + 1)
en_frequencies = np.array([freq for word, freq in sorted_en_freq])

# Convert rank and frequency to log scale
log_cn_ranks = np.log10(cn_ranks)
log_cn_frequencies = np.log10(cn_frequencies)

log_en_ranks = np.log10(en_ranks)
log_en_frequencies = np.log10(en_frequencies)

# Fit a linear model (for log-transformed data)
cn_fit = np.polyfit(log_cn_ranks, log_cn_frequencies, 1)
en_fit = np.polyfit(log_en_ranks, log_en_frequencies, 1)

# Generate the fitted lines
fitted_cn_frequencies = cn_fit[0] * log_cn_ranks + cn_fit[1]
fitted_en_frequencies = en_fit[0] * log_en_ranks + en_fit[1]

# Plotting log-log data and linear fits on a regular linear scale
plt.figure(figsize=(10, 5))

# Chinese words subplot
plt.subplot(1, 2, 1)
plt.plot(log_cn_ranks, log_cn_frequencies, label='Original Data')
plt.plot(log_cn_ranks, fitted_cn_frequencies, linestyle='--', label=f'Fit: slope={cn_fit[0]:.2f}')
plt.title("Log-Log Plot - CN", fontsize=14)
plt.xlabel("Log Rank (base 10)", fontsize=12)
plt.ylabel("Log Frequency (base 10)", fontsize=12)
plt.xticks(np.arange(int(min(log_cn_ranks)), int(max(log_cn_ranks)) + 1, 1))  # Set x-axis ticks
plt.yticks(np.arange(int(min(log_cn_frequencies)), int(max(log_cn_frequencies)) + 1, 1))  # Set y-axis ticks
plt.legend()
plt.grid(True)

# English words subplot
plt.subplot(1, 2, 2)
plt.plot(log_en_ranks, log_en_frequencies, label='Original Data')
plt.plot(log_en_ranks, fitted_en_frequencies, linestyle='--', label=f'Fit: slope={en_fit[0]:.2f}')
plt.title("Log-Log Plot - EN", fontsize=14)
plt.xlabel("Log Rank (base 10)", fontsize=12)
plt.ylabel("Log Frequency (base 10)", fontsize=12)
plt.xticks(np.arange(int(min(log_en_ranks)), int(max(log_en_ranks)) + 1, 1))  # Set x-axis ticks
plt.yticks(np.arange(int(min(log_en_frequencies)), int(max(log_en_frequencies)) + 1, 1))  # Set y-axis ticks
plt.legend()
plt.grid(True)

# Display the plots
plt.tight_layout()
plt.show()


## CBOW

## 加载数据集

In [None]:
from model.cbow import CBOW
from model.data import CBOWDataSet, CBOWDataLoader

# Load the tokenized data
cn_tokenized_file_path = 'data/cn/tokenized.txt'
dataset = CBOWDataSet(cn_tokenized_file_path, window_size=5)
dataset.save('data/cn/dataset.json')

In [1]:
from model.data import CBOWDataSet, CBOWDataLoader

cn_tokenized_file_path = 'data/cn/dataset.json'
dataset = CBOWDataSet(cn_tokenized_file_path)
train_loader, valid_loader, test_loader = dataset.partition(
    batch_size=64,
    ratio=(0.8, 0.1, 0.1),
    neg_size=100
)

Generating coordinates: 100%|██████████| 214011/214011 [00:00<00:00, 290763.73it/s]


Shuffling coordinates...
Partitioning dataset...


In [12]:
import torch
import torch.nn as nn
from model.cbow import CBOW
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

def compute_accuracy(output, targets):
    """
    计算准确率。
    :param output: 模型的输出 (batch_size, vocab_size)
    :param targets: 真实的目标 (batch_size,)
    :return: 准确率
    """
    _, predicted = torch.max(output, dim=1)  # 获取概率最大的类别
    correct = (predicted == targets).sum().item()  # 计算预测正确的个数
    accuracy = correct / targets.size(0)  # 计算准确率
    return accuracy

def train(model, 
          train_loader: CBOWDataLoader, 
          valid_loader: CBOWDataLoader, 
          epochs=10, 
          lr=0.0001,
          device='cpu',
          log_dir='./runs/experiment'):  # log_dir 是 TensorBoard 日志目录

    # 初始化 TensorBoard 记录器
    writer = SummaryWriter(log_dir)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = CBOW.loss
    
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_accuracy = 0
        losses = []
        
        for batch_idx, pair in enumerate(tqdm(train_loader)):
            pair.to(device)
            optimizer.zero_grad()

            # 前向传播
            output = model.forward(pair.bags)

            # 计算损失
            loss = criterion(output, pair.targets, pair.negatives)
            loss.backward()

            # 更新模型参数
            optimizer.step()

            # 记录损失
            total_loss += loss.item()
            losses.append(loss.item())
            
            # 计算并记录准确率
            accuracy = compute_accuracy(output, pair.targets)
            total_accuracy += accuracy

            # 每50个批次打印一次训练损失
            if batch_idx % 50 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                avg_accuracy = total_accuracy / (batch_idx + 1)
                print(f'Epoch {epoch+1}/{epochs}, Step {batch_idx}, Loss: {avg_loss:.4f}, Accuracy: {avg_accuracy:.4f}')

                # 使用 TensorBoard 记录损失和准确率
                writer.add_scalar('Training Loss', avg_loss, epoch * len(train_loader) + batch_idx)
                writer.add_scalar('Training Accuracy', avg_accuracy, epoch * len(train_loader) + batch_idx)

        # 模型验证
        model.eval()
        with torch.no_grad():
            total_val_loss = 0
            total_val_accuracy = 0

            for pair in valid_loader:
                pair.to(device)
                output = model(pair.bags)
                val_loss = criterion(output, pair.targets, pair.negatives)

                total_val_loss += val_loss.item()

                # 计算并记录验证准确率
                val_accuracy = compute_accuracy(output, pair.targets)
                total_val_accuracy += val_accuracy

            avg_val_loss = total_val_loss / len(valid_loader)
            avg_val_accuracy = total_val_accuracy / len(valid_loader)
            print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}')

            # 使用 TensorBoard 记录验证集的损失和准确率
            writer.add_scalar('Validation Loss', avg_val_loss, epoch)
            writer.add_scalar('Validation Accuracy', avg_val_accuracy, epoch)
            
    writer.close()

In [13]:
model = CBOW(len(dataset.vocab), 100)
train(model, train_loader, valid_loader, epochs=10, lr=0.001, device='cuda')

  0%|          | 0/68578 [00:00<?, ?it/s]

  0%|          | 3/68578 [00:00<1:44:36, 10.93it/s]

Epoch 1/10, Step 0, Loss: 55.2366, Accuracy: 0.0000


  0%|          | 70/68578 [00:00<10:13, 111.71it/s]

Epoch 1/10, Step 50, Loss: 51.6302, Accuracy: 0.0000


  0%|          | 122/68578 [00:01<09:24, 121.33it/s]

Epoch 1/10, Step 100, Loss: 51.0631, Accuracy: 0.0000


  0%|          | 174/68578 [00:01<09:04, 125.54it/s]

Epoch 1/10, Step 150, Loss: 50.6205, Accuracy: 0.0000


  0%|          | 214/68578 [00:02<08:54, 127.80it/s]

Epoch 1/10, Step 200, Loss: 50.5440, Accuracy: 0.0000


  0%|          | 268/68578 [00:02<08:53, 128.01it/s]

Epoch 1/10, Step 250, Loss: 50.2541, Accuracy: 0.0000


  0%|          | 320/68578 [00:02<09:15, 122.91it/s]

Epoch 1/10, Step 300, Loss: nan, Accuracy: 0.0000


  1%|          | 372/68578 [00:03<09:13, 123.29it/s]

Epoch 1/10, Step 350, Loss: nan, Accuracy: 0.0000


  1%|          | 424/68578 [00:03<09:15, 122.71it/s]

Epoch 1/10, Step 400, Loss: nan, Accuracy: 0.0000


  1%|          | 476/68578 [00:04<09:04, 125.05it/s]

Epoch 1/10, Step 450, Loss: nan, Accuracy: 0.0000


  1%|          | 515/68578 [00:04<09:01, 125.79it/s]

Epoch 1/10, Step 500, Loss: nan, Accuracy: 0.0000


  1%|          | 567/68578 [00:04<09:03, 125.15it/s]

Epoch 1/10, Step 550, Loss: nan, Accuracy: 0.0000


  1%|          | 619/68578 [00:05<08:55, 126.85it/s]

Epoch 1/10, Step 600, Loss: nan, Accuracy: 0.0000


  1%|          | 671/68578 [00:05<09:14, 122.51it/s]

Epoch 1/10, Step 650, Loss: nan, Accuracy: 0.0000


  1%|          | 723/68578 [00:06<09:06, 124.08it/s]

Epoch 1/10, Step 700, Loss: nan, Accuracy: 0.0000


  1%|          | 775/68578 [00:06<09:01, 125.23it/s]

Epoch 1/10, Step 750, Loss: nan, Accuracy: 0.0000


  1%|          | 814/68578 [00:06<09:04, 124.36it/s]

Epoch 1/10, Step 800, Loss: nan, Accuracy: 0.0000


  1%|▏         | 866/68578 [00:07<09:09, 123.24it/s]

Epoch 1/10, Step 850, Loss: nan, Accuracy: 0.0000


  1%|▏         | 918/68578 [00:07<09:00, 125.28it/s]

Epoch 1/10, Step 900, Loss: nan, Accuracy: 0.0000


  1%|▏         | 930/68578 [00:07<09:26, 119.35it/s]


KeyboardInterrupt: 