# 爬虫

## 基于requests的单线程爬虫

In [None]:
from news_crawler.crawler_requests import NewsCrawler
crawler = NewsCrawler('cn', 100)
crawler.crawl()
crawler.save_data('data/cn')

In [None]:
from news_crawler.crawler_requests import NewsCrawler
crawler = NewsCrawler('en', 100)
crawler.crawl()
crawler.save_data('data/cn')

## 基于Scrapy的并发爬虫

Scrapy 是一个非常强大的 Python 框架，用于 Web 爬虫和数据抓取。它可以轻松地爬取网站上的数据，并将其存储在所需的格式中（如 CSV、JSON 或数据库）。

首先在项目根目录下创建名为`news_crawler`的Scrapy爬虫项目

```bash
scrapy startproject news_crawler
```

生成一个爬虫模板，稍后按本实验的需求修改：

```bash
scrapy genspider example quotes.toscrape.com
```

### 爬取中文数据
通过Scrapy框架运行爬虫：

```bash
cd news_crawler
scrapy crawl news_spider -s CLOSESPIDER_ITEMCOUNT=10000 -s OUTPUT_DIR="../../data/cn" -a language="cn" -a start_keyword="1"
```

### 爬取英文数据
通过Scrapy框架运行爬虫，只是更换语言即可：

```bash
cd news_crawler
scrapy crawl news_spider -s CLOSESPIDER_ITEMCOUNT=20000 -s OUTPUT_DIR="../../data/en" -a language="en" -a start_keyword="1"
```

```
2024-09-25 12:52:45 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8349774,
 'downloader/request_count': 23042,
 'downloader/request_method_count/GET': 23042,
 'downloader/response_bytes': 74200410,
 'downloader/response_count': 23042,
 'downloader/response_status_count/200': 23042,
 'dupefilter/filtered': 2200,
 'elapsed_time_seconds': 267.455448,
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2024, 9, 25, 4, 52, 45, 644444, tzinfo=datetime.timezone.utc),
 'httpcompression/response_bytes': 217377512,
 'httpcompression/response_count': 22651,
 'item_scraped_count': 20025,
 'log_count/DEBUG': 43073,
 'log_count/INFO': 23444,
 'memusage/max': 168902656,
 'memusage/startup': 74584064,
 'offsite/domains': 1,
 'offsite/filtered': 1,
 'request_depth_max': 47,
 'response_received_count': 23042,
 'scheduler/dequeued': 23042,
 'scheduler/dequeued/memory': 23042,
 'scheduler/enqueued': 27585,
 'scheduler/enqueued/memory': 27585,
 'start_time': datetime.datetime(2024, 9, 25, 4, 48, 18, 188996, tzinfo=datetime.timezone.utc)}
2024-09-25 12:52:45 [scrapy.core.engine] INFO: Spider closed (closespider_itemcount)
```

# 数据处理

## 去除乱码

In [None]:
import json
from tqdm import tqdm
from utils.cleaning import clean_cn, clean_en

# cn
with open('data/cn/data.json', 'r') as f:
    data = json.load(f)
for news in tqdm(data):
    news['content'] = clean_cn(news['content'])
with open('data/cn/washed.json', 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
    
# en
with open('data/en/data.json', 'r') as f:
    data = json.load(f)
for news in tqdm(data):
    news['content'] = clean_en(news['content'])
with open('data/en/washed.json', 'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

## 分词

### 中文分词
使用jieba分词，全部文本储存在`data/cn/tokenized.txt`中。

In [None]:
import json
from tqdm import tqdm
from utils.tokenization import tokenize_cn, tokenize_en

# cn
sentences: list[list[str]] = []
with open('data/cn/washed.json', 'r') as f:
    data = json.load(f)
for news in tqdm(data):
    sentences.extend(tokenize_cn(news['content'], min_len=8))
data_size = sum([len(sentence) for sentence in sentences])
print(f"cn data size: {data_size}")
with open('data/cn/tokenized.txt', 'w') as f:
    sentences = [' '.join(sentence) + '\n' for sentence in sentences]
    f.writelines(sentences)

### 英文分词

In [None]:
# en
sentences: list[list[str]] = []
with open('data/en/washed.json', 'r') as f:
    data = json.load(f)
for news in tqdm(data):
    sentences.extend(tokenize_en(news['content'], min_len=8))
data_size = sum([len(sentence) for sentence in sentences])
print(f"en data size: {data_size}")
with open('data/en/tokenized.txt', 'w') as f:
    sentences = [' '.join(sentence) + '\n' for sentence in sentences]
    f.writelines(sentences)

# 提取日期

In [None]:
from tqdm import tqdm
from utils.date import extract_date
    
dates = []
for news in tqdm(data):
    dates.extend(extract_date(news['content']))

with open('data/cn/dates.json', 'w') as f:
    json.dump(dates, f, ensure_ascii=False, indent=4)

# 验证Chef定律

In [None]:
# Import necessary modules
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Load the tokenized JSON files for Chinese and English words
cn_file_path = 'data/cn/tokenized.txt'
en_file_path = 'data/en/tokenized.txt'

with open(cn_file_path, 'r', encoding='utf-8') as cn_file:
    cn_words = cn_file.read().split()

with open(en_file_path, 'r', encoding='utf-8') as en_file:
    en_words = en_file.read().split()

# Count the frequency of each word
cn_word_freq = Counter(cn_words)
en_word_freq = Counter(en_words)

# Sort the word frequencies in descending order
sorted_cn_freq = sorted(cn_word_freq.items(), key=lambda x: x[1], reverse=True)
sorted_en_freq = sorted(en_word_freq.items(), key=lambda x: x[1], reverse=True)

# Get the rank (position) and frequency for Chinese and English words
cn_ranks = np.arange(1, len(sorted_cn_freq) + 1)
cn_frequencies = np.array([freq for word, freq in sorted_cn_freq])

en_ranks = np.arange(1, len(sorted_en_freq) + 1)
en_frequencies = np.array([freq for word, freq in sorted_en_freq])

# Convert rank and frequency to log scale
log_cn_ranks = np.log10(cn_ranks)
log_cn_frequencies = np.log10(cn_frequencies)

log_en_ranks = np.log10(en_ranks)
log_en_frequencies = np.log10(en_frequencies)

# Fit a linear model (for log-transformed data)
cn_fit = np.polyfit(log_cn_ranks, log_cn_frequencies, 1)
en_fit = np.polyfit(log_en_ranks, log_en_frequencies, 1)

# Generate the fitted lines
fitted_cn_frequencies = cn_fit[0] * log_cn_ranks + cn_fit[1]
fitted_en_frequencies = en_fit[0] * log_en_ranks + en_fit[1]

# Plotting log-log data and linear fits on a regular linear scale
plt.figure(figsize=(10, 5))

# Chinese words subplot
plt.subplot(1, 2, 1)
plt.plot(log_cn_ranks, log_cn_frequencies, label='Original Data')
plt.plot(log_cn_ranks, fitted_cn_frequencies, linestyle='--', label=f'Fit: slope={cn_fit[0]:.2f}')
plt.title("Log-Log Plot - CN", fontsize=14)
plt.xlabel("Log Rank (base 10)", fontsize=12)
plt.ylabel("Log Frequency (base 10)", fontsize=12)
plt.xticks(np.arange(int(min(log_cn_ranks)), int(max(log_cn_ranks)) + 1, 1))  # Set x-axis ticks
plt.yticks(np.arange(int(min(log_cn_frequencies)), int(max(log_cn_frequencies)) + 1, 1))  # Set y-axis ticks
plt.legend()
plt.grid(True)

# English words subplot
plt.subplot(1, 2, 2)
plt.plot(log_en_ranks, log_en_frequencies, label='Original Data')
plt.plot(log_en_ranks, fitted_en_frequencies, linestyle='--', label=f'Fit: slope={en_fit[0]:.2f}')
plt.title("Log-Log Plot - EN", fontsize=14)
plt.xlabel("Log Rank (base 10)", fontsize=12)
plt.ylabel("Log Frequency (base 10)", fontsize=12)
plt.xticks(np.arange(int(min(log_en_ranks)), int(max(log_en_ranks)) + 1, 1))  # Set x-axis ticks
plt.yticks(np.arange(int(min(log_en_frequencies)), int(max(log_en_frequencies)) + 1, 1))  # Set y-axis ticks
plt.legend()
plt.grid(True)

# Display the plots
plt.tight_layout()
plt.show()


## CBOW

## 加载数据集

In [2]:
from models.dataset import CBOWDataSet

# Load the tokenized data
cn_tokenized_file_path = 'data/en/tokenized.txt'
dataset = CBOWDataSet(cn_tokenized_file_path, window_size=5, min_count=10)
dataset.save('data/en/dataset.json')
dataset.vocab.save('data/en/vocab.pth')
print(f"Dataset size: {len(dataset)}")

Processing data: 100%|██████████| 193071/193071 [00:00<00:00, 385238.22it/s]
Traversing words: 100%|██████████| 6773154/6773154 [00:01<00:00, 4128751.43it/s]


Total unique words: 73019
Sorting words based on frequency...
Filtering words based on min_count...
Building word2idx mapping...
Building frequency list...


Converting sentences to indices: 100%|██████████| 193071/193071 [00:15<00:00, 12575.79it/s]
Generating coordinates: 100%|██████████| 193071/193071 [00:00<00:00, 412200.33it/s]


Dataset size: 4842444


In [1]:
from models.dataset import CBOWDataSet

cn_tokenized_file_path = 'data/en/dataset.json'
dataset = CBOWDataSet(cn_tokenized_file_path)
print(f"Dataset size: {len(dataset)}")

Generating coordinates: 100%|██████████| 193071/193071 [00:00<00:00, 283967.44it/s]

Dataset size: 4842444





In [2]:
from models.vocab import Vocabulary
vocab = Vocabulary.load('data/en/vocab.pth')
print(f"Vocab size: {len(vocab.vocab)}")

Vocab size: 19091


  checkpoint = torch.load(path)


In [3]:
from models.cbow import CBOW
from models.word2vec import Word2Vec

model = CBOW(len(vocab), 128)
word2vec = Word2Vec(model, vocab)

In [5]:
from models.dataset import CBOWDataLoader

train_loader, test_loader = dataset.partition(
    batch_size=512,
    neg_size=16,
)
word2vec.train(train_loader, epochs=1, lr=1e-3)



Shuffling coordinates...
Partitioning dataset...
Shuffling coordinates...


Epoch 1/1: 100%|██████████| 7660/7660 [01:42<00:00, 74.78it/s]
Validation: 100%|██████████| 851/851 [00:08<00:00, 96.04it/s] 


Validation Loss: 0.1791, Validation Accuracy: 0.0237


RuntimeError: [enforce fail at inline_container.cc:642] . invalid file name: weights/en/

In [6]:
word2vec.save('weights/en/')

RuntimeError: [enforce fail at inline_container.cc:642] . invalid file name: weights/en/

In [7]:
print(vocab[model.nearest(vocab['good'])])

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)