# 爬虫

## 基于requests的单线程爬虫

In [None]:
from news_crawler.crawler_requests import NewsCrawler
# Run the requests-based crawler for the 'cn' language and store its output under data/cn.
# NOTE(review): the second argument (100) is presumably the number of articles to
# fetch — confirm against NewsCrawler's signature.
crawler = NewsCrawler('cn', 100)
crawler.crawl()
crawler.save_data('data/cn')

In [None]:
from news_crawler.crawler_requests import NewsCrawler
# Run the requests-based crawler for the 'en' language.
# NOTE(review): the second argument (100) is presumably the number of articles to
# fetch — confirm against NewsCrawler's signature.
crawler = NewsCrawler('en', 100)
crawler.crawl()
# Save under data/en: the original wrote to data/cn, which would overwrite the Chinese
# crawl, while the cleaning step reads the English corpus from data/en/data.json.
crawler.save_data('data/en')

## 基于Scrapy的并发爬虫

Scrapy 是一个非常强大的 Python 框架，用于 Web 爬虫和数据抓取。它可以轻松地爬取网站上的数据，并将其存储在所需的格式中（如 CSV、JSON 或数据库）。

首先在项目根目录下创建名为`news_crawler`的Scrapy爬虫项目：

```bash
scrapy startproject news_crawler
```

用`genspider`生成一个爬虫模板（下面的示例命令生成名为`example`的爬虫），稍后按本实验的需求修改；后文实际运行的爬虫名为`news_spider`：

```bash
scrapy genspider example quotes.toscrape.com
```

### 爬取中文数据
通过Scrapy框架运行爬虫：

```bash
cd news_crawler
scrapy crawl news_spider -s CLOSESPIDER_ITEMCOUNT=10000 -s OUTPUT_DIR="../../data/cn" -a language="cn" -a start_keyword="1"
```

### 爬取英文数据
同样通过Scrapy框架运行爬虫，只需将语言参数换成`en`、输出目录换成`data/en`即可：

```bash
cd news_crawler
scrapy crawl news_spider -s CLOSESPIDER_ITEMCOUNT=20000 -s OUTPUT_DIR="../../data/en" -a language="en" -a start_keyword="1"
```

```
2024-09-25 12:52:45 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8349774,
 'downloader/request_count': 23042,
 'downloader/request_method_count/GET': 23042,
 'downloader/response_bytes': 74200410,
 'downloader/response_count': 23042,
 'downloader/response_status_count/200': 23042,
 'dupefilter/filtered': 2200,
 'elapsed_time_seconds': 267.455448,
 'finish_reason': 'closespider_itemcount',
 'finish_time': datetime.datetime(2024, 9, 25, 4, 52, 45, 644444, tzinfo=datetime.timezone.utc),
 'httpcompression/response_bytes': 217377512,
 'httpcompression/response_count': 22651,
 'item_scraped_count': 20025,
 'log_count/DEBUG': 43073,
 'log_count/INFO': 23444,
 'memusage/max': 168902656,
 'memusage/startup': 74584064,
 'offsite/domains': 1,
 'offsite/filtered': 1,
 'request_depth_max': 47,
 'response_received_count': 23042,
 'scheduler/dequeued': 23042,
 'scheduler/dequeued/memory': 23042,
 'scheduler/enqueued': 27585,
 'scheduler/enqueued/memory': 27585,
 'start_time': datetime.datetime(2024, 9, 25, 4, 48, 18, 188996, tzinfo=datetime.timezone.utc)}
2024-09-25 12:52:45 [scrapy.core.engine] INFO: Spider closed (closespider_itemcount)
```

## 数据处理

### 去除乱码

In [None]:
import json
from tqdm import tqdm
from utils.cleaning import clean_cn, clean_en

# Clean the raw crawl output of both corpora (cn, en) and write the result to
# washed.json next to the input file.
#
# Files are opened explicitly as UTF-8: json.dump(..., ensure_ascii=False) emits raw
# non-ASCII characters and washed.json is read back elsewhere with encoding='utf-8',
# so relying on the platform default encoding can fail or corrupt text.
# NOTE(review): assumes the crawler wrote data.json as UTF-8 (or pure ASCII) — confirm.
for lang, clean in (('cn', clean_cn), ('en', clean_en)):
    with open(f'data/{lang}/data.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Each record is a dict; only its 'content' field is rewritten in place.
    for news in tqdm(data):
        news['content'] = clean(news['content'])
    with open(f'data/{lang}/washed.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

### 分词

#### 中文分词
使用jieba分词，全部文本储存在`data/cn/tokenized.txt`中。

In [None]:
import json
from tqdm import tqdm
from utils.tokenization import tokenize_cn, tokenize_en

# cn
# Tokenize every cleaned Chinese article and write one space-joined sentence per line
# to data/cn/tokenized.txt.
sentences: list[list[str]] = []
# Explicit UTF-8: the tokenized file is read back with encoding='utf-8', so the
# platform default encoding must not be relied upon.
with open('data/cn/washed.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for news in tqdm(data):
    # NOTE(review): min_len=8 presumably drops sentences shorter than 8 tokens —
    # confirm in utils.tokenization.
    sentences.extend(tokenize_cn(news['content'], min_len=8))
# Total number of tokens over all kept sentences.
data_size = sum(len(sentence) for sentence in sentences)
print(f"cn data size: {data_size}")
with open('data/cn/tokenized.txt', 'w', encoding='utf-8') as f:
    # Stream the lines instead of rebinding `sentences` to a list of strings, which
    # contradicted its list[list[str]] annotation.
    f.writelines(' '.join(sentence) + '\n' for sentence in sentences)

#### 英文分词

In [None]:
# en
# Tokenize every cleaned English article and write one space-joined sentence per line
# to data/en/tokenized.txt. This cell has no imports of its own: json, tqdm and
# tokenize_en must already be in scope from an earlier cell.
sentences: list[list[str]] = []
# Explicit UTF-8: the tokenized file is read back with encoding='utf-8', so the
# platform default encoding must not be relied upon.
with open('data/en/washed.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for news in tqdm(data):
    # NOTE(review): min_len=8 presumably drops sentences shorter than 8 tokens —
    # confirm in utils.tokenization.
    sentences.extend(tokenize_en(news['content'], min_len=8))
# Total number of tokens over all kept sentences.
data_size = sum(len(sentence) for sentence in sentences)
print(f"en data size: {data_size}")
with open('data/en/tokenized.txt', 'w', encoding='utf-8') as f:
    # Stream the lines instead of rebinding `sentences` to a list of strings, which
    # contradicted its list[list[str]] annotation.
    f.writelines(' '.join(sentence) + '\n' for sentence in sentences)

### 提取日期

In [None]:
import json
from tqdm import tqdm
from utils.date import extract_date

# Collect every date that extract_date finds in the cleaned Chinese articles and dump
# them to results/dates.json.
# Both files are opened explicitly as UTF-8: the input holds Chinese text and the
# output is written with ensure_ascii=False, so the platform default encoding must
# not be relied upon.
with open('data/cn/washed.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

dates = []
for news in tqdm(data):
    # extract_date returns an iterable of matches per article; they are flattened
    # into a single list.
    dates.extend(extract_date(news['content']))

# NOTE(review): the results/ directory must already exist — nothing here creates it.
with open('results/dates.json', 'w', encoding='utf-8') as f:
    json.dump(dates, f, ensure_ascii=False, indent=4)

### 验证Zipf定律

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Zipf's-law check: plot log10(frequency) against log10(rank) for the Chinese and
# English corpora side by side. Inputs are the whitespace-separated token files
# produced by the tokenization step (plain text, not JSON). Only the raw curves are
# drawn; no line is fitted.
cn_file_path = 'data/cn/tokenized.txt'
en_file_path = 'data/en/tokenized.txt'


def _log_rank_frequency(path):
    """Return (log10 ranks, log10 frequencies) for the tokens stored in *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        tokens = handle.read().split()
    # most_common() sorts by descending count and keeps first-seen order among ties.
    counts = np.array([count for _, count in Counter(tokens).most_common()])
    ranks = np.arange(1, len(counts) + 1)
    return np.log10(ranks), np.log10(counts)


def _draw_panel(position, log_ranks, log_freqs, title):
    """Draw one rank/frequency curve in column *position* of a 1x2 grid."""
    plt.subplot(1, 2, position)
    plt.plot(log_ranks, log_freqs, label='Original Data')
    plt.title(title, fontsize=14)
    plt.xlabel("Log Rank (base 10)", fontsize=12)
    plt.ylabel("Log Frequency (base 10)", fontsize=12)
    # Integer ticks spanning the data range on both axes.
    plt.xticks(np.arange(int(min(log_ranks)), int(max(log_ranks)) + 1, 1))
    plt.yticks(np.arange(int(min(log_freqs)), int(max(log_freqs)) + 1, 1))
    plt.legend()
    plt.grid(True)


# Read and count both corpora before opening the figure.
curves = [
    ("Log-Log Plot - CN", *_log_rank_frequency(cn_file_path)),
    ("Log-Log Plot - EN", *_log_rank_frequency(en_file_path)),
]

plt.figure(figsize=(10, 5))
for position, (title, log_ranks, log_freqs) in enumerate(curves, start=1):
    _draw_panel(position, log_ranks, log_freqs, title)

plt.tight_layout()
plt.show()

### 分析不同主题下的词频差异

In [None]:
from utils.wordfreq_viz import plot_wordcloud
from utils.tokenization import tokenize_cn
import json
from tqdm import tqdm

# Compare word usage across news sites: group the cleaned Chinese articles by their
# 'site' field, tokenize each group, drop stop words, and draw one word cloud per site.
with open('data/cn/washed.json', 'r', encoding='utf-8') as f:
    news = json.load(f)

with open('data/cn_stopwords.txt', 'r', encoding='utf-8') as f:
    # A set makes each membership test O(1); with a list, filtering every token of
    # every article scanned the whole stop-word list each time.
    stop_words = set(f.read().split())
# Date unit characters (year / month / day) are treated as stop words as well.
stop_words.update(['年', '月', '日'])

site2news: dict[str, list[str]] = {}
for n in tqdm(news, desc='Classifying news by site'):
    site2news.setdefault(n['site'], []).append(n['content'].strip())

# Keep only sites with more than 10 and fewer than 1000 articles.
site2news = {site: contents for site, contents in site2news.items() if 10 < len(contents) < 1000}
site2words: dict[str, list[str]] = {}
for site, contents in site2news.items():
    words = []
    for content in tqdm(contents, desc=f'Tokenizing {site}'):
        # tokenize_cn returns tokenized sentences; flatten them and filter stop words
        # in the same pass.
        words.extend(
            word
            for sentence in tokenize_cn(content)
            for word in sentence
            if word not in stop_words
        )
    site2words[site] = words

for site, words in site2words.items():
    plot_wordcloud(words, site)

# Word2Vec

#### 训练对照模型

中文

In [None]:
from models.dataset import CBOWDataSet
from models.cbow import CBOW
from models.word2vec import Word2Vec
from utils.plot import plot_curves

# Chinese baseline: CBOW with window 5, a vocabulary capped at 5,000 and
# 128-dimensional embeddings. dataset, vocab and the two loaders stay at notebook
# level because later experiment cells reuse them.
dataset = CBOWDataSet('data/cn/tokenized.txt', window_size=5, max_vocab=5_000)
vocab = dataset.vocab
train_loader, test_loader = dataset.partition(batch_size=512, neg_size=16, ratio=0.9)

model = CBOW(len(vocab), 128)
word2vec = Word2Vec(model, vocab)

# Train with logging, save the weights, evaluate on the held-out loader, then plot the
# logged training curves.
word2vec.train(
    train_loader,
    epochs=8,
    lr=1e-3,
    log_dir='logs/cn_baseline/',
)
word2vec.save('weights/cn_baseline/')
word2vec.test(test_loader)
plot_curves(
    'logs/cn_baseline/',
    ['Training Loss', 'Training Accuracy'],
)

英文

In [None]:
from models.dataset import CBOWDataSet
from models.cbow import CBOW
from models.word2vec import Word2Vec
from utils.plot import plot_curves

# English baseline. The training steps below are commented out, so as written this cell
# only re-plots the curves from logs already present in logs/en_baseline/.
# NOTE(review): presumably disabled to avoid retraining; re-enable the lines to
# regenerate logs/en_baseline/ and weights/en_baseline/ from scratch. Doing so also
# rebinds the notebook-level dataset / vocab / loaders to the English corpus.
# dataset = CBOWDataSet(
#     'data/en/tokenized.txt',
#     window_size=5,
#     max_vocab=5_000
# )
# vocab = dataset.vocab
# train_loader, test_loader = dataset.partition(
#     batch_size=512,
#     neg_size=16,
#     ratio=0.9,
# )
# model = CBOW(len(vocab), 128)
# word2vec = Word2Vec(model, vocab)
# word2vec.train(train_loader, epochs=8, lr=1e-3, log_dir='logs/en_baseline/')
# word2vec.save('weights/en_baseline/')
# word2vec.test(test_loader)
plot_curves('logs/en_baseline/', ['Training Loss', 'Training Accuracy'])

In [None]:
import random
from models.word2vec import Word2Vec

# Qualitative check of the Chinese baseline: print the nearest neighbours of five
# hand-picked words followed by 15 words sampled at random from the vocabulary.
word2vec = Word2Vec.load('weights/cn_baseline/')

to_find_neighbors = ['主席', '暴雨', '党', '深化', '进口']
to_find_neighbors += random.sample(word2vec.vocab.vocab, 15)

for word in to_find_neighbors:
    print(
        f"Neighbors of {word}: {word2vec.nearest(word)}"
    )

In [None]:
import random
from models.word2vec import Word2Vec

# Qualitative check of the English baseline: print the nearest neighbours of five
# hand-picked words followed by 15 words sampled at random from the vocabulary.
word2vec = Word2Vec.load('weights/en_baseline/')

to_find_neighbors = ['president', 'great', 'storm', 'deepen', 'import']
to_find_neighbors += random.sample(word2vec.vocab.vocab, 15)

for word in to_find_neighbors:
    print(
        f"Neighbors of {word}: {word2vec.nearest(word)}"
    )

#### 是否使用log-sigmoid作为loss的影响

In [None]:
from models.cbow import CBOW
from models.word2vec import Word2Vec
from models.dataset import CBOWDataSet
from utils.plot import plot_curves

# Shared data for the loss-function experiment: Chinese corpus, window 5, vocabulary
# capped at 5,000. test_logsigmoid reads vocab and the loaders from notebook scope.
dataset = CBOWDataSet('data/cn/tokenized.txt', window_size=5, max_vocab=5_000)
vocab = dataset.vocab
train_loader, test_loader = dataset.partition(batch_size=512, neg_size=16, ratio=0.9)


def test_logsigmoid(use_logsigmoid: bool):
    """Train, evaluate and save one CBOW model with the given use_logsigmoid setting.

    Logs go to logs/cn_logsigmoid/ and weights to weights/cn_logsigmoid/<flag>/.
    """
    trainer = Word2Vec(CBOW(len(vocab), 128), vocab)
    trainer.train(
        train_loader,
        epochs=8,
        lr=1e-3,
        log_dir='logs/cn_logsigmoid/',
        use_logsigmoid=use_logsigmoid,
    )
    trainer.test(test_loader)
    trainer.save(f'weights/cn_logsigmoid/{use_logsigmoid}/')


# NOTE(review): only the False setting is run here; the True run was presumably done
# in an earlier session — confirm before comparing curves.
for use_logsigmoid in [False]:
    test_logsigmoid(use_logsigmoid)

plot_curves('logs/cn_logsigmoid/', ['Training Accuracy', 'Training Loss'])

#### 不同词表大小的影响

In [None]:
from models.cbow import CBOW
from models.word2vec import Word2Vec
from models.dataset import CBOWDataSet
from utils.plot import plot_curves

def test_vocab_size(vocab_size: int):
    """Train, evaluate and save one CBOW model with the vocabulary capped at vocab_size.

    The dataset is rebuilt for every call because the cap is a dataset parameter.
    Logs go to logs/cn_vocab_size/ and weights to weights/cn_vocab_size/<vocab_size>/.
    """
    corpus = CBOWDataSet('data/cn/tokenized.txt', window_size=5, max_vocab=vocab_size)
    words = corpus.vocab
    train_split, test_split = corpus.partition(batch_size=512, neg_size=16, ratio=0.9)

    trainer = Word2Vec(CBOW(len(words), 128), words)
    trainer.train(train_split, epochs=8, lr=1e-3, log_dir='logs/cn_vocab_size/')
    trainer.test(test_split)
    trainer.save(f'weights/cn_vocab_size/{vocab_size}/')


# Sweep the vocabulary cap, then plot the logged training curves.
for vocab_size in [2_500, 5_000, 10_000, 20_000]:
    test_vocab_size(vocab_size)

plot_curves('logs/cn_vocab_size/', ['Training Loss', 'Training Accuracy'])

#### 不同窗口大小的影响

In [None]:
from models.cbow import CBOW
from models.word2vec import Word2Vec
from models.dataset import CBOWDataSet
from utils.plot import plot_curves

def test_window_size(window_size: int):
    """Train, evaluate and save one CBOW model with the given context window size.

    The dataset is rebuilt for every call because the window is a dataset parameter.
    Logs go to logs/cn_window_size/ and weights to weights/cn_window_size/<window_size>/.
    """
    corpus = CBOWDataSet('data/cn/tokenized.txt', window_size=window_size, max_vocab=5_000)
    words = corpus.vocab
    train_split, test_split = corpus.partition(batch_size=512, neg_size=16, ratio=0.9)

    trainer = Word2Vec(CBOW(len(words), 128), words)
    trainer.train(train_split, epochs=8, lr=1e-3, log_dir='logs/cn_window_size/')
    trainer.test(test_split)
    trainer.save(f'weights/cn_window_size/{window_size}/')


# Sweep the window size, then plot the logged training curves.
for window_size in [1, 3, 5, 7]:
    test_window_size(window_size)

plot_curves('logs/cn_window_size/', ['Training Loss', 'Training Accuracy'])

#### 不同负采样大小的影响

In [None]:
from models.cbow import CBOW
from models.word2vec import Word2Vec
from models.dataset import CBOWDataSet
from utils.plot import plot_curves

def test_neg_size(neg_size: int):
    """Train, evaluate and save one CBOW model using neg_size negative samples.

    The loaders are re-partitioned on every call so that neg_size actually reaches
    training. Previously they were built once with neg_size=16 outside this function,
    so every run in the sweep trained on the same configuration and only the save
    path differed.

    Logs go to logs/cn_neg_size/ and weights to weights/cn_neg_size/<neg_size>/.
    """
    train_loader, test_loader = dataset.partition(
        batch_size=512,
        neg_size=neg_size,
        ratio=0.9,
    )
    model = CBOW(len(vocab), 128)
    word2vec = Word2Vec(model, vocab)
    word2vec.train(train_loader, epochs=8, lr=1e-3, log_dir='logs/cn_neg_size/')
    word2vec.test(test_loader)
    word2vec.save(f'weights/cn_neg_size/{neg_size}/')


# Build the dataset in this cell rather than relying on a `dataset` variable left over
# from an earlier cell, so the experiment can be run on its own.
dataset = CBOWDataSet('data/cn/tokenized.txt', window_size=5, max_vocab=5_000)
vocab = dataset.vocab

# Sweep the number of negative samples, then plot the logged training curves.
for neg_size in [4, 8, 16, 32]:
    test_neg_size(neg_size)

plot_curves('logs/cn_neg_size/', ['Training Loss', 'Training Accuracy'])