In [1]:
%pip install langchain langchain-community boto3 tqdm faiss-cpu fastwarc bs4 sentence-transformers jieba zh-sentence stanfordnlp nltk

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import os
import io
from pathlib import Path

import boto3
import re
import jieba
import faiss
import chardet
import nltk
import multiprocessing
from zh_sentence.tokenizer import tokenize
from botocore.client import Config
from tqdm import tqdm
from bs4 import BeautifulSoup
from langchain_core.documents.base import Document

from langchain_community.document_loaders import BSHTMLLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

from fastwarc.stream_io import FileStream, GZipStream
from fastwarc.warc import ArchiveIterator, WarcRecordType

from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertModel, BertTokenizer
import torch
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import faiss
import numpy as np
import stanfordnlp



In [3]:
# Initialize NLP pipelines
# stanfordnlp.download('zh')
# stanfordnlp.download('en')
nlp_zh = stanfordnlp.Pipeline(lang="zh", processors="tokenize")
nlp_en = stanfordnlp.Pipeline(lang="en", processors="tokenize")

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

nltk.download('wordnet')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/jupyter/stanfordnlp_resources/zh_gsd_models/zh_gsd_tokenizer.pt', 'lang': 'zh', 'shorthand': 'zh_gsd', 'mode': 'predict'}
Done loading processors!
---
Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/jupyter/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# bucket path
bucket_name = "china-warc-archieves"

# bucket creds
access_id = " "
access_secret = " "

session = boto3.session.Session()

s3 = session.client(
    service_name="s3",
    endpoint_url="https://storage.yandexcloud.net",
    aws_access_key_id=access_id,
    aws_secret_access_key=access_secret,
)

def list_files(bucket_name, s3_client, prefix = ""):
    for key in s3.list_objects(Bucket=bucket_name)["Contents"]:
        if key["Key"].endswith(".warc.gz"):
            yield key["Key"]
        continue


files = list(list_files(bucket_name = bucket_name, s3_client = s3))
print(files)

['cn.reuters.com-2017-01.warc.gz', 'cn.reuters.com-2017-02.warc.gz', 'cn.reuters.com-2017-03.warc.gz', 'cn.reuters.com-2017-04.warc.gz', 'finance.people.com.cn-2019-08.warc.gz', 'finance.people.com.cn-2019-09.warc.gz', 'finance.people.com.cn-2022-04.warc.gz', 'finance.people.com.cn-2022-05.warc.gz', 'finance.people.com.cn-2023-04.warc.gz', 'finance.people.com.cn-2023-06.warc.gz', 'finance.people.com.cn-2023-07.warc.gz', 'health.people.com.cn-2019-08.warc.gz', 'health.people.com.cn-2019-09.warc.gz', 'health.people.com.cn-2023-04.warc.gz', 'health.people.com.cn-2023-05.warc.gz', 'health.people.com.cn-2023-07.warc.gz', 'inews.hket.com-2022-04.warc.gz', 'inews.hket.com-2022-05.warc.gz', 'inews.hket.com-2022-06.warc.gz', 'inews.hket.com-2023-03.warc.gz', 'inews.hket.com-2023-06.warc.gz', 'inews.hket.com-2023-07.warc.gz', 'news.ltn.com.tw-2016-08.warc.gz', 'news.ltn.com.tw-2016-09.warc.gz', 'news.ltn.com.tw-2016-12.warc.gz', 'news.ltn.com.tw-2017-02.warc.gz', 'news.ltn.com.tw-2017-03.warc.gz

In [5]:
# Function to segment Chinese text
def segment_chinese(text):
    sentences = nlp_zh(text)
    segmented_sentences = ["".join([token.text for token in sentence.tokens]) for sentence in sentences.sentences]
    return segmented_sentences

# Function to preprocess English text
def segment_english(text):
    sentences = nlp_en(text)
    segmented_sentences = [" ".join([lemmatizer.lemmatize(token.text) for token in sentence.tokens
                                     if token.text not in stop_words]) for sentence in sentences.sentences]
    return segmented_sentences

# Function to clean and normalize text
def clean_text(text):
    text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)  # Keep alphanumeric and Chinese characters
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Function to preprocess content based on detected language
def simple_preprocessing(content):
    try:
        language = detect(content)
        if language in {'en'}:
            sentences = segment_english(content)
        else:
            sentences = segment_chinese(content)
    except Exception as ex:
        print(ex)
        sentences = segment_chinese(content)
        
    cleaned_sentences = [clean_text(sentence) for sentence in sentences]
    return cleaned_sentences

In [6]:
def encode_raw_content(encoding, raw_content):
    encoding = encoding if encoding else chardet.detect(raw_content)["encoding"]
    html_content = (
        raw_content.decode(encoding if encoding else "utf-8", errors="ignore")
        .encode("utf-8", errors="ignore")
        .decode("utf-8", errors="ignore")
    )
    return html_content

def retrieve_main_text_info(text):
    soup = BeautifulSoup(text, "html.parser")
    content = soup.find_all(['h1', 'p'])
    text = [item.text for item in content]
    content = ' '.join(text)
    return content

def extract_html_from_warc(warc_file_path):
    response = s3.get_object(Bucket=bucket_name, Key=warc_file_path)
    server_stream = response
    archieve_stream = GZipStream(io.BytesIO(server_stream["Body"].read()))
    
    docs_transformed = []
    sentences = []
    
    for record in ArchiveIterator(archieve_stream):
        if not record.headers.get(
            "Content-Type"
        ) == "application/http; msgtype=response" and not "text/html" in record.http_headers.get(
            "Content-Type", ""
        ):
            continue

        payload = record.reader.read()

        html_content = encode_raw_content(record.http_charset, payload)
        year, month = record.record_date.year, record.record_date.month
        content = retrieve_main_text_info(html_content)
        print(warc_file_path, content[:12], sep = ": ", end = "\n")
        
        content = content[64:len(content) - 256]
        try:
            sentences.append(simple_preprocessing(content))
            docs_transformed.append(content)
        except Exception as ex:
            print(ex, content)
            continue
        
    return docs_transformed, sentences

In [7]:
# Initialize models and variables
vectorizer = TfidfVectorizer(max_features=10000)
bert_model = BertModel.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
word2vec_model = None

# Initialize FAISS index
d = 768  # Dimension of BERT embeddings
nlist = 50  # Number of clusters
quantizer = faiss.IndexFlatL2(d)
faiss_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

# Train FAISS index with dummy data to initialize (Replace with actual data)
faiss_index.train(np.zeros((nlist * 50, d), dtype=np.float32))

n_clusters = nlist  # Choose the number of clusters based on your dataset size
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Global mapping of FAISS indices to documents
global_doc_mapping = {}
current_index = 0

In [8]:
# Function to get BERT embeddings
def get_bert_embedding(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Function to add embeddings to FAISS index
def add_embeddings_to_faiss_index(new_embeddings, index):
    embeddings_list = []
    ids_list = []
    for cluster_id, embeddings in new_embeddings.items():
        for embedding in embeddings:
            embeddings_list.append(embedding)
            ids_list.append(cluster_id)
    if embeddings_list:
        embeddings_np = np.vstack(embeddings_list).astype(np.float32)
        ids_np = np.array(ids_list).astype(np.int64)
        index.add_with_ids(embeddings_np, ids_np)

files = ["finance.people.com.cn-2022-05.warc.gz"]

for archive in files:
    docs_transformed, sentences = extract_html_from_warc(archive)

    # TF-IDF vectorization and clustering
    tfidf_matrix = vectorizer.fit_transform(docs_transformed)
    kmeans.fit(tfidf_matrix)
    clusters = kmeans.labels_

    clustered_docs = {i: [] for i in range(n_clusters)}
    for idx, label in enumerate(clusters):
        clustered_docs[label].append(docs_transformed[idx])

    # Generate word2vec embeddings for each cluster
    cluster_embeddings = {}
    for cluster_id, docs in clustered_docs.items():
        cluster_embeddings[cluster_id] = [get_bert_embedding(doc, bert_model, tokenizer) for doc in docs]

    # Update FAISS index and save current index variable for next archives
    current_index = add_embeddings_to_faiss_index(cluster_embeddings, faiss_index)

    # Save FAISS index after processing each archive to ensure progress is not lost
    faiss.write_index(faiss_index, "faiss_index.index")

finance.people.com.cn-2022-05.warc.gz:  “五一”特别策划：长大
finance.people.com.cn-2022-05.warc.gz:  项目开足马力 纾困小微
finance.people.com.cn-2022-05.warc.gz:  卫星看中国：减碳节能的
finance.people.com.cn-2022-05.warc.gz:  稳就业促就业是献给劳动
finance.people.com.cn-2022-05.warc.gz:  中国上市公司协会：20
finance.people.com.cn-2022-05.warc.gz:  农产品消费市场迎高峰 
finance.people.com.cn-2022-05.warc.gz:  新修订的职业教育法今日
finance.people.com.cn-2022-05.warc.gz:  期盼文旅业脱困“蝶变”
finance.people.com.cn-2022-05.warc.gz:  经济日报社论：新征程的
finance.people.com.cn-2022-05.warc.gz:  数说中国丨4月份制造业
finance.people.com.cn-2022-05.warc.gz:  落实防疫举措 五一期间
finance.people.com.cn-2022-05.warc.gz:  五一劳动者说：致敬平凡
finance.people.com.cn-2022-05.warc.gz:  潘家园社区副书记：哪怕
finance.people.com.cn-2022-05.warc.gz:  火箭造型师：新时代的追
finance.people.com.cn-2022-05.warc.gz:  焦点访谈：防住疫情 稳
finance.people.com.cn-2022-05.warc.gz:  国家税务总局：持续加快
finance.people.com.cn-2022-05.warc.gz:  “五一”假期首日全国铁
finance.people.com.cn-2022-05.warc.gz:  蜂产品如何生产加工？市
finance.people.com.cn-2022-05.warc.gz:  10家行业协会联合14
finance.peop



In [18]:
# Function to determine the cluster for a query
def get_query_cluster(query, vectorizer, kmeans):
    query_tfidf = vectorizer.transform([query])
    cluster = kmeans.predict(query_tfidf)
    return cluster[0]

# Function to search FAISS index
def search_faiss_index(query_embedding, index, k=5):
    D, I = index.search(query_embedding, k)
    return I[0]

# Retrieve context for user's query
query = "中国工业"  # Replace with actual user query
query_cluster = get_query_cluster(query, vectorizer, kmeans)
query_embedding = get_bert_embedding(query, bert_model, tokenizer).astype(np.float32)
results = search_faiss_index(query_embedding, faiss_index)

print("Top documents for the query:", results)

Top documents for the query: [ 9  9 16  9 34]


In [20]:
sentences[34]

['桥建成了交通大国正在加快建设交通强国我们坚持创新引领高铁大飞机等装备制造实现重大突破新能源汽车占全球总量一半以上港珠澳大桥北京大兴国际机场等超大型交通工程建成投运交通成为中国现代化的开路先锋党的十八大以来在以习近平同志为核心的党中央坚强领导下我国交通运输事业取得辉煌成就交通运输体系不断完善服务质量持续提高为经济社会高质量发展提供了坚实支撑纵横神州联通内外综合立体交通网日趋完善掘进再掘进3月9日全长2213公里的世界最长在建高速公路隧道新疆乌尉公路天山胜利隧道中导洞累计掘进突破万米大关为日后贯通打下坚实基础总投资约761亿元的乌尉公路包项目既能拉动经济促进就业又能打通一条连接南北疆的高速路新通道为当地发展带来新契机延伸再延伸',
 '3月17日我国首条跨海高铁福厦高铁开始铺轨项目全面进入线上施工阶段作为我国八纵八横高速铁路网沿海大通道中的重要组成部分正线全长27742公里的福厦高铁建成通车后将使福州厦门泉州等城市步入一小时生活圈我国稳步推进交通基础设施建设综合立体交通网的规模体量覆盖广度和通达深度不断提升',
 '截至2021年底全国铁路营业总里程突破15万公里公路总里程约528万公里港口拥有生产用码头泊位超2万个内河航道通航里程达128万公里境内运输机场达248个快递进村比例超过80今日中国拥有全球最大的高速铁路网高速公路网世界级港口群航空航海通达全球综合交通网突破600万公里人享其行物畅其流运输服务保障能力更加强大高铁通了高速多了柏油路修到家门口山沟沟也走上了发展快车道说起近年来交通的变化',
 '贵州盘州市淤泥彝族乡岩博联村党委书记余留芬深有感触依托四通八达的交通网络当地特色农产品卖到了大江南北智能家电海鲜水产走进了农家日常生活交通先行一通百通经济发展民生改善离不开交通先行交通骨架的持续完善',
 '缩短了神州大地的时空距离提升着城乡居民的生活水平',
 '物流运输更加高效畅通水路国际运输航线覆盖100多个国家和地区中欧班列累计开行超5万列快递单日处理量最高接近7亿件集装箱铁水联运量5年平均增长20以上我国已经是世界上运输最繁忙的国家之一铁路公路水运民航货物周转量港口货物吞吐量邮政快递业务量等指标位居世界第一或跻身世界前列居民出行更加便捷舒适动车组列车承担铁路客运量比例升至70日行千里不是梦全国拥有公共汽电车约71万辆城市轨道交通运营里程约8730公里超特大