# Scraping: Web to VDB (YJ Topics)

<div class="alert alert-block alert-danger">
<b>Notice:</b> Webサイトからスクレイピングで情報を収集するため <b>過度な呼び出しは厳禁</b>
</div>

## 0. 事前準備

### 共通処理/定数定義

In [None]:
import myconstant
from mylib.MyBanner import MyBanner

In [None]:
# Yahoo! JAPAN News topic feeds to crawl: category key -> RSS feed URL.
# Every URL has the form https://news.yahoo.co.jp/rss/topics/<feed name>.xml.
# Commented-out pairs are categories that are currently disabled.
RSS_LIST = {
    category: f'https://news.yahoo.co.jp/rss/topics/{feed_name}.xml'
    for category, feed_name in (
        ('top-topics', 'top-picks'),  # top stories
        ('business', 'business'),  # business / economy
        ('entertainment', 'entertainment'),  # entertainment
        ('sports', 'sports'),  # sports
        # ('domestic', 'domestic'),  # domestic
        # ('world', 'world'),  # world
        ('it', 'it'),  # IT
        # ('science', 'science'),  # science
    )
}

# Collection name passed to milvus.from_documents() when inserting the chunks.
COLLECTION_NAME = 'yj_topics_data'

### パッケージインストール

In [None]:
# Install the third-party packages used by the later cells.
# MyBanner.start()/finish() bracket the cell (project helper; its behavior is
# not shown in this notebook).
MyBanner.start()

# feedparser: parses the RSS feeds; langchain_community: provides
# WebBaseLoader; tiktoken: needed by the tiktoken-based text splitter.
# NOTE(review): `!pip` runs pip in a subshell, which is not guaranteed to be
# the running kernel's environment; IPython's `%pip` magic is - consider
# switching.
!pip install feedparser
!pip install langchain_community
!pip install tiktoken 

MyBanner.finish()

### import

In [None]:
# Load the libraries used by the crawling/scraping and chunking cells.
MyBanner.start()

# Standard library
import time

# Third-party
import feedparser
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

MyBanner.finish()

## 1. データ収集

### Crawling & Scraping

In [None]:
MyBanner.start()

# Pause (in seconds) after every HTTP request so the site is not hit in
# rapid succession.
SLEEP_SEC = 0.25

# Step 1: gather the detail-page URLs from each topic RSS feed.
# page_list maps category key -> list of article URLs found in that feed.
page_list = {}
for key, rss_url in RSS_LIST.items():
    print(f"{key=}: {rss_url=}")
    rss = feedparser.parse(rss_url)
    pages = []
    for entry in rss.entries:
        # The article URL is the entry's comments link with the trailing
        # '/comments' removed. Entries without a comments link are printed
        # with None and not collected.
        pg_url = None
        if hasattr(entry, 'comments'):
            pg_url = entry.comments.removesuffix('/comments')
            pages.append(pg_url)
        print(entry.title, pg_url)
    page_list[key] = pages
    time.sleep(SLEEP_SEC)

print(f"{page_list=}" + "\n" + ("/" * 40))

# Step 2: scrape the detail pages.
# The same article URL may be listed in more than one feed. Fetch each URL
# only once so the site receives no redundant requests and the resulting
# document list (later inserted into the vector DB) holds no duplicates.
docs = []
seen_urls = set()
for pages in page_list.values():
    for pg_url in pages:
        if pg_url in seen_urls:
            continue
        seen_urls.add(pg_url)
        docs.extend(WebBaseLoader(pg_url).load())
        time.sleep(SLEEP_SEC)

print(f"{len(docs)=}" + "\n" + ("/" * 40))
print(f"{docs[:2]=}")

MyBanner.finish()

### Chunking

In [None]:
MyBanner.start()

# Split the documents in two passes: first on newlines, then on full-width
# spaces (U+3000). Each pass re-splits the output of the previous pass.
# NOTE: from_tiktoken_encoder measures chunk_size and chunk_overlap in
# tiktoken tokens, not in characters.
for separator in ("\n", "　"):
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        separator=separator,
        chunk_size=1000,
        chunk_overlap=100,
    )
    doc_splits = text_splitter.split_documents(docs)
    docs = doc_splits

print(f"{len(docs)=}" + "\n" + ("/" * 40))
print(f"{doc_splits[:5]=}")

MyBanner.finish()

## 2. ベクトル化 (Vectorization)

### Embeddingモデルの取得

In [None]:
# Obtain the embedding model used when inserting into the vector DB.
MyBanner.start()
# Project-local helper, imported in the cell that uses it.
from mylib.MyEmbedding import MyEmbedding

# No vectors are computed here: this only fetches the model object, which is
# later passed to MyMilvus together with the connection settings.
embeddings = MyEmbedding.get_model()
print(f"{embeddings=}")

MyBanner.finish()

### EmbeddingとVDB挿入

In [None]:
MyBanner.start()

from mylib.MyMilvus import MyMilvus

# Build the project's Milvus wrapper from the connection settings in
# myconstant and the embedding model obtained earlier.
milvus = MyMilvus(
    myconstant.VDB_HOST,
    myconstant.VDB_PORT,
    myconstant.VDB_USER,
    myconstant.VDB_PASS,
    embeddings,
)
# NOTE(review): if MyMilvus's repr includes the credentials, this print
# exposes them in the notebook output - confirm.
print(f"{milvus=}")

# Hand the chunked documents to the wrapper, which embeds them and inserts
# them into the COLLECTION_NAME collection.
vector_db = milvus.from_documents(docs, COLLECTION_NAME)

# Summary of what was inserted and where.
print(f"{len(docs)=}")
print(f"{COLLECTION_NAME=}")
print(f"{vector_db=}")

MyBanner.finish()