# Scraping: Web to Excel(東京リカレントナビ)

<div class="alert alert-block alert-danger">
<b>Notice:</b> Webサイトからスクレイピングで情報を収集するため、<b>過度な呼び出しは厳禁</b>です。
</div>

- https://www.recurrent-navi.metro.tokyo.lg.jp/organizer_tag/tokyoto

## 0. 事前準備

### 共通処理/定数定義

In [None]:
from mylib.MyBanner import MyBanner

# Pause between consecutive HTTP requests, in seconds (passed to time.sleep).
SLEEP_SEC = 0.5

# Listing-page URL template; "%i" is replaced with the 1-based page number.
# Alternative listing URL (organizer_tag/tokyoto):
#   "https://www.recurrent-navi.metro.tokyo.lg.jp/organizer_tag/tokyoto/page/%i"
BASE_URL = "https://www.recurrent-navi.metro.tokyo.lg.jp/course/all/page/%i?s&post_type=course"

# Number of listing pages to crawl.
PAGE_NUM = 10

# Output workbook path and the worksheet name used when saving.
EXCEL_FILE = "../data/recurrent_navi_tyo.xlsx"
EXCEL_SHEET = "my_sheet"

### パッケージインストール

In [None]:
MyBanner.start()

!python -V
!pip install pandas
!pip install openpyxl

MyBanner.finish()

### import

In [None]:
MyBanner.start()

import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

MyBanner.finish()

## 1. データ収集 & ファイル保存

### Crawling & Scraping

In [None]:
MyBanner.start()

def crawl_article_list(base_url, max_page, timeout=30.0):
    """Crawl the paginated course listing and return one dict per course card.

    Pages ``1..max_page`` are requested one after another, pausing
    ``SLEEP_SEC`` seconds (module-level constant) after each page.

    Args:
        base_url: URL template containing a single ``%i`` placeholder that is
            replaced with the 1-based page number.
        max_page: Last page number to fetch.
        timeout: Seconds to wait for each HTTP response; without it a stalled
            connection would block the crawl indefinitely.

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``date``,
        ``title``, ``url`` and ``summary``, plus one extra key per
        ``<dt>``/``<dd>`` pair found in the card's data section.
    """
    rows = []
    # Honor the max_page argument rather than the PAGE_NUM global.
    for page_no in range(1, max_page + 1):
        list_url = base_url % page_no
        response = requests.get(
            list_url, headers={'User-Agent': 'hoge'}, timeout=timeout
        )
        print(f"{response=}: {list_url=}")
        soup = BeautifulSoup(response.text, "html.parser")

        for article in soup.select("article.p-course-card"):
            # The title is an <h2>, or an <h3> when no <h2> is present.
            title_tag = article.find('h2')
            if title_tag is None:
                title_tag = article.find('h3')
            link_tag = article.find('a')
            url = link_tag.get('href') if link_tag is not None else None
            if title_tag is None or not url:
                # Skip only this malformed card; the remaining cards on the
                # page are still collected.
                continue

            time_tag = article.find('time')
            sdate = time_tag.get('datetime') if time_tag is not None else None

            category = ','.join(
                span.text.strip()
                for span in article.select('div.p-course-type > span')
            )

            desc_tag = article.select_one('div.p-course-card__desc > p')
            summary = desc_tag.text.strip() if desc_tag is not None else ""

            cols = {
                # Last path segment of the URL; a trailing slash is dropped
                # first so it does not produce an empty id.
                "id": url.rstrip("/").split("/")[-1],
                "category": category,
                "date": sdate,
                "title": title_tag.text.strip(),
                "url": url,
                "summary": summary,
            }
            # Each <dl> in the data section contributes one extra column,
            # keyed by its <dt> text.
            for item in article.select('div.p-course-card__data > dl'):
                label, value = item.find('dt'), item.find('dd')
                if label is None or value is None:
                    continue
                cols[label.text.strip()] = value.text.strip()
            rows.append(cols)

        # Throttle between listing pages to avoid hammering the site.
        time.sleep(SLEEP_SEC)
    return rows

# Crawl every listing page, then build a table indexed by the course id.
rows = crawl_article_list(BASE_URL, PAGE_NUM)
crawl_df = pd.DataFrame(rows).set_index("id")

# Report how many course cards were collected and the resulting columns.
print(f"{len(rows)=}")
crawl_df.info()

MyBanner.finish()

### Cleansing

In [None]:
MyBanner.start()

def fill_summary_text(df, timeout=30.0):
    """Replace truncated summaries with the full text from each course page.

    A summary counts as truncated when it ends with the ellipsis character
    "…". For those rows the page at ``url`` is fetched and the first
    paragraph under ``div.p-course-single-card__text`` replaces the summary.
    ``SLEEP_SEC`` seconds (module-level constant) are slept after each fetch.

    Args:
        df: DataFrame with ``summary`` and ``url`` columns; modified in place.
        timeout: Seconds to wait for each HTTP response; without it a stalled
            connection would block the run indefinitely.

    Returns:
        The same DataFrame object that was passed in.
    """
    for index, row in df.iterrows():
        summary = row['summary']
        # Empty or missing (non-string) summaries are not truncated; indexing
        # them with [-1] would raise, so test with endswith instead.
        if not isinstance(summary, str) or not summary.endswith("…"):
            continue

        url = row['url']
        response = requests.get(
            url, headers={'User-Agent': 'hoge'}, timeout=timeout
        )
        print(f"{response=}: {url=}")
        soup = BeautifulSoup(response.text, "html.parser")
        paragraph = soup.select_one("div.p-course-single-card__text > p")
        if paragraph is None:
            # Keep the truncated text instead of aborting the whole run when
            # the detail page lacks the expected paragraph.
            print(f"full summary not found, keeping truncated text: {url=}")
        else:
            summary = paragraph.text.strip()
            print(f"{summary=}")
            df.at[index, 'summary'] = summary
        # Throttle between detail-page requests.
        time.sleep(SLEEP_SEC)

    return df

from pathlib import Path

# fill_summary_text updates crawl_df in place and returns that same object,
# so creaned_df and crawl_df refer to one DataFrame.
creaned_df = fill_summary_text(crawl_df)
creaned_df.info()

# to_excel does not create missing folders; make sure the target directory
# exists so the scraped data is not lost at the final save.
Path(EXCEL_FILE).parent.mkdir(parents=True, exist_ok=True)
creaned_df.to_excel(EXCEL_FILE, sheet_name=EXCEL_SHEET)
print(f"{EXCEL_FILE=}")

MyBanner.finish()