###　　神社情報スクレイピグ

In [1]:
# requests機能をインポートします。
#アクセス先をREQUEST_URLを代入します。
#リクエストしたデータをresに代入します。
# resの文字データがISO-8859-1なので、utf-8に変換して文字化けを防止します。
import requests, os, time, json
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import urllib.parse
# ここから張機能をインポート

ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
headers = {"User-Agent": ua}

REQUEST_URL = 'https://en.wikipedia.org/wiki/List_of_Shinto_shrines_in_Japan'


In [2]:
# Create the URL for the current page
url = REQUEST_URL

# Send an HTTP request to the page
res = requests.get(url, headers=headers)
res.encoding = "utf-8"

soup = BeautifulSoup(res.text, 'html.parser')

href_list = []
title_list = []

# Iterate through each <a> tag
for a_tag in soup.find_all('a'):
    # Extract href and title attributes
    href_attribute = a_tag.get('href')
    title_attribute = a_tag.get('title')

    # Append to lists
    href_list.append(href_attribute)
    title_list.append(title_attribute)

# Create a dataframe
df = pd.DataFrame({'Href': href_list, 'Title': title_list})

df['Latitude'] = None
df['Longitude'] = None

# Filter rows containing "神" or "宮" or "稲荷" or "社" in the "Title" column

# Print or use the filtered DataFrame as needed
df
df.to_csv("Shrine_en_df.csv",index=True)

In [3]:
for index, row in df.iterrows():
    href_url = 'https://en.wikipedia.org' + row['Href']
    attempt_count = 0
    
    print(f"Processing {href_url}...")  # 処理開始をログ出力
    
    while attempt_count < 1:
        try:
            # URLにリクエストを送る
            response = requests.get(href_url, headers=headers)
            response.raise_for_status()  # 不正なレスポンスに対してHTTPErrorを発生させる

            print("Response received successfully.")

            # HTMLコンテンツを解析
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # 緯度と経度を抽出（もし利用可能なら）
            latitude_element = soup.find('span', {'class': 'latitude'})
            longitude_element = soup.find('span', {'class': 'longitude'})
            
            # DataFrameを緯度と経度で更新
            if latitude_element is not None:
                df.at[index, 'Latitude'] = latitude_element.text

            if longitude_element is not None:
                df.at[index, 'Longitude'] = longitude_element.text

            # すべての<p>要素を抽出して'Content'カラムに追加
            paragraphs = soup.find_all('p')
            content = ' '.join([para.get_text() for para in paragraphs])
            df.at[index, 'Content'] = content

            # 緯度と経度を含むDataFrameを表示
            print(df[['Href', 'Title', 'Latitude', 'Longitude', 'Content']])
            break

        except requests.exceptions.HTTPError as errh:
            print("HTTP Error:", errh)
        except requests.exceptions.ConnectionError as errc:
            print("Error Connecting:", errc)
            time.sleep(5)  # 5秒待ってからリトライ
        except requests.exceptions.Timeout as errt:
            print("Timeout Error:", errt)
        except requests.exceptions.RequestException as err:
            print("Error:", err)
        
        attempt_count += 1
        print(f"Attempt {attempt_count} failed. Retrying...")  # リトライをログ出力

    if attempt_count == 1:
        print("Max retries reached for URL:", href_url)

    print(f"Finished processing {href_url}.")  # 処理終了をログ出力

df

Processing https://en.wikipedia.org#bodyContent...
Response received successfully.
                                                   Href  \
0                                          #bodyContent   
1                                       /wiki/Main_Page   
2                              /wiki/Wikipedia:Contents   
3                           /wiki/Portal:Current_events   
4                                  /wiki/Special:Random   
...                                                 ...   
1066     https://stats.wikimedia.org/#/en.wikipedia.org   
1067  https://foundation.wikimedia.org/wiki/Special:...   
1068  //en.m.wikipedia.org/w/index.php?title=List_of...   
1069                   https://wikimediafoundation.org/   
1070                         https://www.mediawiki.org/   

                                      Title Latitude Longitude  \
0                                      None     None      None   
1                   Visit the main page [z]     None      None   
2         

Unnamed: 0,Href,Title,Latitude,Longitude,Content
0,#bodyContent,,,,Fleetwood Park was a 19th-century American har...
1,/wiki/Main_Page,Visit the main page [z],,,Fleetwood Park was a 19th-century American har...
2,/wiki/Wikipedia:Contents,Guides to browsing Wikipedia,,,\n Explore the vast knowledge of Wikipedia thr...
3,/wiki/Portal:Current_events,Articles related to current events,,,Edit instructions\n Armed conflicts and attack...
4,/wiki/Special:Random,Visit a randomly selected article [x],,,\n Paramiella kondoi is a species of a land sn...
...,...,...,...,...,...
1066,https://stats.wikimedia.org/#/en.wikipedia.org,,,,
1067,https://foundation.wikimedia.org/wiki/Special:...,,,,
1068,//en.m.wikipedia.org/w/index.php?title=List_of...,,,,
1069,https://wikimediafoundation.org/,,,,


In [4]:
import sqlite3
import pandas as pd

# データベースファイルへの接続を作成します。ファイルが存在しない場合は新しく作成されます。
conn = sqlite3.connect('trial.db')

# filtered_df を 'temples_table' という名前のテーブルに保存します。
# if_exists='replace' は、もしテーブルが既に存在する場合は置き換えることを意味します。
# index=False は、DataFrame のインデックスをテーブルに保存しないことを意味します。
df.to_sql('temples_eng_table', conn, if_exists='replace', index=False)

# 接続を閉じます。
conn.close()

In [5]:
# location_df = filtered_df.dropna(subset=['Latitude']).reset_index(drop=True)

# # Function to convert Japanese coordinates to universal coordinates
# def convert_japanese_coordinates(japanese_coordinate):
#     print(f"Processing: {japanese_coordinate}")
#     cleaned_coordinate = (
#         japanese_coordinate
#         .replace('北緯', '')
#         .replace('南緯', '-')
#         .replace('東経', '')
#         .replace('度', '.')
#         .replace('分', '')
#         .replace('秒', '')
#     )
#     components = [float(comp) for comp in cleaned_coordinate.split('.')]
    
#     print(f"Components: {components}")
    
#     decimal_degrees = components[0] + components[1] / 60 + components[2] / 3600
#     return decimal_degrees


# # Apply the conversion function to the "Latitude" and "Longitude" columns
# location_df['Latitude'] = location_df['Latitude'].apply(convert_japanese_coordinates)
# location_df['Longitude'] = location_df['Longitude'].apply(convert_japanese_coordinates)


# # Print or use the filtered DataFrame as needed
# location_df