<a href="https://colab.research.google.com/github/ShutaShimazaki/Sakae-Spring/blob/master/sakae_spring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# フェス参加アーティストのレーベル情報を取得する
## 動作


1.   フェスHPをスクレイピングし、アーティスト名とSpotifyリンク(artistID)を取得
2.   artist id →API"Get Artist'sAlbum"から1album
3.   API"Get Album"からlabels

## Spotify APIについて


*   準備：https://takagi.blog/get-and-refresh-access-tokens-for-the-spotify-web-api/

*   トークン設定：
  *   export SPOTIFY_CLIENT_ID='your_client_id_here'
  *   export SPOTIFY_CLIENT_SECRET='your_client_secret_here'
  *   export SPOTIFY_REFRESH_TOKEN='your_refresh_token_here'











In [7]:
!pip install requests beautifulsoup4
!pip install openpyxl



In [8]:
# 関数インポート準備1: Google Driveをマウント
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# 前準備2: 現在のmodules.ipynbをmodules.pyに変換
!jupyter nbconvert --to python '/content/drive/MyDrive/Colab Notebooks/modules/modules.ipynb'

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/modules/modules.ipynb to python
[NbConvertApp] Writing 2741 bytes to /content/drive/MyDrive/Colab Notebooks/modules/modules.py


In [10]:
#前準備3
#変換された.pyファイルをインポート
import sys
# 変換された.pyファイルがあるディレクトリをパスに追加
sys.path.append('/content/drive/MyDrive/Colab Notebooks/modules')

# modules.pyから関数をインポート
from modules import is_label, is_famous_artist

In [11]:
import os
import time
import requests
import base64
from google.colab import userdata
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils.dataframe import dataframe_to_rows

# スクレイピング対象のURL
base_url = 'https://sakaespring.com'
url = f'{base_url}/lineup/6.1'

# 環境変数から認証情報を取得
CLIENT_ID = userdata.get('SPOTIFY_CLIENT_ID')
CLIENT_SECRET = userdata.get('SPOTIFY_CLIENT_SECRET')
REFRESH_TOKEN = userdata.get('SPOTIFY_REFRESH_TOKEN')

def refresh_access_token(refresh_token, client_id, client_secret):
    """
    refresh_tokenを使用して新しいaccess_tokenを取得する
    """
    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + base64.b64encode(f"{client_id}:{client_secret}".encode()).decode(),
    }
    data = {
        "grant_type": "refresh_token",
        "refresh_token": refresh_token,
    }

    response = requests.post(url, headers=headers, data=data)
    response_data = response.json()

    return response_data.get("access_token")


def fetch_artist_spotify_links(url):
    session = requests.Session()
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    artist_cards = soup.find_all('div', class_='fespli-card')
    artist_info = []

    for card in artist_cards:
        artist_name = card.find('div', class_='fespli-card__title').text.strip()
        detail_link = base_url + card.find('a', class_='fespli-card__link')['href']
        detail_response = session.get(detail_link)
        detail_soup = BeautifulSoup(detail_response.text, 'html.parser')

        spotify_link_element = detail_soup.find('a', href=lambda href: href and "https://open.spotify.com/artist/" in href)
        if spotify_link_element:
            artist_spotify_link = spotify_link_element['href']
            artist_info.append((artist_name, artist_spotify_link))

    return artist_info

def extract_spotify_id(spotify_link):
    """SpotifyリンクからSpotify IDを抽出する"""
    if "?" in spotify_link:
        return spotify_link.split('artist/')[1].split('?')[0]
    else:
        return spotify_link.split('artist/')[1]

def get_latest_album_id(artist_id, access_token):
    """特定のアーティストの最新アルバムのSpotify IDを取得する"""
    url = f'https://api.spotify.com/v1/artists/{artist_id}/albums'
    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    params = {
        'market': 'US',
        'limit': 1  # 最新のアルバム（シングル）1つだけを取得
    }

    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status()  # エラー時に例外を発生させる

    albums_data = response.json()
    return albums_data['items'][0]['id']  # 最新のアルバムのIDを返す

def get_album_label(album_id, access_token):
    """アルバムIDに基づいてアルバムのレーベル情報を取得する"""
    url = f'https://api.spotify.com/v1/albums/{album_id}'
    headers = {
        'Authorization': f'Bearer {access_token}'
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()  # エラー時に例外を発生させる

    album_info = response.json()
    return album_info['label']  # アルバムのレーベル情報を返す

def main():
    access_token = refresh_access_token(REFRESH_TOKEN, CLIENT_ID, CLIENT_SECRET)
    artist_info = fetch_artist_spotify_links(url)
    results = []  # 結果を保存するためのリスト

    for artist_name, spotify_link in artist_info:
        time.sleep(1)  # サーバーに負荷をかけない
        artist_id = extract_spotify_id(spotify_link)
        try:
            album_id = get_latest_album_id(artist_id, access_token)
            album_label = get_album_label(album_id, access_token)
            is_in_label = "Yes" if is_label(album_label) else "No"
        except Exception as e:
            print(f"Error processing artist {artist_name}: {e}")
            album_label = "Error: Failed to fetch album info"
            is_in_label = "Error"

        results.append([artist_name, album_label, is_in_label, spotify_link])

    # pandas DataFrameを使用して結果を整理
    df = pd.DataFrame(results, columns=['Artist Name', 'Album Label', 'Is In Label', 'Spotify Link'])
    df_indie = df[df['Is In Label'] == "No"]  # "Is In Label"が"No"のデータのみ抽出

    # 新規Workbookを作成し、全アーティスト情報を"Artists"シートに書き込む
    wb = Workbook()
    ws1 = wb.active
    ws1.title = "Artists"
    for r in dataframe_to_rows(df, index=False, header=True):
        ws1.append(r)
    for cell in ws1['D'][1:]:  # D列（Spotify Link）にハイパーリンクを設定
        cell.hyperlink = cell.value
        cell.value = "Spotify Link"
        cell.font = Font(color='0000FF', underline='single')

    # "Indie Artists"シートを作成し、インディーズアーティスト情報を書き込む
    ws2 = wb.create_sheet(title="Indie Artists")
    for r in dataframe_to_rows(df_indie, index=False, header=True):
        ws2.append(r)
    for cell in ws2['D'][1:]:  # D列（Spotify Link）にハイパーリンクを設定
        cell.hyperlink = cell.value
        cell.value = "Spotify Link"
        cell.font = Font(color='0000FF', underline='single')

    # Excelファイルに保存
    output_dir = "./drive/MyDrive/New_artists/Excels/"
    wb.save(output_dir+'artists.xlsx')

if __name__ == '__main__':
    main()

Error processing artist Kaco: 404 Client Error: Not Found for url: https://api.spotify.com/v1/artists/5uWUJPAFWreFPks7f3wtMx//albums?market=US&limit=1


KeyboardInterrupt: 