<a href="https://colab.research.google.com/github/SEKI-YUTA/ZARD_Information_Collect/blob/main/ZARD%E7%94%BB%E5%83%8F%E3%81%A8%E3%82%A2%E3%83%AB%E3%83%90%E3%83%A0%E6%83%85%E5%A0%B1%E3%81%AE%E5%8F%8E%E9%9B%86.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# スクレイピングに必要なパッケージをインストール
!pip install requests beautifulsoup4



In [None]:
# Mount Google Drive so the notebook can read and write files under MyDrive
from google.colab import drive
drive.mount('/content/drive')

# Base folder for every file this notebook reads or writes.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory — presumably /content on Colab; confirm before running elsewhere.
zardFolderPath = "drive/MyDrive/ZARD/"
# Sub-folder of zardFolderPath that receives the downloaded jacket images
jacketFolderName = "jacket"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from os import name
from dataclasses import dataclass, field, asdict
from typing import List

@dataclass
class Track:
    """One song entry in a disc's track list.

    Every field is plain text; empty strings are used when a value is
    not known.
    """

    trackName: str    # song title
    lyrics: str       # presumably the lyricist credit rather than lyric text — TODO confirm
    composition: str  # presumably the composer credit — TODO confirm
    arrangement: str  # presumably the arranger credit — TODO confirm
    year: str

@dataclass
class Disc:
    """A single released disc together with its track list.

    The release date is stored as three separate strings (year, month,
    day of month) rather than as a date object.
    """

    id: int               # running number of the disc
    name: str             # disc title
    releaseYear: str
    releaseMonth: str
    releaseDate: str      # day of the month
    indexStr: str         # back-number label text of the disc
    discType: str         # kind of disc, e.g. "single" or "album"
    is8cm: bool           # True when the disc is an 8 cm CD
    image: str            # jacket image file name
    trackList: List[Track]
    officialPageURL: str  # URL of the disc's detail page

@dataclass
class Artist:
    """Top-level record: an artist and every disc collected for them.

    Only ``releasedDiscs`` is required; the remaining fields default to
    the values for ZARD (debut on 1991-02-10).
    """

    releasedDiscs: List[Disc]
    artistName: str = "ZARD"
    debutYear: str = "1991"
    debutMonth: str = "2"
    debutDate: str = "10"  # day of the month


@dataclass
class FileNameArguments:
  """
  Inputs used to build a jacket image file name.

  timeStr
  release date with underscores as separators.
  ex: 1991_02_10

  backNumberStr
  lower-cased back-number label with spaces removed.
  ex: 1stsingle

  indexStr
  running index label of the item. Optional: leave it empty ("") when
  the target is not a CD (ex: a book); the title is then used in the
  file name instead.
  ex: index1

  title
  title of the item. Optional: only used when indexStr is empty.
  ex: 永遠〜君と僕との間に〜
  """
  timeStr: str
  backNumberStr: str
  # Both fields below are documented as optional, so they default to an
  # empty string instead of forcing every caller to pass a placeholder.
  indexStr: str = ""
  title: str = ""


In [None]:
import datetime
import os


# Return the current local date and time as a string.
# Used when naming the JSON output file and the image output folder.
def getTimeStr():
  """Format "now" as e.g. ``2024年07月15日_05:39:42``."""
  return datetime.datetime.now().strftime('%Y年%m月%d日_%H:%M:%S')

def replaceCharForJson(str):
  """Convert the ``str()`` form of a Python object into JSON text.

  The input is expected to be the text produced by ``str(obj)`` for a
  structure of dicts, lists, strings, numbers, booleans and None. It is
  parsed back into that structure and serialised with ``json.dumps``,
  so strings that contain an apostrophe, or the words "True"/"False",
  are kept intact and ``None`` becomes ``null``.

  If the text is not a valid Python literal (or cannot be serialised),
  the function falls back to plain character replacement: single
  quotes become double quotes and True/False become true/false.

  Args:
    str: text to convert (the parameter name shadows the builtin and is
      kept only for backward compatibility).

  Returns:
    The JSON text.
  """
  # Imported locally so this function does not rely on another cell
  # having imported these modules first.
  import ast
  import json

  text = str
  try:
    # literal_eval only accepts literals, so it is safe on this input.
    # ensure_ascii=False keeps Japanese titles readable in the output.
    return json.dumps(ast.literal_eval(text), ensure_ascii=False)
  except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
    # Not a Python literal: keep the legacy best-effort replacement.
    return text.replace("\'", "\"").replace("True", "true").replace("False", "false")


# Build the file name for a jacket image
def generate_jacket_image_name(fileNameArg: FileNameArguments) -> str:
    """Return ``<timeStr>_<backNumberStr>_<suffix>.jpg``.

    The suffix is ``indexStr``; when ``indexStr`` is an empty string the
    ``title`` is used in its place.
    """
    has_index = fileNameArg.indexStr != ""
    suffix = fileNameArg.indexStr if has_index else fileNameArg.title
    return f"{fileNameArg.timeStr}_{fileNameArg.backNumberStr}_{suffix}.jpg"


# Create the output folder for this run (folderName holds a timestamp string)
def create_folder():
  """Create ``<zardFolderPath><jacketFolderName>/<folderName>``.

  Reads the globals ``zardFolderPath``, ``jacketFolderName`` and
  ``folderName``. Prints whether the folder was created or was already
  there; an existing folder is not treated as an error.
  """
  targetDir = "/".join([zardFolderPath + jacketFolderName, folderName])
  try:
    os.makedirs(targetDir)
  except FileExistsError:
    # The folder is already there
    print(f"Folder '{folderName}' already exists.")
  else:
    print(f"Folder '{folderName}' created successfully.")



In [None]:
# Timestamp captured once for this run; create_folder() and the jacket
# download cell both read this global to name the output folder.
folderName = getTimeStr()

In [None]:
# For collecting jacket images

import requests as re
from bs4 import BeautifulSoup

# CSS classes / tag names used to locate elements on the listing pages
imageClass: str = "img-fluid"
cardClass: str = "disco-list"
titleClass: str = "news-heading"
backNumberClass: str = "sub-title"
timeTag: str = "time"
categoryClass: str = "category"
baseURL: str = "https://wezard.net/discography/page/"
maxPage = 13
currentPage = 1

# Every image URL that was found, in processing order
imgURLArr = []

# Browser-like User-Agent sent with the listing page requests
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3864.0 Safari/53"
}

# Seconds to wait for the server; without a timeout a stalled connection
# would block the cell forever
requestTimeout = 30

create_folder()

count = 0
# Page loop: from the last page down to page 1
# (presumably the site lists newest first, so this yields oldest first)
for i in range(maxPage, 0, -1):
  res = re.get(baseURL + str(i), headers=header, timeout=requestTimeout)
  if not res.ok:
    # An error page contains no cards; report it instead of skipping silently
    print(f"skip page {i}: HTTP {res.status_code}")
    continue
  soup = BeautifulSoup(res.text, "html.parser")
  artCards = soup.find_all(True, attrs={'class': cardClass})

  # Card loop within the page, last card first
  for item in reversed(artCards):
    # Counted before filtering, so non-CD cards also consume an index
    count += 1
    imgElem = item.find("img", attrs={'class': imageClass})
    category = item.find(True, attrs={'class': categoryClass}).text.lower()

    # Only singles and albums are collected
    if category not in ("single", "album"):
      continue

    # The title is used in file names, so slashes are replaced with underscores
    title = item.find(True, attrs={'class': titleClass}).text.replace("/", "_").lower()
    timeStr = item.find(timeTag).attrs['datetime'].replace("-", "_")
    backNumberItem = item.find(True, attrs={'class': backNumberClass})
    backNumberStr = ""
    if backNumberItem is not None:
      backNumberStr = backNumberItem.text.replace(" ", "").lower()

    if imgElem is None:
      # The card has no image with the expected class
      continue

    imgURL = imgElem['src']
    imgURLArr.append(imgURL)
    response = re.get(imgURL, timeout=requestTimeout)
    if not response.ok:
      # Do not save an error page under a .jpg name
      print(f"skip image: HTTP {response.status_code} for {imgURL}")
      continue

    fileNameArg = FileNameArguments(
        timeStr=timeStr,
        backNumberStr=backNumberStr,
        indexStr="index" + str(count),
        title=title
    )
    filename = generate_jacket_image_name(fileNameArg)

    print("filename: " + filename)
    saveFilePath = zardFolderPath + jacketFolderName + "/" + folderName + "/" + filename
    with open(saveFilePath, 'wb') as file:
      file.write(response.content)


# For checking the image URLs
# for url in imgURLArr:
#   print(url)

Folder '2024年07月15日_05:39:42' created successfully.
filename: 1991_02_10_1stsingle_index1.jpg
filename: 1991_03_27_1stalbum_index2.jpg
filename: 1991_06_25_2ndsingle_index3.jpg
filename: 1991_11_06_3rdsingle_index4.jpg
filename: 1991_12_25_2ndalbum_index5.jpg
filename: 1992_08_05_4thsingle_index6.jpg
filename: 1992_09_02_3rdalbum_index7.jpg
filename: 1992_09_09_5thsingle_index8.jpg
filename: 1993_01_27_6thsingle_index9.jpg
filename: 1993_04_21_7thsingle_index10.jpg
filename: 1993_05_19_8thsingle_index11.jpg
filename: 1993_07_10_4thalbum_index12.jpg
filename: 1993_09_04_9thsingle_index13.jpg
filename: 1993_11_03_10thsingle_index14.jpg
filename: 1994_02_02_11thsingle_index15.jpg
filename: 1994_06_04_5thalbum_index16.jpg
filename: 1994_08_08_12thsingle_index17.jpg
filename: 1994_12_24_13thsingle_index18.jpg
filename: 1995_02_01_14thsingle_index19.jpg
filename: 1995_03_10_6thalbum_index20.jpg
filename: 1995_06_05_15thsingle_index21.jpg
filename: 1995_08_28_16thsingle_index22.jpg
filename: 

In [None]:
# Test cell: checks that the JSON file can be parsed into the dataclass.
# It serves no other purpose.
import json

filePath = 'drive/MyDrive/ZARD/injectionData/info.json'
fileContent = ""
# Read explicitly as UTF-8 (the notebook writes its JSON output as UTF-8)
# so the result does not depend on the platform's default encoding.
with open(filePath, "r", encoding="UTF-8") as reader:
  fileContent = reader.read()

jsonObj = json.loads(fileContent)
# Only the top level becomes a dataclass; nested values such as
# releasedDiscs stay exactly as json.loads produced them.
artist = Artist(**jsonObj)

print(artist.artistName)
print(artist.debutYear)
print(artist.debutMonth)


ZARD
1991
2


In [None]:
# Depends on getTimeStr
# For writing out JSON (partially reuses previously saved data)
import json
import requests as re
from bs4 import BeautifulSoup

filePath = 'drive/MyDrive/ZARD/injectionData/info.json'
fileContent = ""
# Read explicitly as UTF-8 (the export below writes UTF-8) so the result
# does not depend on the platform's default encoding.
with open(filePath, "r", encoding="UTF-8") as reader:
  fileContent = reader.read()

jsonObj = json.loads(fileContent)
# Only the top level becomes a dataclass; releasedDiscs stays a list of
# plain dicts as produced by json.loads.
artist = Artist(**jsonObj)

# Previously saved discs, used as the source of track lists
injectDiscList = artist.releasedDiscs
print("disc count: " + str(len(injectDiscList)))

def findDisc(name, releaseYear, releaseMonth, releaseDate, discType):
  """Find a previously saved disc in the global ``injectDiscList``.

  Entries loaded from JSON are plain dicts, so fields are read by key;
  entries that are objects (e.g. Disc instances) are read by attribute.

  Args:
    name: disc title.
    releaseYear: release year as stored in the entry.
    releaseMonth: release month as stored in the entry.
    releaseDate: day of the month as stored in the entry.
    discType: disc type as stored in the entry (e.g. "single", "album").

  Returns:
    The first entry whose five fields all equal the arguments, or None
    when there is no such entry.
  """
  missing = object()  # sentinel: an absent field never equals an argument
  wanted = {
    "name": name,
    "releaseYear": releaseYear,
    "releaseMonth": releaseMonth,
    "releaseDate": releaseDate,
    "discType": discType,
  }

  def readField(entry, key):
    if isinstance(entry, dict):
      return entry.get(key, missing)
    return getattr(entry, key, missing)

  for d in injectDiscList:
    if all(readField(d, key) == value for key, value in wanted.items()):
      return d
  return None

def getYearMonthDate(str):
  """Split text such as "1991/02/10 Release" into (year, month, day).

  Only the part before the first space is used; it is split on "/" and
  the first three pieces are returned as strings. Raises IndexError when
  that part has fewer than three pieces.
  """
  datePart, _, _ = str.partition(" ")
  pieces = datePart.split("/")
  return pieces[0], pieces[1], pieces[2]


# json is used for the final serialisation step of this script
import json

# CSS classes / tag names used to locate elements on the listing pages
cardClass = "disco-list"
discTypeClass = "category"
titleClass = "news-heading"
backNumberClass = "sub-title"
timeTag = "time"
baseURL = "https://wezard.net/discography/page/"
startPage = 13
endPage = 1
# Number of discs kept so far; also the expected position in injectDiscList
discCount = 0

# The site changes its layout (and CSS classes) depending on the device,
# which breaks the lookups below, so a fixed User-Agent is specified.
header = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3864.0 Safari/53"
}

# Seconds to wait for the server; without a timeout a stalled connection
# would block the cell forever
requestTimeout = 30

discList = []


def _injected_field(entry, key):
  """Read `key` from a saved disc, which may be a dict or an object."""
  if isinstance(entry, dict):
    return entry.get(key)
  return getattr(entry, key, None)


def _is_same_disc(entry, name, year, month, date, discType):
  """Return True when a saved disc matches the scraped title, date and type."""
  return (
    _injected_field(entry, 'name') == name
    and _injected_field(entry, 'releaseYear') == year
    and _injected_field(entry, 'releaseMonth') == month
    and _injected_field(entry, 'releaseDate') == date
    and _injected_field(entry, 'discType') == discType
  )


count = 0
# All listing pages, from startPage down to endPage
for page in range(startPage, endPage - 1, -1):
  res = re.get(baseURL + str(page), headers=header, timeout=requestTimeout)
  if not res.ok:
    # An error page contains no cards; report it instead of skipping silently
    print(f"skip page {page}: HTTP {res.status_code}")
    continue
  listPage = BeautifulSoup(res.text, "html.parser")
  artCards = listPage.find_all(True, attrs={'class': cardClass})

  # One card (one release), last card on the page first
  for item in reversed(artCards):
    # Counted before filtering, so non-CD cards also consume a number
    count += 1
    # URL of the detail page
    detailURL = item.find("a").attrs["href"]
    # single or album
    discType = item.find(True, attrs={'class': discTypeClass}).text.lower()
    print(discType)
    # Only CDs are wanted here, so anything other than single/album is skipped
    if discType not in ("single", "album"):
      continue

    titleText = item.find(True, attrs={'class': titleClass}).text
    # ex: HOLD ME (full-width spaces are normalised to ASCII spaces)
    discName = titleText.replace("　", " ")
    # Nearly the same as discName, but this value is meant for file names
    # of non-CD items (books etc.), so "/" is replaced with "_"
    title = titleText.replace("/", "_").lower()

    timeElem = item.find(timeTag)
    timeStr = timeElem.attrs['datetime'].replace("-", "_")
    backNumberItem = item.find(True, attrs={'class': backNumberClass})
    # The back-number label may be absent; fall back to empty text so the
    # Disc can still be built
    backNumberText = backNumberItem.text if backNumberItem is not None else ""
    backNumberStr = backNumberText.replace(" ", "").lower()

    fileNameArg = FileNameArguments(
        timeStr=timeStr,
        backNumberStr=backNumberStr,
        indexStr="index" + str(count),
        title=title
    )
    filename = generate_jacket_image_name(fileNameArg)

    releaseDate = timeElem.text
    print(releaseDate)
    year, month, date = getYearMonthDate(releaseDate)

    # For now only releases up to and including 2007 are collected
    if int(year) > 2007:
      continue

    # 8 cm CD: a single released on or before 1999-10-14. The three parts
    # are compared together as one date; comparing month and day separately
    # would misclassify e.g. 1999/06/16.
    is8cm = discType == "single" and (int(year), int(month), int(date)) <= (1999, 10, 14)

    print(detailURL)
    print(discType)

    # Tracks of this disc
    trackList = []

    # dd: saved disc at the same position (matches as long as the site
    # order has not changed since the data was saved)
    dd = injectDiscList[discCount] if discCount < len(injectDiscList) else None
    if dd is not None and _is_same_disc(dd, discName, year, month, date, discType):
      # Same order as the saved data: reuse its track list directly
      trackList = _injected_field(dd, 'trackList') or []
    else:
      # Order differs: report the mismatch, then search the saved data
      print("データを探す")
      if dd is not None:
        print(f"{_injected_field(dd, 'name')} {discName}")
        print(f"{_injected_field(dd, 'releaseYear')} {year}")
        print(f"{_injected_field(dd, 'releaseMonth')} {month}")
        print(f"{_injected_field(dd, 'releaseDate')} {date}")
        print(f"{_injected_field(dd, 'discType')} {discType}")
      found = next(
        (e for e in injectDiscList if _is_same_disc(e, discName, year, month, date, discType)),
        None,
      )
      if found is not None:
        trackList = _injected_field(found, 'trackList') or []

    # All tracks for this disc are expected to be collected by this point
    disc = Disc(count, discName, year, month, date, backNumberText, discType, is8cm, filename, trackList, detailURL)
    discList.append(disc)
    discCount += 1

information = Artist(discList, "ZARD", "1991", "2", "10")

# Serialise with json.dumps so apostrophes, True/False inside titles and
# None values produce valid JSON; ensure_ascii=False keeps Japanese readable.
contentStr: str = json.dumps(asdict(information), ensure_ascii=False)
savePath = "drive/MyDrive/ZARD/" + getTimeStr() + "_reuse.json"
with open(savePath, "w", encoding="UTF-8") as writer:
  writer.write(contentStr)




disc count: 66
single
1991/02/10 Release
https://wezard.net/discography/good-bye-my-loneliness-3/
single
album
1991/03/27 Release
https://wezard.net/discography/good-bye-my-loneliness-2/
album
single
1991/06/25 Release
https://wezard.net/discography/discography-263/
single
single
1991/11/06 Release
https://wezard.net/discography/discography-265/
single
album
1991/12/25 Release
https://wezard.net/discography/discography-139/
album
single
1992/08/05 Release
https://wezard.net/discography/discography-267/
single
album
1992/09/02 Release
https://wezard.net/discography/hold-me/
album
single
1992/09/09 Release
https://wezard.net/discography/in-my-arms-tonight-2/
single
single
1993/01/27 Release
https://wezard.net/discography/discography-271/
single
single
1993/04/21 Release
https://wezard.net/discography/discography-273/
single
single
1993/05/19 Release
https://wezard.net/discography/discography-275/
single
album
1993/07/10 Release
https://wezard.net/discography/discography-188/
album
single

In [None]:
# # json書き出し用
# # 旧ver
# import requests as re
# from bs4 import BeautifulSoup

# def getYearMonthDate(str):
#   sp = str.split(" ")
#   sp2 = sp[0].split("/")
#   return sp2[0], sp2[1], sp2[2]

# cardClass = "disco-list"
# discTypeClass = "category"
# titleClass = "news-heading"
# backNumberClass = "sub-title"
# timeTag = "time"
# baseURL = "https://wezard.net/discography/page/"
# endPage = 1
# startPage = 13

# header = {
#     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3864.0 Safari/53"
# }

# discList = []

# # ページ全体
# for i in range(13, 0, -1):
#   res = re.get(baseURL + str(i), headers = header)
#   # print(res.text)
#   listPage = BeautifulSoup(res.text, "html.parser")
#   artCards = listPage.find_all(True, attrs={
#       'class': cardClass
#   })

#   # 一つのCD
#   for i in range(len(artCards) - 1, -1, -1):
#     # CD
#     item = artCards[i]
#     # 詳細ページのURL
#     detailURL = item.find("a").attrs["href"]
#     # single or album
#     discType = str.lower(item.find(True, attrs={
#         'class': discTypeClass
#     }).text)
#     print(discType)
#     # 今回欲しいのはCDだけなのでsingleとalbum以外はcontinue
#     if not (discType == "single" or discType == "album"):
#       continue

#     # HOLD ME
#     discName = item.find(True, attrs={
#         'class': titleClass
#     }).text.replace("　"," ")

#     # discNameとほぼ同じになるが、この変数はCD以外の本などを処理する差にファイル名に使用するので/を_で置き換えている
#     title = str.lower(item.find(True, attrs={
#           'class':titleClass
#       }).text.replace("/", "_"))
#     timeStr = item.find("time").attrs['datetime'].replace("-", "_")
#     backNumberItem = item.find(True, attrs={
#         'class':backNumberClass
#     })
#     backNumberStr = ""
#     if backNumberItem is not None:
#       backNumberStr = str.lower(backNumberItem.text.replace(" ", ""))

#     filename = ""
#     if backNumberStr == "":
#       filename = "_" + timeStr + title + ".jpg"
#     else:
#       filename = "_" + timeStr + "_" + backNumberStr + ".jpg"


#     releaseDate = item.find("time").text
#     print(releaseDate)
#     year, month, date = getYearMonthDate(releaseDate)

#     is8cm = False
#     # 8cmCDかどうか
#     if (int(year) < 1999 and discType == "single") or (int(year) <= 1999 and int(month) <= 10 and int(date) <= 14 and discType == "single"):
#       is8cm = True


#     if int(year) > 2007:
#       continue

#     print(detailURL)
#     print(discType)

#     # この配列に１曲ずついれる
#     trackList = []

#     # ここからCDの詳細ページの操作
#     res = re.get(detailURL, headers = header)
#     detailPage = BeautifulSoup(res.text, "html.parser")
#     trackListArea = detailPage.find("div", attrs={
#         'class':'disco-content'
#     })
#     trackItemArr = trackListArea.find_all("li")
#     for trackItem in trackItemArr:
#       trackName = str.strip(trackItem.text)
#       tmp = Track(trackName, "坂井泉水", "", "", "")
#       trackList.append(tmp)

#     # この時点で曲は全部撮り終わってるはず
#     disc = Disc(discName, year, month, date, discType, is8cm, filename, trackList)
#     discList.append(disc)

# information = Artist(discList, "ZARD", "1991", "2", "10")

# print(asdict(information))
# savePath = "drive/MyDrive/ZARD/" + getTimeStr() + ".json"
# with open(savePath, "w", encoding="UTF-8") as writer:
#   writer.write(str(asdict(information)))




In [None]:
# for i in range(13, -1, -1):
#   print(i)

import datetime

# Scratch check of the timestamp format
dt_now = datetime.datetime.now()
print(dt_now)
# The format previously read '%H:%S', which printed hour:second and dropped
# the minutes; it now matches the format used by getTimeStr().
time = dt_now.strftime('%Y年%m月%d日_%H:%M:%S')
print(time)

2024-07-15 05:42:00.112450
2024年07月15日_05:00
