<a href="https://colab.research.google.com/github/ThousandAI/Web-Crawler-via-AI/blob/main/scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrape

In [None]:
import requests
import os
from bs4 import BeautifulSoup

# Target page: the first page of the demo shop's product listing.
url = "https://scrapeme.live/shop/"

# Without a timeout, requests waits indefinitely on an unresponsive server and
# the cell never finishes.
response = requests.get(url, timeout=10)
# Printing the Response object shows its HTTP status, e.g. <Response [200]>.
print(response)

In [None]:
# Parse the page only when the request succeeded; otherwise report the status
# so that a later NameError on `soup` is not the first sign of trouble.
if response.status_code == 200:
  soup = BeautifulSoup(response.text, "html.parser")
  # find() returns None when nothing matches, so walk down one step at a time
  # instead of chaining the calls and crashing with AttributeError.
  title_div = soup.find("div", class_="beta site-title")
  title_link = title_div.find("a") if title_div is not None else None
  if title_link is not None:
    web_title = title_link.text.strip()
    print(web_title)
  else:
    print("Site title not found on the page")
else:
  print(f"Request failed with status code {response.status_code}")

In [None]:
# Every product card on the listing page is an <li> carrying the "product" class.
products = soup.find_all("li", class_="product")
for product in products:
  # Locate the name heading and the price span inside this card first,
  # then pull the trimmed text out of each.
  title_tag = product.find("h2", class_="woocommerce-loop-product__title")
  price_tag = product.find("span", class_="woocommerce-Price-amount")
  title = title_tag.text.strip()
  price = price_tag.text.strip()
  print(f"Product name: {title}, Price: {price}")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the images can be written to MyDrive.
drive.mount('/content/drive')

drive_folder = "/content/drive/MyDrive/ThousandAI 程式教育/WebCrawler/pokemon-images/"

# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(drive_folder, exist_ok=True)

products = soup.find_all("li", class_="product")

for product in products:
  title = product.find("h2", class_="woocommerce-loop-product__title").text.strip()
  img_tag = product.find("img")

  # Products without an <img> tag, or whose tag has no src attribute, are skipped.
  if img_tag and "src" in img_tag.attrs:
    img_url = img_tag["src"]
    # The timeout keeps one stalled download from hanging the whole loop.
    img_response = requests.get(img_url, timeout=10)
    if img_response.status_code != 200:
      # Do not save an error page's body under an image file name.
      print(f"Skipped (HTTP {img_response.status_code}): {img_url}")
      continue

    # Spaces become underscores; "/" is replaced as well because it would
    # otherwise be interpreted as a directory separator in the path.
    safe_title = title.replace(' ', '_').replace('/', '_')
    img_filename = os.path.join(drive_folder, f"{safe_title}.jpg")

    with open(img_filename, "wb") as img_file:
      img_file.write(img_response.content)

    print(f"Already Download: {img_filename}")



In [None]:
from tqdm import tqdm
import time
# Progress-bar demo: 100 iterations, each one simulated by a 0.1 s pause.
progress_bar = tqdm(range(1, 101), desc="Processing", unit=" steps")
for i in progress_bar:
  time.sleep(0.1)

In [None]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from google.colab import drive
from tqdm import tqdm

# Mount Google Drive at /content/drive so results can be written to MyDrive.
drive.mount('/content/drive')

drive_folder = "/content/drive/MyDrive/ThousandAI 程式教育/WebCrawler/pokemon-images/"
csv_file = "/content/drive/MyDrive/ThousandAI 程式教育/WebCrawler/scrapeme_products.csv"

# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(drive_folder, exist_ok=True)

# The placeholder is filled with the 1-based page number.
base_url = "https://scrapeme.live/shop/page/{}/"

# One [title, price, image URL] row per product, across all pages.
all_products = []

# Number of listing pages to crawl (hard-coded for this shop).
max_pages = 48

# A single Session reuses the HTTP connection across the page and image
# requests instead of opening a new one for every call.
with requests.Session() as session:
    for page in tqdm(range(1, max_pages + 1), desc="Processing Pages", unit=" page"):
        url = base_url.format(page)
        # The timeout keeps one stalled request from hanging the whole crawl.
        response = session.get(url, timeout=10)
        if response.status_code != 200:
            # An error page contains no products; report it and move on.
            print(f"Skipping page {page}: HTTP {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        products = soup.find_all("li", class_="product")

        for product in products:
            title = product.find("h2", class_="woocommerce-loop-product__title").text.strip()
            price = product.find("span", class_="woocommerce-Price-amount").text.strip()
            img_tag = product.find("img")

            # Reset for every product: otherwise a product without an image
            # would raise NameError (first product) or silently record the
            # previous product's URL.
            img_url = None
            if img_tag and "src" in img_tag.attrs:
                img_url = img_tag["src"]
                # "/" is replaced too because it would otherwise be read as a
                # directory separator in the file path.
                safe_title = title.replace(' ', '_').replace('/', '_')
                img_filename = os.path.join(drive_folder, f"{safe_title}.jpg")
                img_response = session.get(img_url, timeout=10)
                # Only write the file when the image was actually delivered.
                if img_response.status_code == 200:
                    with open(img_filename, "wb") as img_file:
                        img_file.write(img_response.content)
                else:
                    print(f"Image not saved (HTTP {img_response.status_code}): {img_url}")

            all_products.append([title, price, img_url])

df = pd.DataFrame(all_products, columns=["Product Name", "Price", "Img URL"])

# utf-8-sig prepends a UTF-8 byte-order mark, which spreadsheet tools such as
# Excel use to detect the encoding.
df.to_csv(csv_file, index=False, encoding="utf-8-sig")

print(f"Scraping completed! Data saved in: {csv_file}")



# Pandas

In [None]:
import pandas as pd
import numpy as np

# Column values for the five sample people, kept in matching order.
names = ['小明', '小華', '小美', '小強', '小張']
ages = [25, 30, 22, 28, 26]
scores = [85, 90, 88, 76, 95]
cities = ['台北', '台中', '台北', '高雄', '台中']

# Column name -> list of values; each key becomes a DataFrame column.
data = {'姓名': names, '年齡': ages, '成績': scores, '城市': cities}

df = pd.DataFrame(data)
# Heading and table are separated by a newline, as with two print calls.
print("原始 DataFrame：", df, sep="\n")

In [None]:
print("\n資料資訊：")
# info() prints its report itself and returns None, so wrapping it in print()
# would append a stray "None" line to the output.
df.info()

print("\n統計摘要：")
print(df.describe())

# Column selection: a list of names returns a DataFrame with just those columns.
print("\n選取 '姓名' 和 '成績'：")
print(df[['姓名', '成績']])

# Boolean-mask filtering: keep the rows whose score is strictly above 85.
filtered_df = df[df['成績'] > 85]
print("\n成績大於 85 的人：")
print(filtered_df)

# sort_values returns a new DataFrame; df itself keeps its original order.
sorted_df = df.sort_values(by='成績', ascending=False)
print("\n按成績排序（由高到低）：")
print(sorted_df)

# A second table keyed by the same names, used to demonstrate merging.
extra_data = pd.DataFrame({
    '姓名': ['小明', '小華', '小美', '小強', '小張'],
    '性別': ['男', '男', '女', '男', '男']
})
# Left join on the name column: every row of df is kept.
merged_df = pd.merge(df, extra_data, on='姓名', how='left')
print("\n合併 '性別' 資料：")
print(merged_df)

# Introduce a missing value in place (row label 2) to demonstrate NaN handling.
df.loc[2, '成績'] = np.nan
print("\n含有 NaN 值的 DataFrame：")
print(df)

# Fill only the score column with its own mean. A bare scalar passed to
# fillna() would be written into the missing cells of every column.
df_filled = df.fillna({'成績': df['成績'].mean()})
print("\nNaN 值填補後的 DataFrame：")
print(df_filled)

# Round-trip through CSV; note that df (still containing the NaN), not
# df_filled, is what gets written.
df.to_csv('sample_data.csv', index=False)
df_loaded = pd.read_csv('sample_data.csv')
print("\n讀取 CSV 檔案：")
print(df_loaded)

# Matplotlib

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Seed the generator so both random series are the same on every run.
np.random.seed(42)
x = np.arange(1, 11)
y1 = np.random.randint(5, 20, size=10)
y2 = np.random.randint(5, 20, size=10)

# Legend label -> (y values, line appearance) for each line to draw.
line_specs = {
    'Dataset A': (y1, dict(marker='o', linestyle='-', color='b')),
    'Dataset B': (y2, dict(marker='s', linestyle='--', color='r')),
}

plt.figure(figsize=(6, 4))
for series_label, (series, appearance) in line_specs.items():
    plt.plot(x, series, label=series_label, **appearance)

plt.xlabel('X axis')
plt.ylabel('Y axis')
plt.title('Line Chart with Two Datasets')
plt.legend()
plt.grid()
plt.show()


In [None]:
# One bar per category; the three lists below line up by position.
categories = ['A', 'B', 'C', 'D']
values = [10, 15, 7, 12]
bar_colors = ['red', 'blue', 'green', 'orange']

plt.figure(figsize=(6, 4))
plt.bar(categories, values, color=bar_colors)
plt.title('Bar Chart')
plt.xlabel('Class')
plt.ylabel('Value')
plt.show()

In [None]:
# 50 random points; x is drawn before y, each coordinate from [0, 1).
x, y = np.random.rand(50), np.random.rand(50)

plt.figure(figsize=(6, 4))
# alpha below 1 makes overlapping points show through one another.
plt.scatter(x, y, alpha=0.7, color='purple')
plt.title('Scatter Chart')
plt.xlabel('X axis')
plt.ylabel('Y axis')
plt.show()

In [None]:
# One wedge per language; labels, sizes and colours line up by position.
labels = ['Python', 'Javascript', 'C++', 'Java', 'C']
sizes = [30, 25, 20, 25, 20]
wedge_colors = ['red', 'yellow', 'brown', 'orange', 'blue']

plt.figure(figsize=(6, 6))
# autopct labels every wedge with its percentage, formatted to one decimal.
plt.pie(sizes, labels=labels, colors=wedge_colors, autopct='%1.1f%%')
plt.title('Pie Chart')
plt.show()