In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd

In [2]:
def fetch_page_content(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Value forwarded to ``requests.get(timeout=...)``. Defaults to 30
        seconds so that an unresponsive server cannot block the scrape
        forever.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    """
    # Browser-like User-Agent sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # requests applies no timeout unless one is passed explicitly.
    response = requests.get(url, headers=headers, timeout=timeout)
    # The HTTP status code is not inspected: whatever body came back is parsed.
    return BeautifulSoup(response.content, 'html.parser')

In [3]:
# Function to extract individual product links from the main page
def extract_product_links(soup):
    """Collect the ``href`` of every ``a.product-item-link`` anchor in ``soup``.

    Anchors whose ``href`` is missing or empty are left out. The links are
    returned in document order, exactly as written in the page.
    """
    hrefs = (anchor.get('href') for anchor in soup.select('a.product-item-link'))
    return [href for href in hrefs if href]

In [4]:
def extract_additional_details(soup):
    """Read the product attribute table (``table.data``) of a product page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed product page.

    Returns
    -------
    dict
        Keys ``'Size'``, ``'Product Type'``, ``'Signed By'`` and
        ``'Presentation Type'``. A key keeps the placeholder ``'N/A'`` when
        no matching table row is found.
    """
    # (text looked for in the row header, key filled in the result).
    # Checked in this order; the first label found in a header wins.
    label_to_field = (
        ('Presentation size', 'Size'),
        ('Product type', 'Product Type'),
        ('Signed by', 'Signed By'),
        ('Presentation type', 'Presentation Type'),
    )
    details = {field: 'N/A' for _, field in label_to_field}

    for row in soup.select('table.data tr'):
        header_cell = row.select_one('th')
        value_cell = row.select_one('td')
        # select_one returns None when nothing matches; skip rows that lack
        # a header or a value cell instead of failing on ``None.text``.
        if header_cell is None or value_cell is None:
            continue

        header = header_cell.text.strip()
        value = value_cell.text.strip()
        for label, field in label_to_field:
            if label in header:
                details[field] = value
                break

    return details

In [5]:
def _text_or_default(node, default='N/A'):
    """Return the stripped text of ``node``, or ``default`` when ``node`` is None."""
    return default if node is None else node.text.strip()


def extract_product_details(product_link):
    """Download one product page and return its details as a flat dict.

    Parameters
    ----------
    product_link : str
        URL of the product page.

    Returns
    -------
    dict
        Keys ``'Title'``, ``'Price'``, ``'Availability'``, ``'Size'``,
        ``'Dispatch Time'``, ``'Product Type'``, ``'Signed By'`` and
        ``'Presentation Type'``. Any value that cannot be found on the page
        is reported as ``'N/A'``.
    """
    soup = fetch_page_content(product_link)

    # select_one returns None when the selector matches nothing, so go through
    # the helper rather than calling ``.text`` on the result directly.
    title = _text_or_default(soup.select_one('.page-title'))
    price = _text_or_default(soup.select_one('.price'))
    availability = _text_or_default(soup.select_one('.stock'))

    # Give the name a value up front: without it, a page that has no
    # "Dispatch time" entry would raise UnboundLocalError at the return.
    dispatch_time = 'N/A'
    for div in soup.select('.product.size'):
        if 'Dispatch time' in div.text:
            # Keep the text after the last colon; a later match overrides an earlier one.
            dispatch_time = div.text.split(':')[-1].strip()

    additional_details = extract_additional_details(soup)

    return {
        'Title': title,
        'Price': price,
        'Availability': availability,
        'Size': additional_details['Size'],
        'Dispatch Time': dispatch_time,
        'Product Type': additional_details['Product Type'],
        'Signed By': additional_details['Signed By'],
        'Presentation Type': additional_details['Presentation Type']
    }

In [6]:
def save_to_csv(products, filename):
    """Write ``products`` (a list of record dicts) to ``filename`` as CSV.

    The records are turned into a DataFrame first; the DataFrame's row index
    is not written to the file.
    """
    pd.DataFrame(products).to_csv(filename, index=False)

In [7]:
# Function to get all product links for a player across the various display pages of the items
def get_all_product_links(base_url):
    """Walk the numbered listing pages of ``base_url`` and gather product links.

    Pages are requested as ``p=1``, ``p=2``, ... until a page yields no
    product links, or yields only links that were already collected.

    Parameters
    ----------
    base_url : str
        URL of the first listing page. It may or may not already carry a
        query string.

    Returns
    -------
    list of str
        Product links in the order they were encountered.
    """
    # Append the page parameter correctly whether or not a query string exists.
    separator = '&' if '?' in base_url else '?'
    all_product_links = []
    seen_links = set()
    page = 1

    while True:
        url = f"{base_url}{separator}p={page}"
        soup = fetch_page_content(url)
        product_links = extract_product_links(soup)
        # An empty page ends the walk. So does a page that adds nothing new:
        # if the site answers an out-of-range page number by repeating an
        # earlier page, looping on "non-empty" alone would never terminate.
        if not product_links or seen_links.issuperset(product_links):
            break
        all_product_links.extend(product_links)
        seen_links.update(product_links)
        page += 1

    return all_product_links

In [8]:
# Listing pages to scrape, one per player. Entry i of `urls` belongs to
# entry i of `player_names` below, so the two lists must stay in step.
# NOTE(review): the slug "giorgino-chiellini" differs from the player name
# "Giorgio Chiellini" - confirm that this URL is the one the site really uses.
urls = [
    "https://www.icons.com/players/a-k/sergio-aguero-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/thiago-alcantara-signed-football-memorabilia.html",
    "https://www.icons.com/players/a-k/trent-alexander-arnold-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/dele-alli-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/michail-antonio-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/pierre-emerick-aubameyang-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/ross-barkley-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/alisson-becker-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/ryan-bertrand-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/karim-benzema-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/leonardo-bonucci-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/jarrod-bowen-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/gianluigi-buffon-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/sergio-busquets-signed-football-memorabilia.html",
    "https://www.icons.com/players/a-k/jack-butland-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/gary-cahill-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/dominic-calvert-lewin-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/eduardo-camavinga-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/andy-carroll-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/casemiro-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/santi-cazorla-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/nathaniel-chalobah-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/giorgino-chiellini-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/federico-chiesa-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/nathaniel-clyne-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/sebastian-cordova-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/diego-costa-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/philippe-coutinho-signed-memorabilia.html",
    "https://www.icons.com/players/a-k/aaron-cresswell-signed-memorabilia.html"
]

# Display names, in the same order as `urls`.
player_names = [
    "Sergio Aguero", "Thiago Alcantara", "Trent Alexander-Arnold", "Dele Alli", "Michail Antonio",
    "Pierre-Emerick Aubameyang", "Ross Barkley", "Alisson Becker", "Ryan Bertrand", "Karim Benzema", 
    "Leonardo Bonucci", "Jarrod Bowen", "Gianluigi Buffon", "Sergio Busquets", "Jack Butland", "Gary Cahill",
    "Dominic Calvert-Lewin", "Eduardo Camavinga", "Andy Carroll", "Casemiro", "Santi Cazorla", "Nathaniel Chalobah", 
    "Giorgio Chiellini", "Federico Chiesa", "Nathaniel Clyne", "Sebastian Cordova", "Diego Costa", "Philippe Coutinho", 
    "Aaron Cresswell"
]

# The two lists are paired with zip(), which silently drops the surplus
# entries of the longer list; fail loudly instead if they drift apart.
assert len(urls) == len(player_names), (
    f"urls has {len(urls)} entries but player_names has {len(player_names)}"
)

# Accumulates one dict per scraped product.
all_products = []

In [9]:
# Iterate over every URL in the list to extract the product details.
# Start from an empty list each time so that re-running this cell does not
# append a second copy of every product to the results.
all_products = []
for url, player_name in zip(urls, player_names):
    for product_link in get_all_product_links(url):
        product_details = extract_product_details(product_link)
        product_details['Player Name'] = player_name  # Add player name to product details
        all_products.append(product_details)

In [10]:
# Write every collected product record to RawData.csv, then confirm completion.
save_to_csv(all_products, 'RawData.csv')
print("Data collection complete.")

Data collection complete.


In [11]:
# Reload the saved CSV and display it to inspect the scraped data.
# NOTE(review): with pandas' default NA parsing the 'N/A' placeholders written
# by the scraper appear to come back as missing values (NaN) - confirm that
# this is intended, or pass keep_default_na=False to keep the literal text.
df = pd.read_csv('RawData.csv') 
df

Unnamed: 0,Title,Price,Availability,Size,Dispatch Time,Product Type,Signed By,Presentation Type,Player Name
0,UNSIGNED Sergio Aguero Photo In Black Wooden F...,£49.99,In stock,570mm (H) x 470mm (W) x 30mm (D),Usually dispatched within 3-4 working days,Signed photos,Unsigned,Framed,Sergio Aguero
1,UNSIGNED Sergio Aguero Photo In Black Wooden F...,£49.99,In stock,570mm (H) x 470mm (W) x 30mm (D),Usually dispatched within 3-4 working days,Signed photos,Unsigned,Framed,Sergio Aguero
2,UNSIGNED Sergio Aguero Photo In Black Wooden F...,£49.99,In stock,570mm (H) x 470mm (W) x 30mm (D),Usually dispatched within 3-4 working days,Signed photos,Unsigned,Framed,Sergio Aguero
3,UNSIGNED Sergio Aguero Photo In Black Wooden F...,£49.99,In stock,570mm (H) x 470mm (W) x 30mm (D),Usually dispatched within 3-4 working days,Signed photos,Unsigned,Framed,Sergio Aguero
4,UNSIGNED Sergio Aguero Photo In Black Wooden F...,£49.99,In stock,570mm (H) x 470mm (W) x 30mm (D),Usually dispatched within 3-4 working days,Signed photos,Unsigned,Framed,Sergio Aguero
...,...,...,...,...,...,...,...,...,...
687,Philippe Coutinho Signed and Liverpool Framed ...,£399.99,Out of stock,,Usually dispatched within 8-10 working days,Signed boots,Philippe Coutinho,Framed,Philippe Coutinho
688,Philippe Coutinho Signed Yellow and Orange Nik...,£234.99,Out of stock,340mm (W) x 220mm (H) x 110mm (D),Usually dispatched within 2-3 working days,Signed boots,Philippe Coutinho,Deluxe Packaged,Philippe Coutinho
689,Philippe Coutinho Signed Yellow and Orange Nik...,£199.99,Out of stock,,Usually dispatched within 2-3 working days,Signed boots,Philippe Coutinho,Unframed,Philippe Coutinho
690,Aaron Cresswell Signed England Photo,£149.99,In stock,,Usually dispatched within 2-3 working days,Signed photos,Aaron Cresswell,Unframed,Aaron Cresswell
