In [33]:
import time
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException

class OrientBellScraper:
    def __init__(self, chromedriver_path):
        chrome_options = Options()
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--window-size=1920,1080')
        
        service = Service(chromedriver_path)
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        self.base_url = "https://www.orientbell.com/tiles/swimming-pool-tiles"
        self.tile_data = []
        
        # Create images directory if it doesn't exist
        self.images_dir = "images"
        if not os.path.exists(self.images_dir):
            os.makedirs(self.images_dir)

    def _wait_for_element(self, by, value, timeout=10):
        try:
            element = WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((by, value))
            )
            return element
        except TimeoutException:
            return None

    def _handle_popup(self):
        """Handle any popups that might appear"""
        try:
            # Wait for popup and close button
            popup_close = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "popup-close"))
            )
            popup_close.click()
            time.sleep(1)
        except TimeoutException:
            pass

    def _scroll_and_wait(self):
        """Enhanced scroll with popup handling"""
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        
        while True:
            # Scroll in smaller increments
            for i in range(0, last_height, 300):
                self.driver.execute_script(f"window.scrollTo(0, {i});")
                time.sleep(0.5)
                self._handle_popup()
            
            time.sleep(2)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            
            if new_height == last_height:
                break
            last_height = new_height

    def _click_load_more(self):
        try:
            self._handle_popup()
            button = self._wait_for_element(By.CLASS_NAME, "load-more-product")
            if not button or not button.is_displayed():
                return False

            # Scroll to button with offset
            self.driver.execute_script(
                "arguments[0].scrollIntoView({block: 'center'});", 
                button
            )
            time.sleep(2)

            # Try multiple click methods
            try:
                button.click()
            except:
                try:
                    self.driver.execute_script("arguments[0].click();", button)
                except:
                    actions = ActionChains(self.driver)
                    actions.move_to_element(button).click().perform()

            time.sleep(3)  # Wait longer after click
            return True
        except Exception as e:
            print(f"Click failed: {str(e)}")
            return False

    def _download_image(self, image_url, tile_name):
        """Download image and save with sanitized filename"""
        try:
            if not image_url:
                return None
                
            # Sanitize filename
            valid_chars = '-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            sanitized_name = ''.join(c for c in tile_name if c in valid_chars)
            filename = os.path.join(self.images_dir, f"{sanitized_name}.jpg")
            
            response = requests.get(image_url, stream=True)
            if response.status_code == 200:
                with open(filename, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                return filename
        except Exception as e:
            print(f"Error downloading image: {str(e)}")
        return None

    def _extract_tile_data(self, soup):
        tiles = soup.select(".product-listing-col")
        data = []
        
        for tile in tiles:
            try:
                # Basic tile information
                name_element = tile.select_one(".simi-name a")
                name = name_element.text.strip() if name_element else None
                tile_url = name_element["href"] if name_element else None
                
                # Extract high-quality image URL from figure
                figure_element = tile.select_one(".cate-pro-figure")
                high_quality_image = None
                if figure_element:
                    img_element = figure_element.select_one("img")
                    if img_element and img_element.has_attr('data-src'):
                        high_quality_image = img_element['data-src']
                    elif img_element and img_element.has_attr('src'):
                        high_quality_image = img_element['src']
                
                # Download image if URL found
                local_image_path = self._download_image(high_quality_image, name) if high_quality_image else None
                
                # Other tile details
                size_element = tile.select_one(".produ-size-leftbox")
                size = size_element.text.strip().replace("Size", "").strip() if size_element else None
                
                status_element = tile.select_one(".produ-size-rightbox span")
                status = status_element.text.strip() if status_element else None
                
                price_element = tile.select_one(".produ-price")
                price = price_element.text.strip() if price_element else "Price not available"
                
                data.append({
                    "Name": name,
                    "URL": tile_url,
                    "Size": size,
                    "Availability": status,
                    "Price": price,
                    "Image URL": high_quality_image,
                    "Local Image Path": local_image_path
                })
            except Exception as e:
                print(f"Error extracting tile data: {str(e)}")
                continue
        
        return data

    def scrape_all_tiles(self, max_clicks=100, min_tiles=500):
        try:
            print("Loading initial page...")
            self.driver.get(self.base_url)
            self._wait_for_element(By.CLASS_NAME, "product-listing-col", timeout=20)
            
            clicks = 0
            previous_count = 0
            no_change_count = 0
            
            while clicks < max_clicks:
                self._scroll_and_wait()
                
                current_tiles = len(self.driver.find_elements(By.CLASS_NAME, "product-listing-col"))
                print(f"Current number of tiles: {current_tiles}")
                
                if current_tiles >= min_tiles:
                    print(f"Reached minimum tile count ({min_tiles})")
                    break
                
                if current_tiles == previous_count:
                    no_change_count += 1
                    if no_change_count >= 3:  # Try 3 times before giving up
                        print("No new content after multiple attempts")
                        break
                else:
                    no_change_count = 0
                
                if not self._click_load_more():
                    print("Failed to find or click load more button")
                    break
                
                previous_count = current_tiles
                clicks += 1
            
            print("Extracting all tile data...")
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            self.tile_data = self._extract_tile_data(soup)
            
            print(f"\nTotal tiles found: {len(self.tile_data)}")
            return self.tile_data
            
        finally:
            self.driver.quit()

if __name__ == "__main__":
    try:
        chromedriver_path = r"chromedriver.exe"
        scraper = OrientBellScraper(chromedriver_path)
        all_tiles = scraper.scrape_all_tiles(min_tiles=500)
        
        for tile in all_tiles:
            print(tile)
            print("-" * 80)
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Loading initial page...
Current number of tiles: 27
Current number of tiles: 27
Failed to find or click load more button
Extracting all tile data...

Total tiles found: 27
{'Name': 'TL Almond Terrazzo', 'URL': 'https://www.orientbell.com/tl-almond-terrazzo', 'Size': '400x400 mm ft', 'Availability': '', 'Price': 'MRP  ₹ 94 /- Sq.ftBuy Now', 'Image URL': 'https://images.orientbell.com/media/catalog/product/t/l/tl_almond_terrazzo.jpg', 'Local Image Path': None}
--------------------------------------------------------------------------------
{'Name': 'TL Camel Brick Emboss Art', 'URL': 'https://www.orientbell.com/tl-camel-brick-emboss-art', 'Size': '400x400 mm ft', 'Availability': '', 'Price': 'MRP  ₹ 94 /- Sq.ftBuy Now', 'Image URL': 'https://images.orientbell.com/media/catalog/product/t/l/tl_camel_hexa_brick_emboss_art.jpg', 'Local Image Path': None}
--------------------------------------------------------------------------------
{'Name': 'TL Multi Terrazzo Modern Inlay', 'URL': 'https:/

In [3]:
import pandas as pd

# Load the CSV file
file_path = "tiles_data_final.csv"  # Update with your file path
data = pd.read_csv(file_path)

# Remove exact duplicate rows (all columns must match)
deduplicated_data = data.drop_duplicates()

# Save the deduplicated DataFrame to a new CSV file
output_file_path = "tiles_2_final_data.csv"
deduplicated_data.to_csv(output_file_path, index=False)

print(f"Duplicate rows removed. Deduplicated data saved to '{output_file_path}'.")


Duplicate rows removed. Deduplicated data saved to 'tiles_2_final_data.csv'.


In [4]:
import os
import hashlib
from shutil import copy2

def calculate_hash(file_path):
    """Calculate the MD5 hash of a file."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def remove_duplicates(images_folder, output_folder):
    """Remove duplicate images and save unique ones to a new folder."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    seen_hashes = set()
    for root, _, files in os.walk(images_folder):
        for file in files:
            file_path = os.path.join(root, file)
            if os.path.isfile(file_path):
                file_hash = calculate_hash(file_path)
                if file_hash not in seen_hashes:
                    seen_hashes.add(file_hash)
                    copy2(file_path, output_folder)
                    print(f"Copied: {file}")
                else:
                    print(f"Duplicate Found: {file}")

# Paths
images_folder = "images"  # Replace with your folder path
output_folder = "images_final"  # Replace with your output folder path

remove_duplicates(images_folder, output_folder)


Copied: 300X450 OHG Temple Diya Songket HL 6 Pcs Prem.jpg
Copied: Anti-Skid EC Baby Blue.jpg
Copied: Anti-Skid EC Coral Aqua.jpg
Copied: Anti-Skid EC Creama.jpg
Copied: Anti-Skid EC Fish Pebble.jpg
Copied: Anti-Skid EC Honey Peach.jpg
Copied: Anti-Skid EC Ivory.jpg
Copied: Anti-Skid EC Mahogany Brown.jpg
Copied: Anti-Skid EC Nero.jpg
Copied: Anti-Skid EC Pebble.jpg
Copied: Anti-Skid EC Venezia Wood DK.jpg
Copied: Anti-Skid EC Warm Grey.jpg
Copied: Anti-Skid EC White.jpg
Copied: Armor Beige.jpg
Copied: Armor Brown.jpg
Copied: Baby Satin Calacatta Fantasy Marble.jpg
Copied: Baby Satin Onyx Marble.jpg
Copied: BDF 5x5 Moroccan Blue FT.jpg
Copied: BDF Desert Marble Beige FT.jpg
Copied: BDF Desert Moroccan Star Multi HL FT.jpg
Copied: BDF Hardwood Strips Multi FT.jpg
Copied: BDF Koa Plank Brown FT.jpg
Copied: BDF Roccia Almond Pebbles FT.jpg
Copied: BDF Rubra Strip Multi FT.jpg
Copied: BDF Smoky Geometric Multi HL FT.jpg
Copied: BDF Statuario Marble FT.jpg
Copied: BDF Triangle Mosaic Grey HL

In [8]:
import os
import requests

# Folder where images will be stored
images_folder_path = "images_final"  # Ensure this folder exists
if not os.path.exists(images_folder_path):
    os.makedirs(images_folder_path)

# URLs and names
images = [
    {
        "url": "https://images.orientbell.com/media/catalog/product/d/r/dr_matte_breccia_blue_gold_vein_f1.jpg",
        "name": "DR Matte Breccia Blue Gold Vein"
    },
    {
        "url": "https://images.orientbell.com/media/catalog/product/d/r/dr_matte_onyx_cloudy_blue_marble_f1.jpg",
        "name": "DR Matte Onyx Cloudy Blue Marble"
    },
    {
        "url": "https://images.orientbell.com/media/catalog/product/d/r/dr_matte_amazonite_aqua_marble_f1-2_copy.jpg",
        "name": "DR Matte Amazonite Aqua Marble"
    },
    {
        "url": "https://images.orientbell.com/media/catalog/product/d/r/dr_matte_endless_canova_statuario_f1.jpg",
        "name": "DR Matte Endless Canova Statuario"
    }
]

# Function to download images
def download_image(image_url, save_path):
    try:
        response = requests.get(image_url, stream=True, timeout=10)
        if response.status_code == 200:
            with open(save_path, "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print(f"Downloaded: {save_path}")
        else:
            print(f"Failed to download {image_url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error downloading {image_url}: {e}")

# Download the images
for image in images:
    filename = f"{image['name']}.jpg"  # Add .jpg extension
    save_path = os.path.join(images_folder_path, filename)
    download_image(image["url"], save_path)


Failed to download https://images.orientbell.com/media/catalog/product/d/r/dr_matte_breccia_blue_gold_vein_f1.jpg. Status code: 403
Failed to download https://images.orientbell.com/media/catalog/product/d/r/dr_matte_onyx_cloudy_blue_marble_f1.jpg. Status code: 403
Failed to download https://images.orientbell.com/media/catalog/product/d/r/dr_matte_amazonite_aqua_marble_f1-2_copy.jpg. Status code: 403
Failed to download https://images.orientbell.com/media/catalog/product/d/r/dr_matte_endless_canova_statuario_f1.jpg. Status code: 403


In [23]:
import requests
import xml.etree.ElementTree as ET
import csv
from urllib.parse import urljoin
import time

class SitemapExtractor:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
    def fetch_sitemap(self, url):
        """Fetch sitemap content with retry mechanism"""
        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                response.raise_for_status()
                return response.content
            except Exception as e:
                if attempt == max_retries - 1:
                    print(f"Failed to fetch {url} after {max_retries} attempts: {str(e)}")
                    return None
                time.sleep(2)  # Wait before retry

    def parse_ositemap(self, xml_content):
        """Parse the ositemap.xml file"""
        if not xml_content:
            return []
        
        urls = []
        try:
            root = ET.fromstring(xml_content)
            for url_element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
                loc = url_element.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
                if loc is not None and loc.text:
                    urls.append({
                        'url': loc.text,
                        'source': 'ositemap'
                    })
        except ET.ParseError as e:
            print(f"Error parsing ositemap XML: {str(e)}")
        
        return urls

    def parse_sitemap_with_images(self, xml_content):
        """Parse the sitemapwithimages.xml file"""
        if not xml_content:
            return [], []
        
        urls = []
        images = []
        
        try:
            root = ET.fromstring(xml_content)
            for url_element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
                # Get page URL
                loc = url_element.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
                if loc is not None and loc.text:
                    urls.append({
                        'url': loc.text,
                        'source': 'sitemapwithimages'
                    })
                
                    # Get associated images
                    for image in url_element.findall(".//{http://www.google.com/schemas/sitemap-image/1.1}image"):
                        image_loc = image.find("{http://www.google.com/schemas/sitemap-image/1.1}loc")
                        image_title = image.find("{http://www.google.com/schemas/sitemap-image/1.1}title")
                        if image_loc is not None and image_loc.text:
                            images.append({
                                'page_url': loc.text,
                                'image_url': image_loc.text,
                                'image_title': image_title.text if image_title is not None else ''
                            })
        except ET.ParseError as e:
            print(f"Error parsing sitemap with images XML: {str(e)}")
        
        return urls, images

    def save_urls_to_csv(self, urls, filename='orientbell_urls.csv'):
        """Save URLs to CSV file"""
        if not urls:
            print("No URLs to save!")
            return
        
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=['url', 'source'])
                writer.writeheader()
                writer.writerows(urls)
            print(f"Successfully saved {len(urls)} URLs to {filename}")
        except Exception as e:
            print(f"Error saving URLs to CSV: {str(e)}")

    def save_images_to_csv(self, images, filename='orientbell_images.csv'):
        """Save image data to CSV file"""
        if not images:
            print("No images to save!")
            return
        
        try:
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=['page_url', 'image_url', 'image_title'])
                writer.writeheader()
                writer.writerows(images)
            print(f"Successfully saved {len(images)} images to {filename}")
        except Exception as e:
            print(f"Error saving images to CSV: {str(e)}")

    def extract_all(self):
        """Extract all URLs and images from both sitemaps"""
        # Process ositemap.xml
        print("Fetching ositemap.xml...")
        osi_content = self.fetch_sitemap("https://www.orientbell.com/media/ositemap.xml")
        osi_urls = self.parse_ositemap(osi_content)
        
        # Process sitemapwithimages.xml
        print("Fetching sitemapwithimages.xml...")
        img_content = self.fetch_sitemap("https://www.orientbell.com/sitemapwithimages.xml")
        img_urls, images = self.parse_sitemap_with_images(img_content)
        
        # Combine URLs from both sources
        all_urls = osi_urls + img_urls
        
        # Save results
        self.save_urls_to_csv(all_urls, 'orientbell_all_urls.csv')
        self.save_images_to_csv(images, 'orientbell_all_images.csv')
        
        return len(all_urls), len(images)

if __name__ == "__main__":
    extractor = SitemapExtractor()
    total_urls, total_images = extractor.extract_all()
    print(f"\nExtraction complete!")
    print(f"Total URLs extracted: {total_urls}")
    print(f"Total images extracted: {total_images}")

Fetching ositemap.xml...
Fetching sitemapwithimages.xml...
Successfully saved 6204 URLs to orientbell_all_urls.csv
Successfully saved 5515 images to orientbell_all_images.csv

Extraction complete!
Total URLs extracted: 6204
Total images extracted: 5515


In [24]:
! pip install pandas requests tqdm

Defaulting to user installation because normal site-packages is not writeable


In [26]:
import pandas as pd
import requests
import os
import re
from urllib.parse import unquote
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import hashlib
import time

class ImageDatasetCreator:
    def __init__(self, csv_file, output_dir="orientbell_dataset"):
        self.csv_file = csv_file
        self.output_dir = output_dir
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.failed_downloads = []
        
    def clean_filename(self, filename):
        """Clean filename to be valid for all operating systems"""
        # Remove invalid characters
        filename = re.sub(r'[<>:"/\\|?*]', '', filename)
        # Replace spaces with underscores
        filename = filename.replace(' ', '_')
        # Remove any non-ASCII characters
        filename = re.sub(r'[^\x00-\x7F]+', '', filename)
        # Ensure filename is not too long (max 255 characters)
        if len(filename) > 255:
            name, ext = os.path.splitext(filename)
            filename = name[:255-len(ext)] + ext
        return filename.lower()

    def extract_tile_name(self, image_title, image_url):
        """Extract a meaningful name for the tile from title or URL"""
        if image_title and len(image_title) > 0:
            # Use image title if available
            name = image_title
        else:
            # Extract name from URL
            name = unquote(os.path.basename(image_url))
            name = os.path.splitext(name)[0]
            
        # Clean the name
        clean_name = self.clean_filename(name)
        
        # If name becomes empty after cleaning, use URL hash
        if not clean_name:
            hash_object = hashlib.md5(image_url.encode())
            clean_name = hash_object.hexdigest()[:12]
            
        return clean_name

    def download_image(self, row):
        """Download a single image with retry mechanism"""
        image_url = row['image_url']
        image_title = row['image_title']
        
        try:
            # Generate filename
            name = self.extract_tile_name(image_title, image_url)
            
            # Get file extension from URL
            ext = os.path.splitext(image_url)[1]
            if not ext:
                ext = '.jpg'  # Default extension
            
            filename = f"{name}{ext}"
            filepath = os.path.join(self.output_dir, filename)
            
            # Skip if file already exists
            if os.path.exists(filepath):
                return True
            
            # Try downloading with retries
            max_retries = 3
            for attempt in range(max_retries):
                try:
                    response = requests.get(image_url, headers=self.headers, timeout=30)
                    response.raise_for_status()
                    
                    # Save the image
                    with open(filepath, 'wb') as f:
                        f.write(response.content)
                    return True
                    
                except requests.exceptions.RequestException as e:
                    if attempt == max_retries - 1:
                        self.failed_downloads.append({
                            'url': image_url,
                            'title': image_title,
                            'error': str(e)
                        })
                        return False
                    time.sleep(2)  # Wait before retry
                    
        except Exception as e:
            self.failed_downloads.append({
                'url': image_url,
                'title': image_title,
                'error': str(e)
            })
            return False

    def create_dataset(self, max_workers=5):
        """Create the dataset by downloading all images"""
        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)
        
        # Read CSV file
        df = pd.read_csv(self.csv_file)
        total_images = len(df)
        
        print(f"Starting download of {total_images} images...")
        
        # Download images using thread pool
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(tqdm(
                executor.map(self.download_image, df.to_dict('records')),
                total=total_images,
                desc="Downloading images"
            ))
        
        # Calculate statistics
        successful = sum(results)
        failed = len(self.failed_downloads)
        
        # Print summary
        print("\nDownload Summary:")
        print(f"Total images: {total_images}")
        print(f"Successfully downloaded: {successful}")
        print(f"Failed: {failed}")
        
        # Save failed downloads log if any
        if self.failed_downloads:
            failed_log = os.path.join(self.output_dir, 'failed_downloads.csv')
            pd.DataFrame(self.failed_downloads).to_csv(failed_log, index=False)
            print(f"\nFailed downloads have been logged to: {failed_log}")
        
        return successful, failed

if __name__ == "__main__":
    # Create dataset
    creator = ImageDatasetCreator(
        csv_file='orientbell_all_images.csv',
        output_dir='orientbell_dataset'
    )
    
    successful, failed = creator.create_dataset()

Starting download of 5515 images...


Downloading images: 100%|██████████| 5515/5515 [2:25:09<00:00,  1.58s/it]  



Download Summary:
Total images: 5515
Successfully downloaded: 1538
Failed: 3977

Failed downloads have been logged to: orientbell_dataset\failed_downloads.csv


In [36]:
import requests
from bs4 import BeautifulSoup
import os
import time
import pandas as pd
from urllib.parse import urlparse
import csv

class TileScraper:
    def __init__(self):
        self.base_url = "https://www.orientbell.com"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        self.image_dir = 'images'
        os.makedirs(self.image_dir, exist_ok=True)

    def download_image(self, image_url, tile_name):
        try:
            valid_chars = '-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            filename = ''.join(c for c in tile_name if c in valid_chars)
            filepath = f"images/{filename}.jpg"
            
            if os.path.exists(filepath):
                return filepath
            
            response = requests.get(image_url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                with open(filepath, 'wb') as f:
                    f.write(response.content)
                time.sleep(0.2)  # Small delay between downloads
                return filepath
        except Exception as e:
            print(f"Error downloading {tile_name}: {e}")
        return None

    def extract_category(self, url):
        try:
            path = urlparse(url).path
            parts = path.split('/')
            if len(parts) > 2:
                return parts[2]
        except:
            pass
        return 'general'

    def scrape_page(self, url):
        try:
            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, "html.parser")
            category = self.extract_category(url)
            
            tile_data = []
            for tile in soup.select(".product-listing-col"):
                try:
                    name_element = tile.select_one(".simi-name a")
                    name = name_element.text.strip() if name_element else None
                    if not name:
                        continue
                        
                    figure = tile.select_one(".cate-pro-figure")
                    high_quality_image = None
                    if figure:
                        img = figure.select_one("img")
                        if img:
                            high_quality_image = img.get('data-src') or img.get('src')
                    
                    local_image_path = None
                    if high_quality_image and name:
                        local_image_path = self.download_image(high_quality_image, name)
                    
                    tile_data.append({
                        "Name": name,
                        "URL": name_element["href"] if name_element else None,
                        "Size": tile.select_one(".produ-size-leftbox").text.strip().replace("Size", "").strip() if tile.select_one(".produ-size-leftbox") else None,
                        "Availability": tile.select_one(".produ-size-rightbox span").text.strip() if tile.select_one(".produ-size-rightbox span") else None,
                        "Price": tile.select_one(".produ-price").text.strip() if tile.select_one(".produ-price") else "Price not available",
                        "Image URL": high_quality_image,
                        "Image Path": local_image_path if local_image_path else "",
                        "Category": category
                    })
                    
                except Exception as e:
                    print(f"Error processing tile in {url}: {e}")
                    continue
                    
            return tile_data
            
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return []

    def scrape_from_route_list(self, routes_file):
        try:
            # Read routes from file
            with open(routes_file, 'r') as f:
                routes = [line.strip() for line in f if line.strip()]
            
            print(f"Found {len(routes)} routes to scrape")
            all_data = []
            
            # Process each route
            for i, route in enumerate(routes, 1):
                print(f"Processing route {i}/{len(routes)}: {route}")
                data = self.scrape_page(route)
                all_data.extend(data)
                print(f"Found {len(data)} tiles in this route")
                
                # Save progress periodically
                if i % 10 == 0:
                    self.save_to_csv(all_data, 'tiles_data_progress.csv')
            
            # Final save
            self.save_to_csv(all_data, 'tiles_data_final.csv')
            print(f"\nTotal tiles scraped: {len(all_data)}")
            
        except Exception as e:
            print(f"Error in main scraping process: {e}")
            # Save whatever data we have
            if all_data:
                self.save_to_csv(all_data, 'tiles_data_error.csv')

    def save_to_csv(self, data, filename):
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
        print(f"Data saved to {filename}")

if __name__ == "__main__":
    scraper = TileScraper()
    # Create a text file 'routes.txt' with one URL per line
    scraper.scrape_from_route_list('routes.txt')

Found 453 routes to scrape
Processing route 1/453: https://orientbell.com/tiles/wall-tiles
Found 25 tiles in this route
Processing route 2/453: https://orientbell.com/tiles/floor-tiles
Found 25 tiles in this route
Processing route 3/453: https://orientbell.com/tiles/liquidation-tiles
Found 2 tiles in this route
Processing route 4/453: https://orientbell.com/tiles/bathroom-tiles
Found 25 tiles in this route
Processing route 5/453: https://orientbell.com/tiles/kitchen-tiles
Found 25 tiles in this route
Processing route 6/453: https://orientbell.com/tiles/parking-tiles
Found 25 tiles in this route
Processing route 7/453: https://orientbell.com/tiles/elevation-tiles
Found 25 tiles in this route
Processing route 8/453: https://orientbell.com/tiles/bedroom-tiles
Found 25 tiles in this route
Processing route 9/453: https://orientbell.com/tiles/outdoor-tiles
Found 25 tiles in this route
Processing route 10/453: https://orientbell.com/tiles/terrace-tiles
Found 25 tiles in this route
Data saved 

In [1]:
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("ZeeshanGeoPk/haitian-speech-to-text")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ZeeshanGeoPk/haitian-speech-to-text")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa

# Install required libraries
# !pip install transformers librosa torch

# Load processor and model
processor = AutoProcessor.from_pretrained("ZeeshanGeoPk/haitian-speech-to-text")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ZeeshanGeoPk/haitian-speech-to-text")

# Load an audio file
# Replace 'your_audio_file.wav' with the path to your audio file
audio_path = 'your_audio_file.wav'
speech, sr = librosa.load(audio_path, sr=16000)  # Load the audio at 16 kHz

# Ensure the audio is in the correct shape
inputs = processor(speech, sampling_rate=16000, return_tensors="pt")

# Generate predictions
with torch.no_grad():
    generated_ids = model.generate(inputs['input_features'])

# Decode the predictions
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Transcription:", transcription)

# Notes:
# - Ensure the audio file is in a compatible format, such as WAV.
# - Install necessary libraries like `librosa` for audio processing and `transformers` for the model.
# - Replace `your_audio_file.wav` with the actual path to your audio file.
# - This example uses a pre-trained Haitian speech-to-text model.
