In [3]:
import os

import requests
from bs4 import BeautifulSoup
from openai import OpenAI

# Prefer the OPENAI_API_KEY environment variable so the secret stays out of the
# source file; the placeholder is kept as a fallback to replace for local runs.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "your-api-key"))
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to crawl and extract page content
def crawl_url(url, headers=None):
    """Fetch *url* and return its title, meta description and paragraph text.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of HTTP request headers (for example a
            custom ``User-Agent``) forwarded to ``requests.get``. ``None``
            sends the library defaults.

    Returns:
        The page title, the description ``<meta>`` content and the text of
        all ``<p>`` elements, joined by newlines in that order. Returns an
        empty string if the request or the parsing fails.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # ``.string`` is None for an empty <title> or one with nested markup;
        # get_text() handles both, so one odd tag no longer discards the page.
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag is not None else ""

        # A description <meta> tag is not guaranteed to carry ``content``.
        meta_tag = soup.find("meta", attrs={"name": "description"})
        if meta_tag is not None:
            meta_content = (meta_tag.get("content") or "").strip()
        else:
            meta_content = ""

        body_text = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
        return f"{title}\n{meta_content}\n{body_text}"
    except Exception as e:
        # Deliberately broad: crawling is best-effort and callers treat an
        # empty string as "skip this URL".
        print(f"Error crawling {url}: {e}")
        return ""

# Function to generate embeddings using OpenAI's new API
def generate_embedding(content):
    """Request an embedding for *content* from the module-level OpenAI client.

    Args:
        content: Text to embed.

    Returns:
        The ``embedding`` attribute of the first item in the response data,
        or ``None`` if the request or the response handling raised.
    """
    vector = None
    try:
        api_result = client.embeddings.create(
            input=content,
            model="text-embedding-ada-002",
        )
        first_item = api_result.data[0]
        vector = first_item.embedding
    except Exception as err:
        # Any failure is reported and signalled to the caller as None.
        print(f"Error generating embedding: {err}")
    return vector

# Function to calculate cosine similarity between embeddings
def calculate_similarity(embeddings):
    """Return the pairwise cosine-similarity matrix of *embeddings*.

    Thin wrapper around ``sklearn.metrics.pairwise.cosine_similarity``; the
    argument is passed through unchanged.
    """
    return cosine_similarity(embeddings)

def main(input_file, output_file, similarity_threshold=0.8):
    """Crawl the URLs listed in a workbook and report semantically similar pages.

    Reads the ``URL`` column of *input_file*, crawls each URL, embeds the
    extracted text, and writes a two-sheet workbook to *output_file*:
    "Similarity Check" (URL pairs whose cosine similarity exceeds the
    threshold) and "Embeddings and Content" (one row per embedded URL).

    Args:
        input_file: Path of the Excel file containing a ``URL`` column.
        output_file: Path of the Excel file to write.
        similarity_threshold: Pairs are reported only when their cosine
            similarity is strictly greater than this value.

    Returns:
        None. Nothing is written when no URL produced an embedding.

    Raises:
        KeyError: If *input_file* has no ``URL`` column.
    """
    # Read input URLs from the Excel file
    df = pd.read_excel(input_file)
    urls = df["URL"].dropna().tolist()

    # Crawl URLs and generate embeddings; URLs that fail either step are skipped.
    url_content = []
    for url in urls:
        print(f"Crawling: {url}")
        content = crawl_url(url)
        if not content:
            continue

        print(f"Generating embedding for: {url}")
        embedding = generate_embedding(content)
        if not embedding:
            print(f"Failed to generate embedding for: {url}")
            continue

        url_content.append({"URL": url, "Content": content, "Embedding": embedding})

    # Check if embeddings are available
    if not url_content:
        print("No embeddings generated. Exiting.")
        return

    # Rows of the matrix line up with the entries of url_content.
    embedding_array = np.array([item["Embedding"] for item in url_content])
    similarity_matrix = calculate_similarity(embedding_array)

    # Collect every ordered pair above the threshold. Each unordered pair
    # therefore appears twice, once per direction, as in the original report.
    results = []
    for i, row in enumerate(similarity_matrix):
        for j, similarity_score in enumerate(row):
            if i != j and similarity_score > similarity_threshold:
                results.append({
                    "URL 1": url_content[i]["URL"],
                    "URL 2": url_content[j]["URL"],
                    "Similarity": similarity_score,
                })

    # Save results to Excel
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        # Tab 1: Similarity Check. Explicit columns keep the header row even
        # when no pair passed the threshold.
        results_df = pd.DataFrame(results, columns=["URL 1", "URL 2", "Similarity"])
        results_df.to_excel(writer, sheet_name="Similarity Check", index=False)

        # Tab 2: Embeddings and Content
        # NOTE(review): Excel limits a cell to 32,767 characters; long page
        # text or a stringified embedding may exceed that - confirm how the
        # writer handles it.
        embedding_content_df = pd.DataFrame([
            {
                "URL": item["URL"],
                "Content": item["Content"],
                "Embedding": str(item["Embedding"]),  # Store the vector as text
            }
            for item in url_content
        ])
        embedding_content_df.to_excel(writer, sheet_name="Embeddings and Content", index=False)

    print(f"Results saved to {output_file} with two tabs: 'Similarity Check' and 'Embeddings and Content'")

if __name__ == "__main__":
    # Input workbook with the URLs to compare, and the report to produce;
    # replace these with your own paths.
    input_file, output_file = "input_urls.xlsx", "similarity_results.xlsx"
    main(input_file=input_file, output_file=output_file)

Crawling: https://www.sportsexperts.ca/en-CA/p-air-force-1-07-mens-fashion-shoes/786181/786181-57
Error crawling https://www.sportsexperts.ca/en-CA/p-air-force-1-07-mens-fashion-shoes/786181/786181-57: 403 Client Error: Forbidden for url: https://www.sportsexperts.ca/en-CA/p-air-force-1-07-mens-fashion-shoes/786181/786181-57
Crawling: https://www.nike.com/ca/t/air-force-1-07-shoes-rWtqPn
Generating embedding for: https://www.nike.com/ca/t/air-force-1-07-shoes-rWtqPn
Crawling: https://www.nike.com/ca/t/air-force-1-07-lx-shoes-DfvjfX
Generating embedding for: https://www.nike.com/ca/t/air-force-1-07-lx-shoes-DfvjfX
Crawling: https://www.simons.com/en/men-footwear/sneakers/air-force-1-07-sneakers-men--5821-21101
Error crawling https://www.simons.com/en/men-footwear/sneakers/air-force-1-07-sneakers-men--5821-21101: 403 Client Error: Forbidden for url: https://www.simons.com/en/men-footwear/sneakers/air-force-1-07-sneakers-men--5821-21101
Crawling: https://jdsports.ca/products/nike-air-forc