In [3]:
pip install selenium beautifulsoup4 pandas


Note: you may need to restart the kernel to use updated packages.


In [4]:
import selenium
import bs4
import pandas


In [9]:
import requests
import pandas as pd

# 1. Hit the Shopify JSON endpoint (max 250 items per call)
url = "https://samsungparts.com/collections/air-conditioner-parts/products.json?limit=250"
resp = requests.get(url)
resp.raise_for_status()
products = resp.json()["products"]


In [10]:
# How many products did we get?
print(f"Found {len(products)} products")

# Peek at the first product’s structure
import json
print(json.dumps(products[0], indent=2))


Found 250 products
{
  "id": 8317142630673,
  "title": "Samsung DB92-02874A PCB ASSEMBLY SUB",
  "handle": "db92-02874a",
  "body_html": "<p>This genuine Samsung DB92-02874A PCB ASSEMBLY SUB is the perfect choice for replacing your current part in your device. This original Samsung part comes with a guarantee of quality and reliability, to ensure that your device functions properly and optimally. Enhance the performance of your device with this stylish, long-lasting replacement part from Samsung. Don't settle for anything less than excellence \u2014 make sure you have the best with this Samsung original part. Enjoy lasting performance and style without compromising on quality!</p>",
  "published_at": "2023-04-18T04:17:02-04:00",
  "created_at": "2023-04-18T04:17:02-04:00",
  "updated_at": "2025-06-22T07:04:36-04:00",
  "vendor": "Samsung",
  "product_type": "Part",
  "tags": [
    "gmc",
    "havingImg",
    "subcate:Air Conditioner",
    "types:PCB",
    "webcat:PCB"
  ],
  "variants"

In [11]:
# Build our simplified DataFrame
data = []
for p in products:
    var = p["variants"][0]           # first (and usually only) variant
    data.append({
        "part_name": p["title"],
        "part_number": p["handle"].upper(),             # e.g. DB93-01861-PCB-ASSEMBLY... 
        "price_usd": var["price"],
        "product_url": f"https://samsungparts.com/products/{p['handle']}",
        "description_html": p["body_html"],
        "tags": ";".join(p["tags"]),
        "category": "Air Conditioner"
    })

df = pd.DataFrame(data)
print(df.head())               # preview first 5 rows
df.to_csv("samsung_ac_parts_full.csv", index=False)
print(f"✅ Saved {len(df)} parts to samsung_ac_parts_full.csv")


                                           part_name  part_number price_usd  \
0               Samsung DB92-02874A PCB ASSEMBLY SUB  DB92-02874A     53.95   
1  Samsung DB95-04871B Assembly Thermistor Out-Su...  DB95-04871B    101.95   
2              Samsung DB92-03322A Main Pcb Assembly  DB92-03322A    211.95   
3  Samsung DB32-00277A Sensor Temp;103Hw,L400,10Kohm  DB32-00277A    130.95   
4  Samsung DB95-04871A Assembly Thermistor Out-Su...  DB95-04871A     82.95   

                                     product_url  \
0  https://samsungparts.com/products/db92-02874a   
1  https://samsungparts.com/products/db95-04871b   
2  https://samsungparts.com/products/db92-03322a   
3  https://samsungparts.com/products/db32-00277a   
4  https://samsungparts.com/products/db95-04871a   

                                    description_html  \
0  <p>This genuine Samsung DB92-02874A PCB ASSEMB...   
1  <p>Samsung DB95-04871B Assembly Thermistor Out...   
2  <p>This Samsung DB92-03322A MAIN PCB ASSE

In [1]:
import re
import requests
import pandas as pd

# 1) Load your existing basic CSV
df = pd.read_csv("samsung_ac_parts_full.csv")

def scrape_supported_models(html: str) -> str:
    """
    Extract the block of text between the 'Supported Models' heading
    and the 'Shipping & Delivery' heading, then pull out all model codes.
    """
    # Collapse whitespace so regex is easier
    flat = re.sub(r"\s+", " ", html)

    # 2a. Grab the chunk between the two headings
    m = re.search(
        r"Supported Models(.*?)Shipping & Delivery",
        flat,
        flags=re.IGNORECASE | re.DOTALL
    )
    block = m.group(1) if m else ""

    # 2b. In that block, find all tokens that look like model codes:
    #     uppercase letters, digits, slash or dash, length ≥5
    codes = re.findall(r"\b[A-Z0-9/\-]{5,}\b", block)

    # 2c. Deduplicate while preserving order
    seen = set()
    uniq = []
    for c in codes:
        if c not in seen:
            seen.add(c)
            uniq.append(c)

    return "; ".join(uniq)

# 3) Iterate over each product URL and scrape
compatible_lists = []
for url in df["product_url"]:
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"⚠️  Could not fetch {url}: {e}")
        compatible_lists.append("")
        continue

    comp = scrape_supported_models(resp.text)
    compatible_lists.append(comp)

# 4) Attach the new column
df["compatible_models"] = compatible_lists

# 5) Select only the columns you want
final = df.loc[:, [
    "part_name",
    "part_number",
    "price_usd",
    "product_url",
    "compatible_models"
]]

# 6) Save the final CSV
final.to_csv("samsung_ac_parts_final.csv", index=False)
print(f"✅ Wrote {len(final)} rows to samsung_ac_parts_final.csv")


✅ Wrote 250 rows to samsung_ac_parts_final.csv


In [5]:
import re
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# 1) Load the basic parts CSV
df = pd.read_csv("samsung_ac_parts_full.csv")

# 2) Setup Selenium (non-headless to be safe)
options = Options()
# options.add_argument("--headless")  # you can re-enable headless once it works
options.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(options=options)

def get_supported_models(url):
    driver.get(url)
    wait = WebDriverWait(driver, 10)

    # 3) Wait for the tab‐nav link to appear, then click it
    tab_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href="#proTabs3"]')))
    driver.execute_script("arguments[0].click();", tab_btn)

    # 4) Wait for the content pane to be visible
    panel = wait.until(EC.visibility_of_element_located((By.ID, "proTabs3")))
    inner = panel.get_attribute("innerHTML")

    # 5) Extract all model codes from that HTML
    codes = re.findall(r"\b[A-Z0-9/\-]{5,}\b", inner)
    # dedupe
    seen = set(); uniq = []
    for c in codes:
        if c not in seen:
            seen.add(c)
            uniq.append(c)
    return "; ".join(uniq)

# 6) Loop through each URL and collect
compatible = []
for url in df["product_url"]:
    try:
        cm = get_supported_models(url)
    except Exception as e:
        print(f"⚠️ Error on {url}: {e}")
        cm = ""
    compatible.append(cm)

driver.quit()

# 7) Attach and write final CSV
df["compatible_models"] = compatible
final = df[[
    "part_name", "part_number", "price_usd", "product_url", "compatible_models"
]]
final.to_csv("samsung_ac_parts_final.csv", index=False)
print(f"✅ Done! {len(final)} rows → samsung_ac_parts_final.csv")


⚠️ Error on https://samsungparts.com/products/db31-10119a: Message: 

⚠️ Error on https://samsungparts.com/products/db92-03337b: Message: 

⚠️ Error on https://samsungparts.com/products/db95-05011e: Message: 

⚠️ Error on https://samsungparts.com/products/db39-01263a: Message: 

⚠️ Error on https://samsungparts.com/products/db63-02758f: Message: 

⚠️ Error on https://samsungparts.com/products/db63-04223a: Message: 

⚠️ Error on https://samsungparts.com/products/db94-00066a: Message: 

⚠️ Error on https://samsungparts.com/products/db67-01166a: Message: 

⚠️ Error on https://samsungparts.com/products/db31-00665a: Message: 

⚠️ Error on https://samsungparts.com/products/db94-00119a: Message: 

⚠️ Error on https://samsungparts.com/products/db63-01765f: Message: 

⚠️ Error on https://samsungparts.com/products/db95-05164a: Message: 

⚠️ Error on https://samsungparts.com/products/db67-00243a: Message: 

⚠️ Error on https://samsungparts.com/products/db95-05011c: Message: 

⚠️ Error on https://

In [13]:
import pandas as pd
import sys

# ─── CONFIGURATION ───────────────────────────────────────────────────
ORDERS_CSV = "samsung_user_orders.csv"
SPARES_CSV = "samsung_dummy_spares.csv"

# ─── LOAD DATASETS ───────────────────────────────────────────────────
orders_df = pd.read_csv(ORDERS_CSV)
spares_df = pd.read_csv(SPARES_CSV)

# Ensure compatible_models column is string and non-null
spares_df['compatible_models'] = spares_df['compatible_models'].fillna("")

# ─── RECOMMENDATION FUNCTION ─────────────────────────────────────────
def recommend_spare_parts(invoice_number: str, fault_keyword: str, top_n: int = 3):
    """
    Given an invoice number and a faulty-part keyword,
    returns a DataFrame of up to top_n recommended spare parts.
    """
    # 1. Lookup order
    order = orders_df.loc[orders_df['invoice_number'] == invoice_number]
    if order.empty:
        raise ValueError(f"No order found for invoice '{invoice_number}'")

    category = order.iloc[0]['category']
    user_model = order.iloc[0]['product_model']

    # 2. Filter spares by category
    candidates = spares_df[spares_df['category'].str.lower() == category.lower()].copy()

    # 3. Score each candidate
    def score_row(row):
        model_match = 1 if user_model in row['compatible_models'] else 0
        keyword_match = 1 if fault_keyword.lower() in row['part_name'].lower() else 0
        price_score = -row['price_usd']  # cheaper parts rank higher
        return (model_match, keyword_match, price_score)

    candidates['score'] = candidates.apply(score_row, axis=1)

    # 4. Sort and return top_n
    ranked = candidates.sort_values('score', ascending=False)
    return ranked.head(top_n)[[
        'part_name',
        'part_number',
        'price_usd',
        'compatible_models',
        'product_url'
    ]]

# ─── MAIN SCRIPT ─────────────────────────────────────────────────────
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: python spare_part_recommender_model.py <INVOICE_NUMBER> <FAULT_KEYWORD>")
        sys.exit(1)

    invoice = sys.argv[1]
    keyword = sys.argv[2]

    try:
        results = recommend_spare_parts(invoice, keyword)
        print(f"\nTop {len(results)} recommendations for invoice '{invoice}', fault '{keyword}':")
        print(results.to_string(index=False))
    except ValueError as e:
        print(f"Error: {e}")


Usage: python spare_part_recommender_model.py <INVOICE_NUMBER> <FAULT_KEYWORD>


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
