In [2]:
import pandas as pd
import requests
import imagehash
from PIL import Image
from io import BytesIO

# Locations of the two input extracts and of the enriched CSV written at the
# end of the script.
table_file = r'C:\Users\wawa1\Downloads\ENTRETIEN TECHNIQUE\technical_test_table_extract.csv'
external_file = r'C:\Users\wawa1\Downloads\ENTRETIEN TECHNIQUE\technical_test_external_source_extract.csv'
output_file = r'C:\Users\wawa1\Downloads\ENTRETIEN TECHNIQUE\FINAL.csv'

# Read both extracts into DataFrames (table first, then external source).
table_df, external_df = (
    pd.read_csv(csv_path) for csv_path in (table_file, external_file)
)

# Function to download and hash an image
def get_image_hash(url):
    """Download the image at ``url`` and return its hash as a string.

    The image is fetched with a 20-second timeout, converted to RGB and
    hashed with ``imagehash.phash``; the hash's ``str()`` form is returned.

    Returns ``None`` (after printing a diagnostic) when the server answers
    with a status other than 200 or when any exception is raised while
    downloading, decoding or hashing the image.
    """
    try:
        response = requests.get(url, timeout=20)
        if response.status_code != 200:
            print(f"Failed to download {url}: Status code {response.status_code}")
            return None
        rgb_image = Image.open(BytesIO(response.content)).convert("RGB")
        digest = imagehash.phash(rgb_image)
        return str(digest)
    except Exception as e:
        # Best effort: one bad URL or corrupt image must not stop the run.
        print(f"Error downloading {url}: {e}")
        return None

# Function to calculate Hamming distance
def hamming_distance(hash1, hash2):
    """Return the number of differing bits between two hex-encoded hashes.

    The hashes compared in this file are the strings produced by
    ``str(imagehash.phash(...))``, i.e. hexadecimal digits. Comparing such
    strings character by character counts differing hex digits, which
    under-reports the distance (``"f"`` vs ``"0"`` is four differing bits,
    not one). Both values are therefore decoded to integers and the set bits
    of their XOR are counted, so thresholds applied to the result are
    expressed in bits.

    Args:
        hash1: Hexadecimal hash string.
        hash2: Hexadecimal hash string of the same length as ``hash1``.

    Returns:
        The bit-level Hamming distance as an ``int``.

    Raises:
        ValueError: If the two strings differ in length or contain
            characters that are not valid hexadecimal digits.
    """
    if len(hash1) != len(hash2):
        # zip() would silently truncate and compare only a prefix.
        raise ValueError(
            f"Hash lengths differ: {len(hash1)} != {len(hash2)}"
        )
    if not hash1:
        # Two empty hashes have nothing that can differ.
        return 0
    # bin(...).count("1") is a popcount that works on every Python 3 version.
    return bin(int(hash1, 16) ^ int(hash2, 16)).count("1")

# Function to find the best match based on Hamming distance
def find_best_match(table_hash, external_df, threshold=5):
    """Return the row of ``external_df`` whose hash is closest to ``table_hash``.

    Every row's ``"image_hash"`` value is compared with ``table_hash`` using
    ``hamming_distance``; rows whose hash is missing are skipped.

    Args:
        table_hash: Hash to look up.
        external_df: DataFrame with an ``"image_hash"`` column.
        threshold: Largest distance still accepted as a match.

    Returns:
        The row (a pandas Series) with the smallest distance that does not
        exceed ``threshold``, or ``None`` when no row qualifies. On a tie the
        first row encountered wins.
    """
    closest_row = None
    closest_distance = float('inf')

    for _, candidate in external_df.iterrows():
        candidate_hash = candidate["image_hash"]
        if pd.notna(candidate_hash):
            gap = hamming_distance(table_hash, candidate_hash)
            # Strict "<" keeps the earliest row among equally close ones.
            if gap <= threshold and gap < closest_distance:
                closest_distance, closest_row = gap, candidate

    return closest_row

# Compute hashes for images.
# A URL may repeat across rows (and across the two datasets), so each distinct
# URL is downloaded and hashed only once; the outcome, including a failed
# download's None, is reused for every later occurrence.
_hash_cache = {}


def _cached_image_hash(url):
    """Return ``get_image_hash(url)``, fetching each distinct URL only once."""
    if url not in _hash_cache:
        _hash_cache[url] = get_image_hash(url)
    return _hash_cache[url]


table_df["image_hash"] = table_df["product_url_img"].apply(_cached_image_hash)
external_df["image_hash"] = external_df["icon"].apply(_cached_image_hash)

# Drop rows where image hash is missing (download or decoding failed)
table_df.dropna(subset=["image_hash"], inplace=True)
external_df.dropna(subset=["image_hash"], inplace=True)

# Match table_df rows with external_df rows using Hamming distance.
# Every remaining table row produces one output row; title/editor stay None
# when no external row is within the matching threshold.
matched_data = []
for _, table_row in table_df.iterrows():
    best_match = find_best_match(table_row["image_hash"], external_df)
    has_match = best_match is not None
    matched_data.append({
        "product_url_img": table_row["product_url_img"],
        "title": best_match["title"] if has_match else None,
        "editor": best_match["editor"] if has_match else None,
    })

# Create a DataFrame from the matched data
final_df = pd.DataFrame(matched_data)

# Debugging: Check merge results
print("Final Data Sample After Merge:\n", final_df.head())

# The columns are only absent when matched_data is empty, i.e. no table row
# survived hashing; in that case nothing is written.
if "title" in final_df.columns and "editor" in final_df.columns:
    final_df.to_csv(output_file, index=False)
    print(f"✅ Enriched dataset saved to {output_file}")
else:
    print("⚠️ ERROR: 'title' or 'editor' missing after merge. Check image hash matching.")
    print("❌ No matches found! Check image hashes or dataset formatting.")

Final Data Sample After Merge:
                                      product_url_img   title       editor
0  https://is5-ssl.mzstatic.com/image/thumb/Purpl...  Tinder  Tinder Inc.
1  https://is5-ssl.mzstatic.com/image/thumb/Purpl...  Tinder  Tinder Inc.
2  https://is5-ssl.mzstatic.com/image/thumb/Purpl...  Tinder  Tinder Inc.
3  https://is5-ssl.mzstatic.com/image/thumb/Purpl...  Tinder  Tinder Inc.
4  https://is5-ssl.mzstatic.com/image/thumb/Purpl...  Tinder  Tinder Inc.
✅ Enriched dataset saved to C:\Users\wawa1\Downloads\ENTRETIEN TECHNIQUE\FINAL.csv
