In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data packages this notebook asks for (WordNet and related).
for corpus_name in ('omw-1.4', 'wordnet', 'wordnet_ic'):
    nltk.download(corpus_name)


# Load the perfume spreadsheet and keep only the rows without missing values.
perfume_data = pd.read_excel('class_perfume_data.xlsx').dropna()


[nltk_data] Downloading package omw-1.4 to /Users/shadow/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shadow/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     /Users/shadow/nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!


In [3]:
# Shared WordNet lemmatizer instance. No active (uncommented) code in the cells
# shown here calls it.
lemmatizer = WordNetLemmatizer()
def preprocess(notes):
    """Return a list holding the lower-cased form of every entry in ``notes``.

    Args:
        notes: Iterable of strings (for example a DataFrame column); each
            entry must provide ``.lower()``.

    Returns:
        A new list with one lower-cased string per input entry, in the same
        order as the input.
    """
    # Per-note lemmatization (split on ',' then lemmatize as nouns) was
    # sketched here earlier but is disabled; only case-folding is applied.
    return [entry.lower() for entry in notes]

# Store the preprocess() result for the 'Notes' column in a new
# 'PreprocessedNotes' column of the same DataFrame.
perfume_data['PreprocessedNotes'] = preprocess(perfume_data['Notes'])


In [4]:
# Turn each perfume's preprocessed note string into one TF-IDF row.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(perfume_data['PreprocessedNotes'])

# Apply K-means clustering
num_clusters = 10
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default makes the cluster labels
# depend on the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)

# labels_ follows the row order of tfidf_matrix, which was built from
# perfume_data, so the labels line up with the DataFrame rows positionally.
perfume_data['Cluster'] = kmeans.labels_



In [5]:
def calculate_cluster_similarity(query, cluster):
    """Return the best cosine similarity between ``query`` and one cluster.

    Args:
        query: Text to compare; it is vectorized with the module-level
            ``vectorizer``.
        cluster: Value of the 'Cluster' column selecting the rows of
            ``perfume_data`` to compare against.

    Returns:
        The largest cosine similarity between the query vector and the
        vectors of the selected rows' 'PreprocessedNotes'.
    """
    members = perfume_data[perfume_data['Cluster'] == cluster]
    member_vectors = vectorizer.transform(members['PreprocessedNotes'])
    query_vec = vectorizer.transform([query])

    # Only one query row is passed in, so row 0 holds its score against
    # every member of the cluster.
    scores = cosine_similarity(query_vec, member_vectors)[0]
    return max(scores)

In [6]:
def calculate_most_similar_cluster(query):
    """Return the cluster index whose best-matching member is closest to ``query``.

    Clusters ``0 .. num_clusters - 1`` are scored in order with
    ``calculate_cluster_similarity``; the first cluster reaching the highest
    score wins. Returns ``None`` when no cluster scores above -1 (including
    the case of zero clusters).
    """
    # Lazy generator: clusters are still scored one at a time, in index order.
    scores = (
        calculate_cluster_similarity(query, cluster_id)
        for cluster_id in range(num_clusters)
    )

    best_cluster, best_score = None, -1
    for cluster_id, score in enumerate(scores):
        # Strict '>' keeps the earliest cluster on ties.
        if score > best_score:
            best_cluster, best_score = cluster_id, score

    return best_cluster

# Try the lookup with a sample query and report the index of the cluster
# whose closest member scores highest against it.
query = "something"
most_similar_cluster = calculate_most_similar_cluster(query)
print("Most similar cluster:", most_similar_cluster)


In [9]:
# Repeat the lookup for a single-note query; rebinds the same `query` and
# `most_similar_cluster` globals as the earlier cell.
query = "citrus"
most_similar_cluster = calculate_most_similar_cluster(query)
print("Most similar cluster:", most_similar_cluster)

Most similar cluster: 0


In [3]:
import requests
from bs4 import BeautifulSoup
import csv


def _enclosing_tile(name_span, name_attrs):
    """Return the largest ancestor of ``name_span`` that holds no other product-name span."""
    tile = name_span
    for ancestor in name_span.parents:
        # An ancestor containing a second name span covers several products,
        # so the previous level was the single-product container.
        if len(ancestor.find_all('span', name_attrs, limit=2)) > 1:
            break
        tile = ancestor
    return tile


def _span_text(scope, css_class):
    """Return the stripped text of the first matching span under ``scope``, or ''."""
    match = scope.find('span', {'class': css_class})
    return match.get_text().strip() if match is not None else ''


def scrape_sephora_perfumes(url='https://www.sephora.com/shop/perfume',
                            output_path='sephora_perfumes.csv',
                            timeout=30):
    """Download a Sephora listing page and write name/brand/price rows to a CSV.

    Args:
        url: Page to fetch. Defaults to the perfume listing.
        output_path: CSV file to (over)write. Defaults to 'sephora_perfumes.csv'.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Writes a header row plus one row per product-name span found,
        then prints "Scraped".

    Raises:
        requests.RequestException: on network failure, timeout, or a non-2xx
            response (previously an error page was parsed silently).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }

    # Send HTTP GET request and retrieve the HTML content
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Each matched span IS a product name. The original code searched inside
    # that span for the name, brand and price, which yields None and crashes
    # on `.text` as soon as any product is found.
    name_attrs = {'class': 'ProductTile-name css-h8cc3p eanm77i0'}
    name_spans = soup.find_all('span', name_attrs)
    # NOTE(review): if the listing is rendered client-side, no name spans are
    # present in this HTML and only the header row is written — confirm.

    # Store the extracted data in a CSV file
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Name', 'Brand', 'Price'])

        for name_span in name_spans:
            # Brand and price sit outside the name span, so look them up in
            # the surrounding single-product container instead.
            tile = _enclosing_tile(name_span, name_attrs)
            writer.writerow([
                name_span.get_text().strip(),
                _span_text(tile, 'css-12z2u5 eanm77i0'),  # '' when missing
                _span_text(tile, 'css-0'),                # '' when missing
            ])

    print("Scraped")
    

# Run the scraper: fetches the Sephora perfume listing, writes
# sephora_perfumes.csv in the working directory, then prints "Scraped".
scrape_sephora_perfumes()

<!DOCTYPE html>
<html lang="en" class="css-r6sydd"><head data-comp="Head "><title>Perfume &amp; Perfumes for Women | Sephora</title><link rel="preload" as="image" href="/img/ufe/icons/me32.svg" media="(min-width: 992px)"/><link rel="preload" as="image" href="/img/ufe/icons/me-active.svg"/><script type="text/javascript" src="https://www.sephora.com/resources/6581a32414642a03f9e3b5f9aca755a11630011008b74" importance="low"></script><script>if (typeof global === "undefined") window.global = window;global.Sephora = global.Sephora || {};Sephora.Util = {};Sephora.Util.Perf = { loadEvents: [] };Sephora.Util.Perf.isReportSupported = function () {    return window.performance && typeof window.performance.mark === "function";};if(Sephora.Util.Perf.isReportSupported()) window.performance.mark("HeadscriptRuntime Loaded");</script><meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no"/><meta id="metaDescription" name="description" content="Shop per

Scraped


NameError: name 'response' is not defined