# Data collection

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re
import requests
from scrapy import Selector
import pandas as pd
from tqdm.notebook import tqdm
import os



# lamps-lightings

In [2]:

from tqdm.notebook import tqdm

import time


# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/lamps-lightings"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()

details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))


def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Lamps & Lighting"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in tqdm(product_urls[:MAX_PRODUCTS], desc="Scraping Products"):
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Best effort: report the failure and keep a placeholder for this row.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)


df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_lamps_lightings.csv")
df.to_csv(file_path, index=False)

print(f"✅ Data successfully saved to {file_path}")


Scraping Products:   0%|          | 0/24 [00:00<?, ?it/s]

✅ Data successfully saved to C:/Project/Data\vaaree_lamps_lightings.csv


# wall-decor

In [3]:


# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/wall-decor"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()
details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))

def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Wall Decor"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in product_urls[:MAX_PRODUCTS]:
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still stops the cell,
        # and report the failure instead of hiding it.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)

df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_walldecor.csv")
df.to_csv(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:/Project/Data\vaaree_walldecor.csv


# artificial-flowers-plants

In [5]:

# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/artificial-flowers-plants"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()
details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))

def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Artificial Flowers & Plants"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in product_urls[:MAX_PRODUCTS]:
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # The guards check the index that is actually read; a plain truthiness
        # test let short lists raise IndexError and be recorded as "Error".
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still stops the cell,
        # and report the failure instead of hiding it.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)

    # Short pause between product requests.
    time.sleep(0.1)

df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_artificial_flowers_plants.csv")
df.to_csv(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:/Project/Data\vaaree_artificial_flowers_plants.csv


# clocks

In [6]:

# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/clocks"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()
details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))

def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Clocks"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in product_urls[:MAX_PRODUCTS]:
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # Index 3 needs more than 3 items; the previous "> 2" guard raised
        # IndexError for a 3-item list and recorded "Error" for the product.
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still stops the cell,
        # and report the failure instead of hiding it.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)

    # Short pause between product requests.
    time.sleep(0.5)

df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_clocks.csv")
df.to_csv(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:/Project/Data\vaaree_clocks.csv


# showpieces-vases

In [7]:


# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/showpieces-vases"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()
details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))

def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Showpieces, Vases & Accent Bowls"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in product_urls[:MAX_PRODUCTS]:
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still stops the cell,
        # and report the failure instead of hiding it.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)

    # Short pause between product requests.
    time.sleep(0.5)

df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_showpieces_vases.csv")
df.to_csv(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:/Project/Data\vaaree_showpieces_vases.csv


# mirrors

In [8]:

# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/mirrors"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()
details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))

def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Mirrors"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in product_urls[:MAX_PRODUCTS]:
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still stops the cell,
        # and report the failure instead of hiding it.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)

    # Short pause between product requests.
    time.sleep(0.5)

df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_mirrors.csv")
df.to_csv(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:/Project/Data\vaaree_mirrors.csv


# candle-holders-tealight-candle-holders

In [9]:

# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/candle-holders-tealight-candle-holders"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()
details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))

def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Candle Stand & Holders"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in product_urls[:MAX_PRODUCTS]:
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still stops the cell,
        # and report the failure instead of hiding it.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)

    # Short pause between product requests.
    time.sleep(0.5)

df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_candle_holders.csv")
df.to_csv(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:/Project/Data\vaaree_candle_holders.csv


# religious-spiritual-items

In [10]:


# Directory the scraped CSV is written to; created if it does not exist yet.
save_dir = r"C:/Project/Data"
os.makedirs(save_dir, exist_ok=True)

# Download the collection listing page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the run on an
# HTTP error instead of parsing an error page as if it were the listing.
collection_url = "https://vaaree.com/collections/religious-spiritual-items"
collection_response = requests.get(collection_url, timeout=30)
collection_response.raise_for_status()
html_content = collection_response.text
selector = Selector(text=html_content)

# Relative product links found on the listing, turned into absolute URLs.
product_links = selector.css('a.absolute.start-0.top-0.-z-50.h-0.w-0.overflow-hidden::attr(href)').getall()
product_urls = ["https://vaaree.com" + link for link in product_links if link.startswith('/products')]

# Every field is collected as an independent, page-wide list.
# NOTE(review): the lists are matched up by position only, so a product card
# that lacks one field would shift all later values of that column — confirm
# against the page markup.
names = selector.css('h3::text').getall()
types = selector.css('p.body6.font-light.line-clamp-2::text').getall()
current_prices = selector.css('p.body4.font-semibold::text').getall()
details = selector.css('div p.body8::text').getall()
# Keep only the body8 texts that start with a rupee amount.
previous_prices = [item for item in details if re.match(r'₹[\d,]+', item)]
discounts = selector.css('p.body8.font-semibold::text').getall()
delivery_info = selector.css('span.font-semibold::text').getall()

# Length of the longest column; shorter columns are padded up to it below.
max_entries = max(len(names), len(types), len(current_prices), len(product_urls), len(previous_prices), len(discounts), len(delivery_info))

def extend_list(lst, max_entries):
    """Return a new list holding ``lst`` right-padded with ``None``.

    The result has ``max_entries`` items when ``lst`` is shorter than that;
    a list that is already long enough is copied unchanged (never truncated).
    """
    missing = max_entries - len(lst)
    padding = [None] * missing  # a negative count yields an empty list
    return lst + padding

# Pad every column to the same length so they can be combined in one DataFrame.
names = extend_list(names, max_entries)
types = extend_list(types, max_entries)
current_prices = extend_list(current_prices, max_entries)
product_urls = extend_list(product_urls, max_entries)
previous_prices = extend_list(previous_prices, max_entries)
discounts = extend_list(discounts, max_entries)
delivery_info = extend_list(delivery_info, max_entries)
category = ["Religious & Spiritual Items"] * max_entries

# Only the first MAX_PRODUCTS listing entries are visited and exported.
MAX_PRODUCTS = 24
# Seconds to wait for a product page before giving up on it.
REQUEST_TIMEOUT = 30

ratings = []
reviews = []

for url in product_urls[:MAX_PRODUCTS]:
    try:
        product_response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures rather than parsing an error page.
        product_response.raise_for_status()
        product_selector = Selector(text=product_response.text)

        # Rating and review count come from the same selector: query it once.
        # NOTE(review): positions 3 and 4 depend on the page layout — confirm.
        body6_texts = product_selector.css('p.body6.body6::text').getall()
        rating = body6_texts[3] if len(body6_texts) > 3 else "No Rating"
        review = body6_texts[4] if len(body6_texts) > 4 else "No Review"

    except Exception as e:
        # Catch Exception (not a bare except) so Ctrl-C still stops the cell,
        # and report the failure instead of hiding it.
        print(f"Error scraping {url}: {e}")
        rating = review = "Error"

    # Exactly one entry per URL keeps these columns as long as the others.
    ratings.append(rating)
    reviews.append(review)

df = pd.DataFrame({
    'Product Title': names[:MAX_PRODUCTS],
    'Type': types[:MAX_PRODUCTS],
    'Current Price': current_prices[:MAX_PRODUCTS],
    'Product URL': product_urls[:MAX_PRODUCTS],
    'Original Price': previous_prices[:MAX_PRODUCTS],
    'Discount': discounts[:MAX_PRODUCTS],
    'Estimated Delivery': delivery_info[:MAX_PRODUCTS],
    'Category': category[:MAX_PRODUCTS],
    'Rating': ratings,
    'Reviews': reviews
})

file_path = os.path.join(save_dir, "vaaree_religious_spiritual_items.csv")
df.to_csv(file_path, index=False)

print(f"Data successfully saved to {file_path}")


Data successfully saved to C:/Project/Data\vaaree_religious_spiritual_items.csv


In [11]:

# Merge every per-category CSV in data_dir into a single file.
data_dir = r"C:/Project/Data"
output_file = os.path.join(data_dir, "merged_vaaree_data.csv")
merged_name = os.path.basename(output_file)

# The merged file is written into the same directory it is built from, so it
# must be excluded from the inputs: otherwise re-running this cell reads the
# previous merge back in and duplicates every row. Sorting makes the row order
# independent of the order os.listdir() happens to return.
all_files = [
    os.path.join(data_dir, file)
    for file in sorted(os.listdir(data_dir))
    if file.endswith('.csv') and file != merged_name
]
df = pd.concat((pd.read_csv(file) for file in all_files), ignore_index=True)
df.to_csv(output_file, index=False)




In [12]:
# Reload the merged dataset from disk and show it as the cell's output.
merged_csv_path = "C:\\Project\\Data\\merged_vaaree_data.csv"
df = pd.read_csv(merged_csv_path)
df

Unnamed: 0,Product Title,Type,Current Price,Product URL,Original Price,Discount,Estimated Delivery,Category,Rating,Reviews
0,Bloom Blush Planter - Set Of Two,Metal,₹799,https://vaaree.com/products/bloom-blush-plante...,"₹2,999",74% off,Delivery Tomorrow,Artificial Flowers & Plants,4.2,5 Reviews
1,Smiley Swing Planter,Resin,₹890,https://vaaree.com/products/smiley-swing-planter,₹990,11% off,"Mon, 10th Mar",Artificial Flowers & Plants,5,2 Reviews
2,(Coffee & White) Faux Pampas Grass Sticks (45....,Plastic,₹899,https://vaaree.com/products/faux-pampas-grass-...,"₹3,200",72% off,"Mon, 10th Mar",Artificial Flowers & Plants,4,1 Reviews
3,Faux Tropic Anthurium Silk Plant With Pot - 2....,PVC,₹959,https://vaaree.com/products/faux-tropic-anthur...,"₹1,750",46% off,"Mon, 10th Mar",Artificial Flowers & Plants,3,3 Reviews
4,Faux Purple Flower Vine With Metal Wall Stand,Plastic,₹699,https://vaaree.com/products/faux-purple-flower...,"₹1,299",47% off,"Mon, 10th Mar",Artificial Flowers & Plants,5,2 Reviews
...,...,...,...,...,...,...,...,...,...,...
187,Shyam Art Painting - Set Of Two,Wood,₹540,https://vaaree.com/products/shyam-art-painting...,"₹2,000",73% off,"Mon, 10th Mar",Wall Decor,4.0,2 Reviews
188,Gold Leaf Tree Wall Accent,Iron,"₹1,055",https://vaaree.com/products/gold-leaf-tree-wal...,"₹1,470",29% off,Delivery Tomorrow,Wall Decor,3.7,3 Reviews
189,Memories Encased Photo Frame (Black) - Set Of Ten,MDF,"₹1,027",https://vaaree.com/products/memories-encased-p...,"₹2,000",49% off,"Mon, 10th Mar",Wall Decor,3.0,2 Reviews
190,Root Riva Wall Accent,Iron,"₹1,271",https://vaaree.com/products/root-riva-wall-accent,"₹1,770",29% off,Delivery Tomorrow,Wall Decor,3.8,4 Reviews
