# Myntra Product Information Scraper

Welcome to the Myntra Product Information Scraper! This Jupyter notebook contains a powerful Python script designed to collect detailed information about any product available on Myntra.

## How to Use:

1. **Enter Product Details:** In the input fields, provide the name of the product you're interested in and specify the number of pages to scrape.

2. **Initiate Scraping:** Click the "Scrape Data" button to initiate the web scraping process. The script will navigate through Myntra's pages, collecting brand names, model names, prices, and ratings.

3. **Data Cleaning:** The collected data is cleaned, and additional details such as discounted price, original price, discount percentage, and total reviews are extracted.

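4. **Save Results:** The cleaned data is saved to an Excel file named `myntra_cleaned_data.xlsx`. Note that the output directory is hard-coded in the script (`D:\TeraBoxDownload\Python\data collection - myntra`), so adjust that path to a folder that exists on your machine before running.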

5. **Explore and Analyze:** Dive into the scraped data to analyze trends, make comparisons, and gather insights about your favorite products on Myntra.

Feel free to modify the script and adapt it for different products or purposes. Happy scraping! 🚀🛍️

In [4]:
import tkinter as tk
from tkinter import messagebox
from tkinter import PhotoImage
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# Column names of the raw (uncleaned) listing table, in output order.
_RAW_COLUMNS = ["brand_Name", "model_name", "prices", "ratings"]

# CSS selector for the search-results container; it is the same element the
# parser looks up by class name and the parent used in the "next" selector.
_RESULTS_CSS = 'div.search-searchProductsContainer.row-base'
_NEXT_PAGE_CSS = '#desktopSearchResults > div.search-searchProductsContainer.row-base > section > div.results-showMoreContainer > ul > li.pagination-next > a'


def _parse_listing_page(html):
    """Extract one results page into a DataFrame, or return None if no results container exists."""
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('div', class_='search-searchProductsContainer row-base')
    if table is None:
        return None

    # NOTE(review): each field is collected as an independent list, so if some
    # products lack one of the elements (e.g. no ratings container) the rows
    # of that column shift relative to the others. Fixing that needs the
    # per-product wrapper element, which is not visible here - verify
    # against the live page.
    columns = {
        "brand_Name": [tag.text for tag in table.find_all("h3", class_='product-brand')],
        "model_name": [tag.text for tag in table.find_all("h4", class_='product-product')],
        "prices": [tag.text for tag in table.find_all("div", class_='product-price')],
        "ratings": [tag.text for tag in table.find_all("div", class_='product-ratingsContainer')],
    }
    # dtype=object keeps empty columns string-compatible for the later .str calls.
    return pd.concat(
        [pd.Series(values, name=name, dtype=object) for name, values in columns.items()],
        axis=1,
    )


def _collect_listing_pages(product_name, num_pages):
    """Search Myntra for *product_name* in Chrome and return a list of per-page raw DataFrames."""
    # Local import: the file's import header does not bring this name in.
    from selenium.common.exceptions import TimeoutException

    frames = []
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        driver.maximize_window()
        driver.get("https://www.myntra.com/")
        # Fixed pauses are kept from the original flow; presumably they let
        # the page settle between interactions - TODO confirm whether explicit
        # waits would be sufficient.
        time.sleep(3)

        search = driver.find_element(By.XPATH, "/html/body/div[1]/div/div/header/div[2]/div[3]/input")
        search.click()
        time.sleep(3)
        search.send_keys(product_name)
        time.sleep(3)
        search.send_keys(Keys.ENTER)

        for page_index in range(num_pages):
            # Wait for the results container instead of parsing whatever
            # happens to be in the DOM right after ENTER / the "next" click.
            try:
                container = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, _RESULTS_CSS))
                )
            except TimeoutException:
                print("results container not found, stopping at page", page_index)
                break

            page = _parse_listing_page(driver.page_source)
            if page is None:
                print("results container not found, stopping at page", page_index)
                break
            frames.append(page)
            print("page_collected", page_index)

            # Nothing left to fetch: do not navigate past the last requested page.
            if page_index == num_pages - 1:
                break

            try:
                WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, _NEXT_PAGE_CSS))
                ).click()
            except TimeoutException:
                # No clickable "next" link: keep what was collected so far
                # rather than losing it to an unhandled exception.
                print("no next page after page", page_index)
                break

            # Best effort: give the old results a chance to be replaced so
            # the same page is not parsed twice. If the site updates the
            # container in place this simply times out and we carry on.
            try:
                WebDriverWait(driver, 10).until(EC.staleness_of(container))
            except TimeoutException:
                pass

        print("ALL PAGES COLLECTED")
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    return frames


def _parse_review_counts(ratings):
    """Convert the scraped ratings text into numeric review counts (0 where unparseable).

    Only the trailing number of each string is used, with an optional
    ``k`` suffix meaning "times 1000" (``"1.2k"`` -> ``1200.0``). The scraped
    text may hold more than the count (presumably a rating and a separator in
    front of it - verify against the live page), which is why the pattern is
    anchored at the end of the string.
    """
    text = ratings.fillna('').astype(str).str.strip()
    parts = text.str.extract(r'(\d+(?:\.\d+)?)\s*([kK])?\s*$')
    counts = pd.to_numeric(parts[0], errors='coerce')
    # Scale only the rows that carried the "k" suffix.
    counts = counts.where(parts[1].isna(), counts * 1000)
    return counts.fillna(0)


def _clean_listing_data(raw):
    """Split the raw price text into numeric columns and add ``Total Reviews``."""
    pattern = r'Rs\. (\d+)(?:Rs\. (\d+))?(?:\((\d+)% OFF\))?(?:\(Rs\. (\d+) OFF\))?'
    extracted = raw['prices'].str.extract(pattern)
    extracted.columns = ['discounted_price', 'original_price', 'discount_percentage', 'discount_off']
    extracted = extracted.apply(pd.to_numeric, errors='coerce')

    data = pd.concat([raw.drop(columns=['prices']), extracted], axis=1)
    data['Total Reviews'] = _parse_review_counts(data['ratings'])

    # Where the listing gave no percentage, derive it from the two prices.
    # Assigned back explicitly: fillna(inplace=True) on a column selection is
    # chained assignment and does not reliably update the frame.
    computed = (data['original_price'] - data['discounted_price']) / data['original_price'] * 100
    data['discount_percentage'] = data['discount_percentage'].fillna(computed)
    return data


def scrape_data(product_name, num_pages,
                output_path=r'D:\TeraBoxDownload\Python\data collection - myntra\myntra_cleaned_data.xlsx'):
    """Scrape Myntra search results for a product, clean them and save them to Excel.

    Opens Chrome, searches for ``product_name``, collects up to ``num_pages``
    result pages (stopping early when no results container or no "next" link
    is found), cleans the collected text, writes the result to
    ``output_path`` and shows a completion dialog.

    Args:
        product_name: Text typed into the site's search box.
        num_pages: Maximum number of result pages to collect; values below 1
            collect nothing.
        output_path: Destination ``.xlsx`` file. Defaults to the path the
            script has always written to.

    Returns:
        The cleaned ``pandas.DataFrame`` with columns ``brand_Name``,
        ``model_name``, ``ratings``, ``discounted_price``, ``original_price``,
        ``discount_percentage``, ``discount_off`` and ``Total Reviews``.
    """
    frames = _collect_listing_pages(product_name, num_pages)
    if frames:
        # One concat at the end instead of growing a frame inside the loop.
        raw = pd.concat(frames, ignore_index=True, sort=False)
    else:
        raw = pd.DataFrame(columns=_RAW_COLUMNS)

    cleaned = _clean_listing_data(raw)

    # Save the cleaned data to an Excel file
    cleaned.to_excel(output_path)

    messagebox.showinfo("Scraping Complete", "Data has been scraped and cleaned!")
    return cleaned


def exit_program():
    """Destroy the main window ``root``; used as the Exit button's command."""
    root.destroy()
    
# Create the main window
root = tk.Tk()
root.title("Myntra Web Scraping")

# Load the Myntra wallpaper image. The path is a raw string so the
# backslashes are not read as (invalid) escape sequences; the resulting
# path value is unchanged.
try:
    bg_image = PhotoImage(file=r"D:\TeraBoxDownload\Python\data collection - myntra\wallpaper.png")
except tk.TclError:
    # The wallpaper is decorative only: run without it when the file is
    # missing or unreadable instead of aborting startup.
    bg_image = None

# Set the background image. It is created before every other widget so it
# sits at the bottom of the stacking order and does not cover the
# description label.
if bg_image is not None:
    background_label = tk.Label(root, image=bg_image)
    background_label.place(relwidth=1, relheight=1)

# Add a description label
description_label = tk.Label(root, text="Welcome to My Myntra Web Scraping Tool!\nEnter the product name and the number of pages to scrape.")
description_label.pack(side=tk.TOP, pady=10)

# Create labels and entry for product name
label_name = tk.Label(root, text="Enter Product Name:", bg='white')
label_name.pack(side=tk.LEFT, anchor='center')

product_name_entry = tk.Entry(root)
product_name_entry.pack(side=tk.LEFT, anchor='center')

# Create labels and entry for number of pages
label_pages = tk.Label(root, text="Enter Number of Pages:", bg='white')
label_pages.pack(side=tk.LEFT, anchor='center')

num_pages_entry = tk.Entry(root)
num_pages_entry.pack(side=tk.LEFT, anchor='center')


def start_scrape():
    """Validate both input fields, then run ``scrape_data`` with them.

    Invalid input is reported in an error dialog instead of raising inside
    the Tk callback, where the user would see nothing.
    """
    product_name = product_name_entry.get().strip()
    if not product_name:
        messagebox.showerror("Invalid Input", "Please enter a product name.")
        return

    try:
        num_pages = int(num_pages_entry.get())
    except ValueError:
        messagebox.showerror("Invalid Input", "Number of pages must be a whole number.")
        return
    if num_pages < 1:
        messagebox.showerror("Invalid Input", "Number of pages must be at least 1.")
        return

    try:
        scrape_data(product_name, num_pages)
    except Exception as exc:
        # Top-level GUI boundary: tell the user, then re-raise so the
        # traceback is still reported by Tk.
        messagebox.showerror("Scraping Failed", str(exc))
        raise


# Create a button to trigger web scraping
scrape_button = tk.Button(root, text="Scrape Data", command=start_scrape)
scrape_button.pack(side=tk.LEFT, anchor='center')

# Create an exit button
exit_button = tk.Button(root, text="Exit", command=exit_program)
exit_button.pack()

# Start the GUI event loop
root.mainloop()


page_collected 0
page_collected 1
page_collected 2
page_collected 3
page_collected 4
page_collected 5
page_collected 6
page_collected 7
page_collected 8
page_collected 9
ALL PAGES COLLECTED
