In [1]:
import os
import time
import json
import urllib.request
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

import pandas as pd

from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

from bs4 import BeautifulSoup

In [2]:
# Browser session shared by the rest of the notebook: `get_results` reads the
# module-level `driver` created here.
HOME_URL = 'https://www.myntra.com/'
STARTUP_PAUSE_SECONDS = 5  # fixed pause after requesting the home page (presumably to let it finish loading)

chrome_options = Options()  # default Chrome options, nothing customised
driver = webdriver.Chrome(options=chrome_options)

driver.get(HOME_URL)
time.sleep(STARTUP_PAUSE_SECONDS)

In [3]:
def _page_url(search_url, page_number):
    """Return `search_url` with its `p` query parameter set to `page_number`."""
    parsed_url = urlparse(search_url)
    query_params = parse_qs(parsed_url.query)
    query_params['p'] = [str(page_number)]
    modified_query = urlencode(query_params, doseq=True)
    return urlunparse(parsed_url._replace(query=modified_query))


def _stripped_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or "NA" when the tag is None."""
    return "NA" if tag is None else tag.text.strip()


def _parse_product(product, search_text):
    """Turn one `li.product-base` tag into a result row (dict of strings)."""
    rating_box = product.find('div', class_='product-ratingsContainer')
    rating_tag = None if rating_box is None else rating_box.find('span')

    # The price is whatever follows the first "Rs. " marker in the price block;
    # it stays "NA" when the block or the marker is missing.
    price = "NA"
    price_tag = product.find('div', class_='product-price')
    if price_tag is not None:
        price_parts = price_tag.text.strip().split("Rs. ")
        if len(price_parts) > 1:
            price = price_parts[1]

    return {
        "name": _stripped_text(product.find('h4', class_='product-product')),
        "brand": _stripped_text(product.find('h3', class_='product-brand')),
        "price": price,
        "rating": _stripped_text(rating_tag),
        "search_input": search_text,
    }


def get_results(search_text, max_pages=5):
    """Search the site for `search_text` and scrape the listed products.

    Uses the module-level Selenium `driver`: types the query into the
    `.desktop-searchBar` element, submits it, then loads result pages
    1..`max_pages` by setting the `p` query parameter on the resulting URL.

    Parameters
    ----------
    search_text : str
        Query typed into the search bar; also stored in the `search_input`
        column of every returned row.
    max_pages : int, default 5
        Maximum number of result pages to load.

    Returns
    -------
    pandas.DataFrame
        One row per `li.product-base` element found, with string columns
        `name`, `brand`, `price`, `rating` and `search_input`. A field that
        cannot be located in the HTML is recorded as "NA".

    Notes
    -----
    If loading a page fails, a `RuntimeWarning` is issued and the rows
    collected so far are returned. `KeyboardInterrupt` is not swallowed, so
    a long scrape can be stopped by the user.
    """
    import warnings  # local import keeps this cell independent of the import cell

    search_input = driver.find_element('css selector', '.desktop-searchBar')
    # send_keys appends to whatever is already in the box, so clear it first
    # in case text from an earlier search is still there.
    search_input.clear()
    search_input.send_keys(search_text)
    search_input.send_keys(Keys.RETURN)

    search_url = driver.current_url

    records = []
    for page_number in range(1, max_pages + 1):
        curr_url = _page_url(search_url, page_number)

        try:
            driver.get(curr_url)
        except Exception as exc:  # not a bare except: Ctrl-C must still propagate
            warnings.warn(
                f"Stopped paging for {search_text!r} at page {page_number}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
            break

        # Scroll to the bottom and pause briefly before reading the page source.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        records.extend(
            _parse_product(product, search_text)
            for product in soup.find_all('li', class_='product-base')
        )

    # Passing `columns` keeps the schema stable even when nothing was found.
    return pd.DataFrame(
        records,
        columns=["name", "brand", "price", "rating", "search_input"],
    )
        

In [4]:
# Queries fed to `get_results`, one scrape per entry. Order matters: rows are
# collected in this order and later de-duplicated keeping the first occurrence.
search_texts = [
    "Socks",
    "Shorts men", "Shorts women",
    "Shirts men", "Shirts women",
    "Belts",
    "Ethnicwear men", "Ethnicwear women",
    "Jeans men", "Jeans women",
    "Jackets men", "Jackets women",
    "T-shirts men", "T-shirts women",
    "Dresses women",
    "Blazers men", "Blazers women",
    "Sweaters men", "Sweaters women",
    "Swimwear men", "Swimwear women",
    "Hoodies men", "Hoodies women",
    "Activewear men", "Activewear women",
    "Skirts women",
    "Pants men", "Pants women",
    "Coats men", "Coats women",
    "Vests", "Cardigans",
    "Leggings women",
    "Formalwear men", "Formalwear women",
    "Tracksuits men", "Tracksuits women",
    "Scarves", "Gloves", "Hats",
    "Underwear men", "Underwear women",
    "Sleepwear", "Accessories",
    "Footwear men", "Footwear women",
    "Backpacks", "Sunglasses", "Jewelry", "Watches",
    "Ties", "Handbags", "Wallets", "Umbrellas",
    "Sweatpants men",
    "Leggings men",
    "Jumpsuits",
    "Rompers women",
    "Beanies", "Ponchos",
]


In [5]:
# Scrape every query, collecting one frame per query and concatenating once at
# the end (concatenating inside the loop recopies all earlier rows each time).
frames = []
try:
    for search_text in tqdm(search_texts):
        frames.append(get_results(search_text))
finally:
    # Runs even if the scrape is interrupted or fails part-way, so `df` still
    # holds the rows from every query that completed.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

100%|██████████| 60/60 [16:35<00:00, 16.59s/it]


In [15]:
# Drop exact duplicate rows (first occurrence kept) and renumber the index
# from 0; the bare `df` on the last line displays the result.
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,name,brand,price,rating,search_input
0,Men Pack of 3 Solid Ankle-Length Socks,Jockey,389,4.4,Socks
1,Men 3 Ankle-Length Socks,ADIDAS,395,4.4,Socks
2,Pack Of 5 Ankle Length Socks,HRX by Hrithik Roshan,406,4.4,Socks
3,Sport Men Set of 3 Crew Socks,Jockey,479,4.5,Socks
4,Men Pack 3 Above Ankle Socks,HRX by Hrithik Roshan,232,4.5,Socks
...,...,...,...,...,...
13170,Self Design Poncho,Soch,899,4.3,Ponchos
13171,Women Green Self Design Poncho,JoE Hazel,1133,,Ponchos
13172,Women Cable Knit Poncho,JoE Hazel,1382,3.9,Ponchos
13173,Women Fair Isle Poncho,JoE Hazel,1133,4.5,Ponchos


In [16]:
# Create the output folder first: to_csv raises if the parent directory does
# not exist, which would discard the scraped data on a fresh checkout.
os.makedirs("./data", exist_ok=True)
df.to_csv("./data/raw_data.csv", index=False)