# Myntra T-Shirt Review Data Harvesting & Product Details Application

In [1]:
# Importing necessary libraries for web scraping and data manipulation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import numpy as np 

In [2]:
# Global variable to keep track of file names
file_name_counter = 0


def _text_or_nan(soup, tag, css_class):
    """Return the text of the first ``tag`` element with ``css_class``, or NaN if absent."""
    element = soup.find(tag, attrs={'class': css_class})
    return element.text if element is not None else np.nan


def _parse_overall_rating(soup):
    """Return ``(overall_rating, votes)`` from the 'index-overallRating' div, or NaNs."""
    element = soup.find('div', attrs={'class': 'index-overallRating'})
    if element is None:
        return np.nan, np.nan
    rating = element.text
    # The first three characters are taken as the rating, the rest as the
    # vote count; rstrip removes the trailing characters of "Ratings ".
    return rating[:3], rating[4:].rstrip('Ratings ')


def _parse_fabric_and_neck(soup):
    """Return ``(fabric, neck)`` from the 'index-tableContainer' div, NaN where missing."""
    container = soup.find('div', {'class': 'index-tableContainer'})
    if container is None:
        return np.nan, np.nan

    specification = {}
    # Only look at direct child *tags*: iterating the container itself also
    # yields text nodes, which have no usable ``find`` and would abort parsing.
    for row in container.find_all(True, recursive=False):
        key = row.find('div', attrs={'class': 'index-rowKey'})
        value = row.find('div', attrs={'class': 'index-rowValue'})
        if key is not None and value is not None:
            specification[key.text.strip()] = value.text.strip()

    # .get() keeps one missing attribute from discarding the other.
    return specification.get('Fabric', np.nan), specification.get('Neck', np.nan)


def _scrape_reviews(driver, product_id, scroll_count=35, scroll_pause=1):
    """Collect review data for ``product_id`` from its Myntra reviews page.

    Returns a dict keyed by the review-related output columns. Each value is a
    list of strings, or NaN when that piece of data could not be collected.
    All values are NaN when ``product_id`` is not a string or when no review
    element appears within the wait timeout.
    """
    empty = {
        'Customer_Rated_Rating': np.nan,
        'Customer_Review_Text': np.nan,
        'Reviewers_Username': np.nan,
        'Date_of_Review': np.nan,
        'Reviewers_Product_Images': np.nan,
    }
    # A missing product id comes through as NaN; there is no page to visit.
    if not isinstance(product_id, str):
        return empty

    driver.get('https://www.myntra.com/reviews/' + product_id)

    wait = WebDriverWait(driver, 3)
    review_locator = (By.CSS_SELECTOR, '.user-review-reviewTextWrapper')

    # Scraping is best-effort: any failure degrades to NaN instead of aborting
    # the batch. ``except Exception`` (not a bare except) lets Ctrl-C through.
    try:
        wait.until(EC.presence_of_element_located(review_locator))

        # Reviewer-uploaded image URLs
        try:
            images = []
            for image in driver.find_elements(By.CSS_SELECTOR, '.image-thumb-wrapper-image'):
                photo = image.get_attribute('src')
                if photo:
                    images.append(photo)
        except Exception:
            images = np.nan

        # Scroll down to load more reviews
        body = driver.find_element(By.TAG_NAME, 'body')
        for _ in range(scroll_count):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(scroll_pause)

        # Wait for the reviews to be present after scrolling
        wait.until(EC.presence_of_element_located(review_locator))

        # Review text
        try:
            review_text = [
                review.text.replace('\n', '')
                for review in driver.find_elements(*review_locator)
            ]
        except Exception:
            review_text = np.nan

        # Per-review star rating (text of the first span in each star wrapper)
        try:
            customer_rating = []
            for star in driver.find_elements(By.CSS_SELECTOR, '.user-review-starWrapper'):
                spans = star.find_elements(By.TAG_NAME, 'span')
                if spans:
                    customer_rating.append(spans[0].text)
        except Exception:
            customer_rating = np.nan

        # Reviewer name (first span) and review date (second span)
        try:
            reviewer_name = []
            review_date = []
            for block in driver.find_elements(By.CSS_SELECTOR, '.user-review-left'):
                spans = block.find_elements(By.TAG_NAME, 'span')
                if spans:
                    reviewer_name.append(spans[0].text)
                    # Keep both lists the same length even if the date span is missing.
                    review_date.append(spans[1].text if len(spans) > 1 else np.nan)
        except Exception:
            reviewer_name = np.nan
            review_date = np.nan
    except Exception:
        return empty

    return {
        'Customer_Rated_Rating': customer_rating,
        'Customer_Review_Text': review_text,
        'Reviewers_Username': reviewer_name,
        'Date_of_Review': review_date,
        'Reviewers_Product_Images': images,
    }


# Function for scraping Myntra data
def myntra_data_scrap(link):
    """Scrape product details and reviews for each URL in ``link`` and save them to CSV.

    For every product URL the page is opened in an Edge browser, the product
    fields are parsed from the page HTML, and the product's reviews page is
    then visited to collect review text, ratings, reviewer names, dates and
    images. Any field that cannot be found is stored as NaN.

    The rows are written to ``mynthra_tshirt_data_<n>.csv``, where ``<n>`` is
    the module-level ``file_name_counter`` (incremented on every call), and a
    completion message is printed.

    Parameters
    ----------
    link : iterable of str
        Product page URLs to scrape.

    Returns
    -------
    None
    """
    global file_name_counter
    file_name_counter += 1

    # Dictionary to store product information
    Product_Information = {'Product_link': [],
                           'Product_id': [],
                           'Product_Name': [],
                           'Product_Description': [],
                           'Discounted_Price': [],
                           'Original_Price': [],
                           'Discount_Percentage': [],
                           'Fabric_Material': [],
                           'Neck_Type': [],
                           'overall_rating': [],
                           'votes': [],
                           'Customer_Rated_Rating': [],
                           'Customer_Review_Text': [],
                           'Reviewers_Username': [],
                           'Date_of_Review': [],
                           'Reviewers_Product_Images': []
                           }

    # Create a new instance of the Edge driver
    driver = webdriver.Edge()
    try:
        # Loop through the provided links
        for i in link:
            driver.get(i)

            # Wait for 3 seconds to let the page load
            time.sleep(3)

            # Parse the HTML content with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            overall_rating, votes = _parse_overall_rating(soup)
            fabric, neck = _parse_fabric_and_neck(soup)
            product_id = _text_or_nan(soup, 'span', 'supplier-styleId')

            row = {
                'Product_link': i,
                'Product_id': product_id,
                'Product_Name': _text_or_nan(soup, 'h1', 'pdp-title'),
                'Product_Description': _text_or_nan(soup, 'h1', 'pdp-name'),
                'Discounted_Price': _text_or_nan(soup, 'span', 'pdp-price'),
                'Original_Price': _text_or_nan(soup, 'span', 'pdp-mrp'),
                'Discount_Percentage': _text_or_nan(soup, 'span', 'pdp-discount'),
                'Fabric_Material': fabric,
                'Neck_Type': neck,
                'overall_rating': overall_rating,
                'votes': votes,
            }
            # Everything above was read from the already-parsed soup, so it is
            # safe to navigate the driver away to the reviews page now.
            row.update(_scrape_reviews(driver, product_id))

            for column, value in row.items():
                Product_Information[column].append(value)
    finally:
        # Close the browser even if a page load fails part-way through the batch
        driver.quit()

    df = pd.DataFrame(Product_Information)

    file_name = f'mynthra_tshirt_data_{file_name_counter}.csv'

    df.to_csv(file_name, index=False, encoding='utf-8-sig')

    print(file_name, 'Completed')


## Data Harvesting with a User-Defined Batch Size

In [5]:
def create_batches(link_list, batch_size, current_batch=1):
    """Scrape ``link_list`` in consecutive batches of ``batch_size`` links.

    Each batch is passed to ``myntra_data_scrap``. Processing starts at batch
    number ``current_batch`` (1-based), so an interrupted run can be resumed
    by passing the number of the first batch that still needs to be scraped.
    Nothing is scraped when there are no links at or after the starting batch.

    Parameters
    ----------
    link_list : list of str
        All product URLs.
    batch_size : int
        Number of links per batch; must be positive.
    current_batch : int, optional
        1-based number of the first batch to process (default 1).

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``current_batch`` is less than 1.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')
    if current_batch < 1:
        raise ValueError(f'current_batch must be >= 1, got {current_batch!r}')

    # Iterate instead of recursing: the number of batches can exceed Python's
    # recursion limit when batch_size is small relative to the list length.
    first_start = (current_batch - 1) * batch_size
    for start_index in range(first_start, len(link_list), batch_size):
        myntra_data_scrap(link_list[start_index:start_index + batch_size])
        



## Read the dataset of product web links

In [3]:
# Load the CSV of product page URLs; its 'links' column is turned into a list further down
df_link = pd.read_csv('mynthra_tshirt_link_full.csv')
# Print a summary of the frame (row count, columns, dtypes) to confirm the file loaded
df_link.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121197 entries, 0 to 121196
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   links   121197 non-null  object
dtypes: object(1)
memory usage: 947.0+ KB


In [None]:
# Creating a list from the dataframe
tshirt_link = df_link['links'].tolist()

# Mention the batch size
batch_size = 1000
# Pass the list built above (the original referenced an undefined name, tshirt_link_list)
create_batches(tshirt_link, batch_size)
