In [1]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Function to scrape reviews for a given mobile brand and item
def scrape_reviews(brand_xpath):
    """Collect product links, names and prices for one brand filter.

    Despite its name, this function does not read any reviews: it opens the
    search-results page, applies a brand filter and a minimum-price filter,
    then walks the paginated listing and records one row per product.

    Relies on the module-level ``driver`` (Selenium WebDriver) and ``url``
    (search-results URL).

    Args:
        brand_xpath: XPath of the brand filter element to click.

    Returns:
        A list of ``[link, name, price]`` lists, one per product found.
        An empty list is returned when the brand filter cannot be clicked,
        so callers can always iterate over the result.
    """
    # Open Flipkart URL
    driver.get(url)

    # Close the login popup if it appears. Best effort: the popup may be
    # absent or already closed, in which case the wait simply times out.
    try:
        close_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'✕')]"))
        )
        close_button.click()
    except Exception:
        pass

    # Wait for the filter section to be present. The element itself is not
    # used; the wait only gives the sidebar time to render.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="container"]/div/div[3]/div[1]/div[1]'))
        )
    except Exception:
        pass

    # Click on the mobile brand
    try:
        mobile_brand = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, brand_xpath))
        )
        mobile_brand.click()
        time.sleep(10)  # Wait for the page to reload if the click is successful
    except Exception:
        # Return an empty list (not None) so the caller's writerows() call
        # does not fail with a TypeError.
        print("Could not click the brand filter; skipping this brand.")
        return []

    # Select the minimum price. Best effort: scraping continues unfiltered
    # if the dropdown or the option cannot be clicked.
    try:
        min_price = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="container"]/div/div[3]/div[1]/div[1]/div/div[1]/div/section[2]/div[4]/div[1]'))
        )
        min_price.click()

        # Select the 20,000 option from the dropdown
        min_price_option = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[2]/div[4]/div[1]/select/option[4]'))
        )
        min_price_option.click()
        time.sleep(1)  # Allow time for the selection to process
    except Exception:
        pass

    # Scroll to the bottom and back to the top, pausing after each move.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(1)

    product_details = []

    # Walk the result pages until there is no clickable 'Next' button.
    while True:
        time.sleep(3)  # Allow time for page to load

        try:
            product_divs = driver.find_elements(By.CSS_SELECTOR, "div._75nlfW")
            product_names = driver.find_elements(By.CSS_SELECTOR, "div.KzDlHZ")
            product_prices = driver.find_elements(By.CSS_SELECTOR, "div.Nx9bqj._4b5DiR")
        except Exception:
            break  # Could not query the listing at all

        # NOTE(review): names and prices are matched to product containers
        # purely by position; this assumes the three lists line up one to
        # one on the page — confirm against the live markup.
        for i, product_div in enumerate(product_divs):
            try:
                link = product_div.find_element(By.TAG_NAME, "a").get_attribute('href')
                name = product_names[i].text if i < len(product_names) else "N/A"
                price = product_prices[i].text if i < len(product_prices) else "N/A"
            except Exception:
                continue  # Error extracting product details, skip this product

            print(f'Product Link: {link}, Product Name: {name}, Price: {price}')
            product_details.append([link, name, price])

        # Click the 'Next' button to go to the next page, if available
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'Next')]"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            time.sleep(1)
            next_button.click()
            time.sleep(3)  # Wait for the next page to load
        except Exception:
            break  # No more pages or an error occurred

    return product_details


# Search-results URL. scrape_reviews() reads this module-level name.
url = "https://www.flipkart.com/search?q=mobile+phones+&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&as-pos=1&as-type=HISTORY"

# Mobile brands and the XPath of each brand's filter element
brands = {
    "Samsung": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[3]',
    },

    "Google": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[4]',
    },

    "Motorola": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[5]',
    },

    "Vivo": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[6]',
    },

    "Oppo": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[7]',
    },
}

# Destination CSV for the scraped listing rows
csv_file_path = "/Users/praveenkumarm/Desktop/Guvi_Projects/Ecom_Recommandation/Phone_Links.csv"

# Initialize Chrome driver. scrape_reviews() reads this module-level name.
driver = webdriver.Chrome()

# The try/finally guarantees the browser is closed even when scraping or
# file writing raises part-way through.
try:
    # Open Flipkart URL
    driver.get(url)

    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Product Link", "Product Name", "Price"])  # Write headers

        # Scrape each brand in turn and write its rows as soon as they are ready
        for brand, xpaths in brands.items():
            print(f"Scraping reviews for {brand}...")
            product_details = scrape_reviews(xpaths["brand_xpath"])
            # Tolerate a missing result so one failed brand does not abort
            # the whole run with a TypeError inside writerows().
            writer.writerows(product_details or [])
finally:
    # Close the browser
    driver.quit()


Scraping reviews for Samsung...
Product Link: https://www.flipkart.com/samsung-galaxy-s23-5g-phantom-black-128-gb/p/itm1f3efe01d1c61?pid=MOBGNPGZVX4PCTTF&lid=LSTMOBGNPGZVX4PCTTFYWYWBL&marketplace=FLIPKART&q=mobile+phones+&store=tyy%2F4io&srno=s_1_1&otracker=search&otracker1=search&fm=Search&iid=a1941093-33ff-4af1-80ca-84f74ec1b3c5.MOBGNPGZVX4PCTTF.SEARCH&ppt=sp&ppn=sp&ssid=ehjzd11qv40000001730808629076&qH=f696c2fbb0b173a0, Product Name: SAMSUNG Galaxy S23 5G (Phantom Black, 128 GB), Price: ₹39,999
Product Link: https://www.flipkart.com/samsung-galaxy-s23-fe-graphite-256-gb/p/itm3dd39c0d1ec9c?pid=MOBGVTA2GWPSR57X&lid=LSTMOBGVTA2GWPSR57XIIBGXC&marketplace=FLIPKART&q=mobile+phones+&store=tyy%2F4io&srno=s_1_2&otracker=search&otracker1=search&fm=Search&iid=a1941093-33ff-4af1-80ca-84f74ec1b3c5.MOBGVTA2GWPSR57X.SEARCH&ppt=sp&ppn=sp&ssid=ehjzd11qv40000001730808629076&qH=f696c2fbb0b173a0, Product Name: SAMSUNG Galaxy S23 FE (Graphite, 256 GB), Price: ₹35,999
Product Link: https://www.flipkart.c

In [2]:
import pandas as pd

# Load the dataset
file_path = "/Users/praveenkumarm/Desktop/Guvi_Projects/Ecom_Recommandation/Phone_Links.csv"
df = pd.read_csv(file_path)

# Display original data shape
print(f"Original dataset size: {df.shape}")

# Remove duplicates
df = df.drop_duplicates()

# Clean the 'Price' column: strip the currency symbol '₹' and thousands
# separators, then convert to float. regex=False makes both replacements
# literal regardless of the installed pandas version's default.
df['Price'] = (
    df['Price']
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(float)
)

# Keep only rows priced from 35,000 to 40,000 inclusive (rows with a
# missing price are dropped because NaN never falls inside the range).
df = df[df['Price'].between(35000, 40000)]

# Remove rows where the 'Product Link' contains the word 'refurbished'.
# na=False treats a missing link as "no match" so the boolean mask never
# contains NaN, which would make the `~` negation fail.
df = df[~df['Product Link'].str.contains("refurbished", case=False, na=False)]

# Display cleaned data shape
print(f"Cleaned dataset size: {df.shape}")

# Save the cleaned dataset to a new CSV file
output_file_path = "/Users/praveenkumarm/Desktop/Guvi_Projects/Ecom_Recommandation/Cleaned_Phone_Links.csv"
df.to_csv(output_file_path, index=False)

print(f"Cleaned dataset saved to {output_file_path}")


Original dataset size: (923, 3)
Cleaned dataset size: (78, 3)
Cleaned dataset saved to /Users/praveenkumarm/Desktop/Guvi_Projects/Ecom_Recommandation/Cleaned_Phone_Links.csv


In [3]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Function to scrape product details
def scrape_product_details(product_link, max_pages=25):
    """Scrape the name, price, ratings and review texts of one product.

    Loads the product page in the module-level ``driver``, reads the product
    name and price from the page source, opens the 'All Reviews' section and
    then walks the paginated review list.

    Args:
        product_link: URL of the product page to load.
        max_pages: Upper bound on the number of review pages visited, to
            prevent an endless pagination loop. Defaults to 25.

    Returns:
        A list of ``[product_link, product_name, price, rating, review]``
        lists, one per review. An empty list is returned when the
        'All Reviews' control cannot be clicked or no reviews are found.
    """
    # Load the product link in the current tab, then refresh it
    driver.get(product_link)
    driver.refresh()

    # Get page source and parse with Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Scrape the product name (select_one returns None when nothing matches)
    name_tag = soup.select_one("h1._6EBuvT")
    if name_tag is not None:
        product_name = name_tag.get_text(strip=True)
        print(f'Product Name: {product_name}')
    else:
        print("Could not find the product name.")
        product_name = "N/A"

    # Scrape the product price
    price_tag = soup.select_one("div.Nx9bqj.CxhGGd")
    if price_tag is not None:
        price = price_tag.get_text(strip=True)
        print(f'Product Price: {price}')
    else:
        print("Could not find the product price.")
        price = "N/A"

    # Click the 'All Reviews' section
    try:
        all_reviews_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "div._23J90q.RcXBOT"))
        )
        all_reviews_button.click()
    except Exception:
        print("Could not find or click on the 'All Reviews' button.")
        return []

    # Wait for reviews to load
    time.sleep(5)  # Adjust this time if needed

    review_data = []

    # Visit at most `max_pages` review pages; the loop ends early when a page
    # has no reviews or there is no clickable 'Next' button.
    for _ in range(max_pages):
        time.sleep(5)  # Allow time for reviews to load

        # Ratings come from a parsed snapshot of the page; review bodies are
        # live elements so that their 'Read More' links can be clicked.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        ratings = soup.select("div.XQDdHH.Ga3i8K")
        reviews = driver.find_elements(By.CSS_SELECTOR, "div.ZmyHeo")

        if not ratings or not reviews:
            print("No ratings or reviews found. Exiting...")
            break

        # NOTE(review): ratings and reviews are paired by position; this
        # assumes both selectors match the same review cards in the same
        # order — confirm against the live markup.
        for rating, review in zip(ratings, reviews):
            # Expand the review when it has a 'Read More' link. find_element
            # raises when the link is absent, which is the common case, so
            # the failure is deliberately ignored.
            try:
                read_more_button = review.find_element(By.CSS_SELECTOR, "span.b4x-fr")
                driver.execute_script("arguments[0].click();", read_more_button)
                time.sleep(1)  # Allow time for the review to expand
            except Exception:
                pass

            # Flatten the review onto a single line
            full_review_text = review.text.replace('\n', ' ').replace('\r', '').strip()

            print(f'Rating: {rating.text}, Review: {full_review_text}')
            review_data.append([product_link, product_name, price, rating.text, full_review_text])

        # Check for the next button and move to the next page if available
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'Next')]"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            time.sleep(1)  # Allow some time for scrolling
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(5)  # Wait for the next page to load
        except Exception as e:
            print("No more pages or an error occurred:", e)
            break

    return review_data

# Read product links from the CSV file
input_csv_file_path = "/Users/praveenkumarm/Desktop/Guvi_Projects/Ecom_Recommandation/Cleaned_Phone_Links.csv"

with open(input_csv_file_path, mode='r', encoding='utf-8') as file:
    # Extract the 'Product Link' column
    product_links = [row['Product Link'] for row in csv.DictReader(file)]

# Create the output CSV file and write its header row
output_csv_file_path = "/Users/praveenkumarm/Desktop/Guvi_Projects/Ecom_Recommandation/Mobile_Phone_Data.csv"
with open(output_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerow(["Product Link", "Product Name", "Price", "Rating", "Review"])

# Initialize Chrome driver only after the input file has been read, so a
# missing or malformed input does not leave a browser running.
# scrape_product_details() reads this module-level name.
driver = webdriver.Chrome()

# The try/finally guarantees the browser is closed even when scraping or
# file writing raises part-way through.
try:
    # Loop through each product link and scrape details
    for link in product_links:
        review_data = scrape_product_details(link)
        if review_data:  # Only save if there's data to save
            # Append after every product so rows already scraped are kept
            # on disk if a later product fails.
            with open(output_csv_file_path, mode='a', newline='', encoding='utf-8') as file:
                csv.writer(file).writerows(review_data)

        # Reuse the same tab for the next product
        driver.get("about:blank")  # Clear the page before loading the next URL
finally:
    # Quit the browser after all reviews have been scraped
    driver.quit()


Product Name: SAMSUNG Galaxy S23 5G (Phantom Black, 128 GB)  (8 GB RAM)
Product Price: ₹39,999
Rating: 5, Review: Samsung Galaxy s series great features at attractive price like always. pros. 1. Killer Look with handy weight balance 2. Super Smooth scrolling experience in display 3. Fast Fingerprint sensor 4. Audio quality super. 5. Battery hold almost for a day even in heavy use not gaming. 6. gaming experience very smooth without any lag. 7. Phone has Reverse Charging Features to charge other gadgets wirelessly 8. camera... that is the best part of s series , awesome like always .. 9. 5g connectivity.. 10 . last but not least.. samsung UI.. Gives it a smooth user experience.  Cons: 1. While gaming, it alot, i guess no cooling features in it. 2. Battery drain 1 % in every 2 minutes while gaming. 3. Mic quality need to be improved, there is sparkle voice of other person while on speakers 4. No charger, no back cover, i think when we buy expensive phone, we deserve both of them.. 5. res