In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd

# URL of the website
url = "https://www.flipkart.com/search?q=google+mobiles&as=on&as-show=on&otracker=AS_Query_HistoryAutoSuggest_1_14_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_14_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=google+mobiles&requestId=2eba807a-21e6-40ae-b06d-0165df0e72b4&as-searchtext=google+mobiles&p%5B%5D=facets.price_range.from%3D20000&p%5B%5D=facets.price_range.to%3DMax"

def initialize_driver():
    """Start a Chrome WebDriver session, maximize its window and return it.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    # ChromeDriver is expected to be discoverable (e.g. on PATH).
    browser = webdriver.Chrome()
    browser.maximize_window()
    return browser

def load_page(driver, url, wait_seconds=5):
    """Navigate ``driver`` to ``url`` and pause so the page can finish loading.

    Args:
        driver: Object exposing ``get(url)`` (a Selenium WebDriver in this notebook).
        url: Address to open.
        wait_seconds: Pause after navigation, in seconds. Defaults to 5, the
            delay this function always used before it became configurable.
            Zero or negative values skip the pause.
    """
    driver.get(url)
    # time.sleep rejects negative values, so only sleep for a positive delay.
    if wait_seconds > 0:
        time.sleep(wait_seconds)

# Function to scrape product names, links, and prices
def scrape_product_data(driver):
    """Collect product names, links and price texts from the page ``driver`` has open.

    Three independent element lookups are performed, so the returned lists
    are not guaranteed to have the same length or to line up item-for-item.

    Returns:
        Tuple ``(product_names, product_links, product_prices)`` where names
        and prices are the elements' visible text and links are the ``href``
        attribute of each ``<a>`` whose class attribute is exactly ``CGtC98``.
    """
    product_names = []
    for title_node in driver.find_elements(By.CLASS_NAME, 'KzDlHZ'):
        product_names.append(title_node.text)

    product_links = []
    for anchor_node in driver.find_elements(By.XPATH, '//a[@class="CGtC98"]'):
        product_links.append(anchor_node.get_attribute('href'))

    product_prices = []
    for price_node in driver.find_elements(By.CLASS_NAME, 'hl05eU'):
        product_prices.append(price_node.text)

    return product_names, product_links, product_prices

# Function to scrape multiple pages
def scrape_multiple_pages(driver, base_url, num_pages):
    """Scrape product names, links and prices from consecutive result pages.

    Pages 1..``num_pages`` are loaded by appending ``&page=<n>`` to
    ``base_url`` (so ``base_url`` must already contain a query string).

    Args:
        driver: Selenium WebDriver used to load and read each page.
        base_url: Search URL without the ``page`` parameter.
        num_pages: Number of pages to visit; values below 1 visit nothing.

    Returns:
        Tuple ``(all_product_names, all_product_links, all_product_prices)``
        of flat lists accumulated over every page whose three per-page lists
        had equal length. Pages with mismatched lengths are skipped with a
        printed warning so the columns stay aligned.
    """
    all_product_names = []
    all_product_links = []
    all_product_prices = []

    for page in range(1, num_pages + 1):
        load_page(driver, f"{base_url}&page={page}")  # Update the URL to include the page number
        product_names, product_links, product_prices = scrape_product_data(driver)

        # Ensure the lists are of the same length before extending
        if len(product_names) == len(product_links) == len(product_prices):
            all_product_names.extend(product_names)
            all_product_links.extend(product_links)
            all_product_prices.extend(product_prices)
        else:
            print(f"Warning: Mismatched data on page {page}. Names: {len(product_names)}, Links: {len(product_links)}, Prices: {len(product_prices)}")

        # Throttle between page loads only; pausing after the final page
        # would just delay the caller.
        if page < num_pages:
            time.sleep(5)

    return all_product_names, all_product_links, all_product_prices

# Initialize WebDriver and scrape multiple pages
driver = initialize_driver()
try:
    all_product_names, all_product_links, all_product_prices = scrape_multiple_pages(driver, url, 2)  # Adjust number of pages as needed
finally:
    # Close the driver even when scraping raises, so no browser session is left running
    driver.quit()

# Create a DataFrame to store the results
df = pd.DataFrame({
    'Product_Name': all_product_names,
    'Product_Link': all_product_links,
    'Product_Price': all_product_prices  # Updated to include product prices
})

# Save the scraped data
Output_path = "D:\\Project\\Flipkart SA\\flipkart_scrape_google.csv"
df.to_csv(Output_path, index=False)
#df.to_csv('flipkart_scrape_redmi.csv', index=False)  # Save the scraped data to a CSV file

# Preview the result. A notebook cell only renders its last expression,
# so the preview has to come after the save rather than before it.
df.head()

In [11]:
# Reload the raw scrape from the CSV written by the scraping cell
df = pd.read_csv("D:\\Project\\Flipkart SA\\flipkart_scrape_google.csv")

In [13]:
# Summarize column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Product_Name   48 non-null     object
 1   Product_Link   48 non-null     object
 2   Product_Price  48 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


In [15]:
import re

# Function to extract and clean the price (remove rupee symbol and commas)
def extract_clean_price(price_string):
    """Return the first rupee amount found in ``price_string`` as an integer.

    The scraped price text can contain several amounts (selling price,
    struck-through price, discount); only the first ``₹``-prefixed number
    is used. Thousands separators are stripped.

    Args:
        price_string: Raw price text such as ``"₹31,999"``. Values that are
            already numeric are passed through as ``int`` so the cleaning
            cell can be re-run safely. ``None``, NaN and other non-string
            values yield ``None`` instead of raising.

    Returns:
        The price as ``int``, or ``None`` when no price can be extracted.
    """
    # Already-clean numeric values (bool excluded: it is an int subclass but never a price)
    if isinstance(price_string, (int, float)) and not isinstance(price_string, bool):
        # NaN is the only value unequal to itself; infinities cannot become int
        if price_string != price_string or price_string in (float('inf'), float('-inf')):
            return None
        return int(price_string)

    # Missing values read back from CSV arrive as None/NaN rather than text
    if not isinstance(price_string, str):
        return None

    # Rupee symbol, optional whitespace, then digits with optional commas
    match = re.search(r'₹\s*(\d[\d,]*)', price_string)
    if match is None:
        return None
    return int(match.group(1).replace(',', ''))

# Apply the function to the 'Product_Price' column, replacing the raw scraped
# text with the extracted integer price (None where no price was found)
df['Product_Price'] = df['Product_Price'].apply(extract_clean_price)

In [17]:
# Re-inspect dtypes and non-null counts of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Product_Name   48 non-null     object
 1   Product_Link   48 non-null     object
 2   Product_Price  48 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 1.3+ KB


In [19]:
# Keep only products priced at 40000 or less
df = df.loc[df['Product_Price'].le(40000)]

In [21]:
# Remove the row limit so displayed DataFrames are shown in full
pd.set_option("display.max_rows", None)

In [23]:
# Display the current contents of df
df

Unnamed: 0,Product_Name,Product_Link,Product_Price
0,"Google Pixel 7a (Charcoal, 128 GB)",https://www.flipkart.com/google-pixel-7a-charc...,31999
1,"Google Pixel 7a (Snow, 128 GB)",https://www.flipkart.com/google-pixel-7a-snow-...,31999
2,"Google Pixel 7a (Coral, 128 GB)",https://www.flipkart.com/google-pixel-7a-coral...,31999
3,"Google Pixel 8 (Mint, 128 GB)",https://www.flipkart.com/google-pixel-8-mint-1...,37999
4,"Google Pixel 8 (Hazel, 128 GB)",https://www.flipkart.com/google-pixel-8-hazel-...,37999
5,"Google Pixel 8 (Obsidian, 128 GB)",https://www.flipkart.com/google-pixel-8-obsidi...,37999
23,"Google Pixel 7 (Obsidian, 128 GB)",https://www.flipkart.com/google-pixel-7-obsidi...,36999
24,"Google Pixel 7 (Lemongrass, 128 GB)",https://www.flipkart.com/google-pixel-7-lemong...,36999
28,"Google Pixel 7 (Snow, 128 GB)",https://www.flipkart.com/google-pixel-7-snow-1...,36999
29,"Google Pixel 8 (Rose, 128 GB)",https://www.flipkart.com/google-pixel-8-rose-1...,37999


In [25]:
# Exact product names that must be excluded from the frame
products_to_remove = [
    '(Refurbished) Google Pixel 7 (Snow, 128 GB)',
    '(Refurbished) Google Pixel 7a (Snow, 128 GB)',
    '(Refurbished) Google Pixel 7a (Charcoal, 128 GB)',
    '(Refurbished) Google Pixel 7a (Sea, 128 GB)',
    '(Refurbished) Google Pixel 4a (Just Black, 128 GB)',
    '(Refurbished) Google Pixel 7 (Obsidian, 128 GB)',
    '(Refurbished) Google Pixel 7 (Lemongrass, 128 GB)',
]
# Keep only the rows whose Product_Name is not in the exclusion list
df = df.loc[~df['Product_Name'].isin(products_to_remove)]

In [27]:
# (rows, columns) of the current frame
df.shape

(10, 3)

In [29]:
# Save the cleaned product list; index=False leaves the DataFrame index out of the CSV
Output_path = "D:\\Project\\Flipkart SA\\flipkart_cleaned_google.csv"
df.to_csv(Output_path, index=False)

In [None]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Load the CSV file containing product links
# (the same path the cleaning step writes its output to)
file_path = r"D:\Project\Flipkart SA\flipkart_cleaned_google.csv"
df = pd.read_csv(file_path)

# Initialize the Chrome driver
# NOTE(review): this session stays open until driver.quit() is called later in this cell
driver = webdriver.Chrome()

# Function to extract reviews and ratings from a product page
def extract_reviews_and_ratings(driver):
    """Read review texts and star-rating texts from the page ``driver`` has open.

    Waits up to 10 seconds for an element with class ``ZmyHeo`` to be present
    (the wait's timeout error propagates to the caller), expands any
    "Read More" link inside each review, then collects the review text.

    Returns:
        Tuple ``(reviews, ratings)``. ``ratings`` holds the text of every
        rating element found and is padded with ``None`` so it is never
        shorter than ``reviews``; it may be longer.

    NOTE(review): a review skipped because its element went stale shortens
    ``reviews`` without removing the matching rating, so callers that pair the
    two lists by position can end up with shifted ratings — confirm whether
    that matters for the analysis.
    """
    # Block until at least one review body exists on the page
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'ZmyHeo'))
    )

    collected_reviews = []
    for review_node in driver.find_elements(By.CLASS_NAME, "ZmyHeo"):
        try:
            # Expand a truncated review through its "Read More" link, if any
            expanders = review_node.find_elements(By.CLASS_NAME, "b4x-fr")
            if expanders:
                driver.execute_script("arguments[0].click();", expanders[0])
                time.sleep(1)  # Give the full review text time to appear

            collected_reviews.append(review_node.text)
        except StaleElementReferenceException:
            # The element left the DOM while being read; skip it
            continue

    # Star ratings are looked up separately from the review bodies
    star_nodes = driver.find_elements(By.CLASS_NAME, "XQDdHH.Ga3i8K")
    collected_ratings = [star_node.text for star_node in star_nodes]

    # Pad with None when fewer ratings than reviews were found
    shortfall = len(collected_reviews) - len(collected_ratings)
    if shortfall > 0:
        collected_ratings.extend([None] * shortfall)

    return collected_reviews, collected_ratings

# Function to load the page with the correct page number in the URL
def load_page(driver, url):
    """Open ``url`` and block until a review element is present on the page.

    Waits up to 10 seconds for an element with class ``ZmyHeo``; if none
    appears, the error raised by the wait propagates to the caller.
    """
    driver.get(url)
    review_locator = (By.CLASS_NAME, 'ZmyHeo')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(review_locator))

# Scrape reviews and ratings for all product links
all_data = []

num_pages_reviews = 20  # Number of review pages to scrape per product

try:
    # Loop through each product link in the DataFrame
    for index, product_link in df['Product_Link'].items():
        print(f"Scraping product {index + 1}/{len(df)}: {product_link}")

        # Scrape reviews for the current product
        driver.get(product_link)
        time.sleep(2)  # Wait for the product page to load

        # Click on the 'All Reviews' button if it exists
        try:
            wait = WebDriverWait(driver, 10)
            all_reviews_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, '_23J90q.RcXBOT')))
            all_reviews_button.click()
            time.sleep(2)  # Wait for the reviews page to load
        except Exception as e:
            print(f"Could not find 'All Reviews' button for {product_link}: {e}")
            continue  # Skip to the next product if no reviews found

        # Capture the reviews URL once. Building each page URL from
        # driver.current_url would stack parameters (&page=2&page=3...),
        # because current_url already carries the previous page number.
        reviews_base_url = driver.current_url

        # Scrape reviews and ratings across multiple review pages
        for page in range(1, num_pages_reviews + 1):
            try:
                if page > 1:
                    # Navigate to the requested page of reviews
                    load_page(driver, f"{reviews_base_url}&page={page}")

                reviews, ratings = extract_reviews_and_ratings(driver)
            except TimeoutException:
                # No review element appeared in time: this product has no
                # (more) review pages. Move on instead of aborting the run.
                print(f"No more reviews found on page {page}.")
                break

            # Check if reviews are empty, and if so, stop scraping further pages
            if not reviews:
                print(f"No more reviews found on page {page}.")
                break

            # Append reviews and ratings to the all_data list along with the product link
            for review, rating in zip(reviews, ratings):
                all_data.append({
                    'Product_Link': product_link,
                    'Review': review,
                    'Rating': rating
                })

            time.sleep(5)  # Wait before loading the next reviews page
finally:
    # Close the driver whether scraping completed or raised
    driver.quit()

# Convert the collected data into a DataFrame and save it as a CSV file
# Pin the column names so that an empty all_data still yields a CSV with a
# header row; a header-less file cannot be read back with pd.read_csv.
result_df = pd.DataFrame(all_data, columns=['Product_Link', 'Review', 'Rating'])

# Save to CSV
output_file = r"D:\Project\Flipkart SA\google_reviews_ratings_all_products.csv"
result_df.to_csv(output_file, index=False)

print(f"Scraping complete. Data saved to {output_file}")

In [None]:
import pandas as pd
# Load the scraped reviews and ratings (one row per review)
df = pd.read_csv("D:\\Project\\Flipkart SA\\google_reviews_ratings_all_products.csv")

In [None]:
# Load the cleaned product list (name, link, price) to join onto the reviews
df1 = pd.read_csv("D:\\Project\\Flipkart SA\\flipkart_cleaned_google.csv")

In [None]:
# Attach the product columns to each review; rows without a matching link in both frames are dropped
df = df.merge(df1, on='Product_Link', how='inner')

In [None]:
# Save the merged reviews-plus-product table; index=False leaves the DataFrame index out of the CSV
output_file = r"D:\Project\Flipkart SA\google_merged.csv"
df.to_csv(output_file, index=False)