## Web scraping

In [14]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [15]:
# Flipkart search-results page for "nokia smartphone", sorted by recency.
# The query string is split one parameter per line; adjacent literals are
# concatenated by Python into a single URL string.
url = (
    'https://www.flipkart.com/search'
    '?q=nokia+smartphone'
    '&otracker=search'
    '&otracker1=search'
    '&marketplace=FLIPKART'
    '&as-show=on'
    '&as=off'
    '&sort=recency_desc'
)

In [16]:
def switch_tab(source, tab_index):
    """Open the link held by ``source`` in a new browser tab and return its parsed HTML.

    Relies on the module-level Selenium ``driver``. When the function returns,
    the driver is focused on the newly opened tab; closing that tab and
    switching back is left to the caller.

    Args:
        source: Object exposing ``get('href')`` (e.g. a BeautifulSoup ``<a>`` tag).
        tab_index: Index the new tab is expected to occupy in
            ``driver.window_handles`` (1 for the first extra tab, 2 for the
            next one, ...). Used as a lower bound on the number of open tabs
            and as a fallback when no new handle can be identified.

    Returns:
        BeautifulSoup: The page source of the new tab parsed with ``html.parser``.

    Raises:
        ValueError: If ``source`` has no (or an empty) ``href``.
        selenium.common.exceptions.TimeoutException: If the tab does not open,
            or the ``fonts-loaded`` element does not appear, within 10 seconds.
    """
    link = source.get('href')
    if not link:
        # Without this guard the literal text "None" would be opened as a URL.
        raise ValueError("source has no 'href' attribute to open in a new tab")

    # Remember which tabs already exist so the new one can be identified by
    # handle rather than by position (handle order is not guaranteed, and a
    # leftover tab would otherwise be mistaken for the new one).
    handles_before = set(driver.window_handles)

    # Pass the link as a script argument instead of formatting it into the
    # JavaScript source, so quotes or backslashes in the URL cannot break it.
    driver.execute_script('window.open(arguments[0], "_blank");', link)

    # Wait for the new window to open and then switch to it
    WebDriverWait(driver, 10).until(
        lambda d: len(d.window_handles) > tab_index
        and any(handle not in handles_before for handle in d.window_handles)
    )
    new_handles = [handle for handle in driver.window_handles if handle not in handles_before]
    target_handle = new_handles[-1] if new_handles else driver.window_handles[tab_index]
    driver.switch_to.window(target_handle)

    # Wait until an element with the class name 'fonts-loaded' is present,
    # which this notebook uses as the signal that the tab has loaded.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'fonts-loaded'))
    )

    # Parse the HTML of the tab that now has focus
    return BeautifulSoup(driver.page_source, 'html.parser')

In [17]:
# Scrape up to 20 product links from the search results page and, for products
# that report reviews, open the product page (and, when available, the
# dedicated reviews page) in extra tabs to collect review summaries and details.

# Collected product records. Created before the browser starts so that whatever
# was scraped so far is still available if the run is interrupted.
data = []

# Set up the Selenium WebDriver
driver = webdriver.Chrome()
try:
    driver.maximize_window()

    # Load the page
    driver.get(url)

    # Fixed pause to give the page time to finish loading
    time.sleep(5)

    # Handle of the search-results tab; every other tab is temporary.
    search_handle = driver.current_window_handle

    # Get the HTML content of the page and parse it with BeautifulSoup
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find elements containing product information
    products = soup.find_all('a', rel='noopener noreferrer', limit=20)

    for product in products:
        product_details = {}  # details collected for this product

        # Extract the name of the product
        name = product.find('div', class_='_4rR01T')
        name = name.text if name else "No name available"

        # Extract the price of the product
        price = product.find('div', class_='_30jeq3 _1_WHN1')
        price = price.text if price else "No price available"

        # Extract the rating of the product
        rating = product.find('div', class_='_3LWZlK')
        rating = rating.text if rating else "No rating available"

        # Extract number of ratings and reviews
        num_ratings = "No ratings"
        num_reviews = "No reviews"
        num_ratings_reviews_container = product.find('span', class_='_2_R_DZ')
        if num_ratings_reviews_container:
            ratings_reviews_text = num_ratings_reviews_container.get_text().strip()

            # partition() never raises, unlike unpacking split('&'), which
            # fails when the text does not contain exactly one '&'.
            ratings_text, _sep, reviews_text = ratings_reviews_text.partition('&')

            # Keep just the digits; fall back to the placeholders when a side
            # has none, so an empty string is not mistaken for a review count.
            num_ratings = ''.join(filter(str.isdigit, ratings_text)) or "No ratings"
            num_reviews = ''.join(filter(str.isdigit, reviews_text)) or "No reviews"

        product_details['Product Name'] = name
        product_details['Price'] = price
        product_details['Rating'] = rating
        product_details['Number of Ratings'] = num_ratings
        product_details['Number of Reviews'] = num_reviews

        if num_reviews != "No reviews" and num_reviews != "0":
            try:
                # Open the product page in a second tab
                product_soup = switch_tab(product, 1)

                # Find the element with the reviews or link to reviews
                div_element = product_soup.find('div', class_='col JOpGWq')

                if div_element:
                    all_a_tags = div_element.find_all('a')
                    review_summary_list = []
                    review_details_list = []

                    if len(all_a_tags) > 10:
                        # NOTE(review): more than 10 links is treated as the sign
                        # that a separate reviews page exists and that the last
                        # link points to it; confirm against the page layout.
                        reviews_soup = switch_tab(all_a_tags[-1], 2)

                        for review in reviews_soup.find_all('div', class_='col _2wzgFH K0kLPL'):
                            summary_tag = review.find('p', class_='_2-N8zT')
                            details_tag = review.find('div', class_='t-ZTKy')
                            review_summary_list.append(summary_tag.text if summary_tag else "No summary")
                            review_details_list.append(details_tag.text if details_tag else "No details")
                    else:
                        # Reviews are read straight from the product page.
                        # Summaries and details are collected independently,
                        # so the two lists may differ in length.
                        review_summary_list = [
                            tag.text for tag in product_soup.find_all('p', class_='_2-N8zT')
                        ]
                        review_details_list = [
                            tag.text for tag in product_soup.find_all('div', class_='t-ZTKy')
                        ]

                    product_details['Review Summary'] = review_summary_list
                    product_details['Review Details'] = review_details_list
            finally:
                # Always close every temporary tab, including when no reviews
                # block was found or an error occurred, so the next product
                # starts with only the search-results tab open.
                for handle in driver.window_handles:
                    if handle != search_handle:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(search_handle)

        data.append(product_details)  # add this product's record to the results
finally:
    # Close the browser even if scraping failed part-way
    driver.quit()

In [18]:
# Display the scraped records (a list with one dictionary per product)
data

[{'Product Name': 'Nokia C32 (Charcoal, 128 GB)',
  'Price': '₹7,999',
  'Rating': '4',
  'Number of Ratings': '106',
  'Number of Reviews': '15',
  'Review Summary': ['Best in the market!',
   'Good quality product',
   'Worth every penny',
   'Absolute rubbish!',
   'Brilliant',
   'Not recommended at all',
   'Does the job',
   'Nice',
   'Terrific purchase',
   'Worthless'],
  'Review Details': ['Super 👍 mobile haiREAD MORE',
   'I  LIKE IT.READ MORE',
   'Excellent mobileREAD MORE',
   'Phone keeps hanging every now and then and also the battery gets discharged very quickly..very worse and disappointed purchase...READ MORE',
   'Nice 👍READ MORE',
   'Waste productREAD MORE',
   'Average phone,  very slowREAD MORE',
   'Poor performance. Hangs oftenREAD MORE',
   'Nice productREAD MORE',
   'Battery charge down quicklyREAD MORE']},
 {'Product Name': 'Nokia C32 (Beach Pink, 128 GB)',
  'Price': '₹8,740',
  'Rating': '4',
  'Number of Ratings': '106',
  'Number of Reviews': '15',
  '

In [20]:
# Build a DataFrame from the list of product dictionaries and display it
df = pd.DataFrame(data)
df

Unnamed: 0,Product Name,Price,Rating,Number of Ratings,Number of Reviews,Review Summary,Review Details
0,"Nokia C32 (Charcoal, 128 GB)","₹7,999",4.0,106,15,"[Best in the market!, Good quality product, Wo...","[Super 👍 mobile haiREAD MORE, I LIKE IT.READ ..."
1,"Nokia C32 (Beach Pink, 128 GB)","₹8,740",4.0,106,15,"[Best in the market!, Good quality product, Wo...","[Super 👍 mobile haiREAD MORE, I LIKE IT.READ ..."
2,"Nokia C32 (Breezy Mint, 128 GB)","₹7,999",4.0,106,15,"[Best in the market!, Good quality product, Wo...","[Super 👍 mobile haiREAD MORE, I LIKE IT.READ ..."
3,"Nokia G42 5G (So Grey, 256 GB)","₹16,890",4.1,228,38,"[Nice product, Value-for-money, Moderate, Terr...","[GoodREAD MORE, quality phoneREAD MORE, China ..."
4,"Nokia G42 5G (So Pink, 256 GB)","₹17,499",4.1,228,38,"[Nice product, Value-for-money, Moderate, Terr...","[GoodREAD MORE, quality phoneREAD MORE, China ..."
5,Nokia C22 Dual Sim with Jelly Case | 6.51 inch...,"₹8,099",3.4,84,9,"[Did not meet expectations, Highly recommended...",[The phone i gifted to my uncle in his birthda...
6,Nokia C22 Dual Sim with Jelly Case | 6.51 inch...,"₹7,749",3.4,84,9,"[Did not meet expectations, Highly recommended...",[The phone i gifted to my uncle in his birthda...
7,"Nokia G11 Plus TA-1438 DS (Grey, 64 GB)","₹7,555",3.7,115,9,"[Classy product, Must buy!, Absolute rubbish!,...",[Really good phone.. I wonder y some people ru...
8,Nokia C22 Dual Sim with Jelly Case | 6.51 inch...,"₹7,749",3.4,84,9,"[Did not meet expectations, Highly recommended...",[The phone i gifted to my uncle in his birthda...
9,Nokia C12 Pro Dual Sim | 6.3 inch Display | 40...,"₹7,499",4.0,30,1,[Terrific purchase],[Absolutely fine product 🔥READ MORE]


In [25]:
# Save the scraped table as CSV without the index column.
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file.
# NOTE(review): the destination is an absolute path on one specific machine;
# it has to be adjusted before running the notebook elsewhere.
df.to_csv(
    path_or_buf=r'C:\Users\acer\Desktop\ds class\nlp\sentiment analysis project amazon\scraped_data1.csv',
    index=False,
    encoding='utf-8-sig',
)