# In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import logging
from requests.exceptions import RequestException

# Configure logging: records at INFO level and above go to scraper.log,
# each line carrying a timestamp, the level name and the message.
logging.basicConfig(
    level=logging.INFO,
    filename='scraper.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Function to extract Product Title
def get_title(soup):
    """Return the product title text found in *soup*.

    Looks up the first ``<a>`` element with class ``wjcEIp`` and returns its
    text with surrounding whitespace removed. Returns ``"Title Not Found"``
    when no such element is found, or when an ``AttributeError`` is raised
    during the lookup (for example if *soup* is ``None``).
    """
    fallback = "Title Not Found"
    try:
        anchor = soup.find("a", class_='wjcEIp')
        if not anchor:
            return fallback
        return anchor.text.strip()
    except AttributeError:
        return fallback

# Function to extract Product Price
def get_price(soup):
    """Return the product price text found in *soup*.

    Searches for the first ``<div>`` with class ``Nx9bqj`` and returns its
    stripped text. Falls back to ``"Price Not Found"`` if the element is
    absent or an ``AttributeError`` occurs while reading it.
    """
    try:
        node = soup.find("div", class_='Nx9bqj')
        if node:
            result = node.text.strip()
        else:
            result = "Price Not Found"
    except AttributeError:
        result = "Price Not Found"
    return result

# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text found in *soup*.

    Uses the first ``<div>`` with class ``XQDdHH``; its text is returned with
    leading and trailing whitespace removed. ``"Rating Not Found"`` is
    returned when that element is missing or an ``AttributeError`` is raised.
    """
    not_found = "Rating Not Found"
    try:
        badge = soup.find("div", class_='XQDdHH')
        return badge.text.strip() if badge else not_found
    except AttributeError:
        return not_found

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text found in *soup*.

    Reads the first ``<span>`` with class ``Wphh3N`` and returns its stripped
    text. Any ``AttributeError`` raised on the way (including the one caused
    by the element not being found) yields ``"Review Count Not Found"``.
    """
    try:
        counter = soup.find("span", class_='Wphh3N')
        # A missing element comes back as None; the attribute access below
        # then raises AttributeError and is handled by the except clause.
        return counter.text.strip()
    except AttributeError:
        return "Review Count Not Found"

# Function to handle HTTP requests with retries
def fetch_url(url, headers, retries=3, timeout=10, backoff=2):
    """Fetch *url* with ``requests.get``, retrying when a request fails.

    Args:
        url: Address to request.
        headers: HTTP headers passed through to ``requests.get``.
        retries: Maximum number of attempts. With a value below 1 no request
            is made and ``None`` is returned.
        timeout: Seconds passed as ``timeout`` to ``requests.get`` for each
            attempt, so a stalled server cannot block the scraper forever.
        backoff: Seconds to sleep between two consecutive attempts.

    Returns:
        The response of the first attempt that completes without
        ``raise_for_status()`` raising, or ``None`` if every attempt fails.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise HTTPError for bad responses
            return response
        except RequestException as e:
            logging.error(
                "Request failed for URL %s on attempt %d: %s", url, attempt, e
            )
            # Only wait when another attempt will follow; sleeping after the
            # final failure would just delay the caller.
            if attempt < retries:
                time.sleep(backoff)
    logging.error("All retries failed for URL: %s", url)
    return None

# Function to extract data from a single page
def extract_data_from_page(url, headers):
    """Scrape the products linked from one listing page.

    The page at *url* is fetched with ``fetch_url``; every ``<a>`` element
    with class ``VJA3rP`` that carries an ``href`` is treated as a product
    link. Each distinct link is fetched in turn and its title, price, rating
    and review count are extracted with the ``get_*`` helpers.

    Args:
        url: Address of the listing page.
        headers: HTTP headers used for every request.

    Returns:
        A dict with the keys ``"title"``, ``"price"``, ``"rating"`` and
        ``"reviews"``, each mapping to a list with one entry per product page
        that was fetched successfully. All lists are empty when the listing
        page itself cannot be fetched.
    """
    # Imported locally so this function does not rely on a change to the
    # file-level import block.
    from urllib.parse import urljoin

    base_url = "https://www.flipkart.com"
    page_data = {"title": [], "price": [], "rating": [], "reviews": []}

    webpage = fetch_url(url, headers)
    if webpage is None:
        logging.error(f"Failed to fetch the page URL: {url}")
        return page_data

    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch product links. dict.fromkeys drops repeated hrefs while keeping
    # first-seen order, so the same product page is not requested and
    # recorded more than once.
    anchors = soup.find_all("a", class_='VJA3rP')
    hrefs = (anchor.get('href') for anchor in anchors)
    links_list = list(dict.fromkeys(href for href in hrefs if href))

    for link in links_list:
        # urljoin copes with relative hrefs (with or without a leading slash)
        # as well as hrefs that are already absolute URLs.
        new_webpage = fetch_url(urljoin(base_url, link), headers)
        if new_webpage is None:
            logging.error(f"Failed to fetch product page for link: {link}")
            continue

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        page_data['title'].append(get_title(new_soup))
        page_data['price'].append(get_price(new_soup))
        page_data['rating'].append(get_rating(new_soup))
        page_data['reviews'].append(get_review_count(new_soup))

    return page_data

def main():
    """Scrape the first four Samsung search-result pages and write a CSV.

    Each page is passed to ``extract_data_from_page``; the results are merged
    into one table, rows with an empty title are dropped, and the table is
    written to ``flipkart_data.csv``.
    """
    headers = {'User-Agent': 'Your_User_Agent_Here', 'Accept-Language': 'en-US, en;q=0.5'}

    # Build the pagination links from one template so that every page uses
    # the same, well-formed query string and only the page number varies.
    search_url = (
        "https://www.flipkart.com/search?q=Samsung+"
        "&otracker=search&otracker1=search&marketplace=FLIPKART"
        "&as-show=on&as=off&page={page}"
    )
    page_links = [search_url.format(page=page) for page in range(1, 5)]

    all_data = {"title": [], "price": [], "rating": [], "reviews": []}

    for index, page_url in enumerate(page_links, start=1):
        print(f"Scraping URL: {page_url}...")
        page_data = extract_data_from_page(page_url, headers)

        for key in all_data:
            all_data[key].extend(page_data[key])

        # Delay to prevent rate limiting; not needed after the last page.
        if index < len(page_links):
            time.sleep(5)  # Adjust this delay as needed

    # Create DataFrame and save to CSV
    flipkart_df = pd.DataFrame.from_dict(all_data)

    # Debug: Print number of rows in DataFrame
    print("Number of rows in DataFrame:", len(flipkart_df))

    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: that chained in-place call may act on a temporary
    # object and leave the DataFrame unchanged.
    flipkart_df['title'] = flipkart_df['title'].replace('', np.nan)
    # NOTE(review): get_title() reports a missing title as the placeholder
    # "Title Not Found", not as '', so such rows are kept by this filter.
    # Confirm whether they should be dropped as well.
    flipkart_df = flipkart_df.dropna(subset=['title'])
    flipkart_df.to_csv("flipkart_data.csv", header=True, index=False)

    print('Data has been successfully written to flipkart_data.csv')


if __name__ == '__main__':
    main()
