In [30]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Load the CSV file containing product links
file_path = r"D:\Project\Flipkart SA\flipkart_cleaned_google.csv"
df = pd.read_csv(file_path)

# Initialize the Chrome driver
driver = webdriver.Chrome()

# Function to extract reviews and ratings from a product page
def extract_reviews_and_ratings(driver):
    """Collect review texts and star ratings from the page loaded in ``driver``.

    Waits up to 10 seconds for an element with class ``ZmyHeo`` (the review
    text). If none appears, the page is treated as having no reviews and two
    empty lists are returned, so the caller can stop paginating instead of
    crashing on a ``TimeoutException``.

    Args:
        driver: Selenium WebDriver positioned on a reviews page.

    Returns:
        tuple[list[str], list[str | None]]: ``(reviews, ratings)`` of equal
        length. ``ratings[i]`` is the text of the rating element found at the
        same page position as ``reviews[i]``, or ``None`` when no rating
        element exists at that position or it could not be read.

    Note:
        Pairing is positional: it assumes rating elements appear on the page in
        the same order as review elements -- TODO confirm against the live DOM.
    """
    reviews = []
    # Page position of every review that was actually read, so that a skipped
    # (stale) review does not shift the remaining reviews onto wrong ratings.
    kept_positions = []

    # Wait for the reviews section to load
    wait = WebDriverWait(driver, 10)
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ZmyHeo')))
    except TimeoutException:
        return [], []

    # Extract reviews
    review_elements = driver.find_elements(By.CLASS_NAME, "ZmyHeo")
    for position, element in enumerate(review_elements):
        try:
            # Click "Read More" if available
            read_more = element.find_elements(By.CLASS_NAME, "b4x-fr")
            if read_more:
                # JS click avoids "element not interactable" on overlapped links
                driver.execute_script("arguments[0].click();", read_more[0])
                time.sleep(1)  # Wait for the full review to load

            review_text = element.text
        except StaleElementReferenceException:
            # The node was replaced while we were reading it; drop this review
            continue

        reviews.append(review_text)
        kept_positions.append(position)

    # Extract star ratings, keeping them aligned with the reviews we kept
    rating_elements = driver.find_elements(By.CLASS_NAME, "XQDdHH.Ga3i8K")
    ratings = []
    for position in kept_positions:
        rating = None  # Placeholder for missing ratings
        if position < len(rating_elements):
            try:
                rating = rating_elements[position].text
            except StaleElementReferenceException:
                rating = None
        ratings.append(rating)

    return reviews, ratings

# Function to load the page with the correct page number in the URL
def load_page(driver, url):
    """Open ``url`` and block until a review element is present.

    Args:
        driver: Selenium WebDriver used for navigation.
        url: Address of the reviews page to load.

    Raises:
        TimeoutException: propagated from ``WebDriverWait.until`` when no
            element with class ``ZmyHeo`` shows up within 10 seconds.
    """
    driver.get(url)
    review_locator = (By.CLASS_NAME, 'ZmyHeo')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(review_locator))

# Scrape reviews and ratings for all product links
all_data = []

num_pages_reviews = 20  # Number of review pages to scrape per product

try:
    # Loop through each product link in the DataFrame
    for index, product_link in df['Product_Link'].items():
        print(f"Scraping product {index + 1}/{len(df)}: {product_link}")

        # Scrape reviews for the current product
        driver.get(product_link)
        time.sleep(2)  # Wait for the product page to load

        # Click on the 'All Reviews' button if it exists
        try:
            wait = WebDriverWait(driver, 10)
            all_reviews_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, '_23J90q.RcXBOT')))
            all_reviews_button.click()
            time.sleep(2)  # Wait for the reviews page to load
        except Exception as e:
            print(f"Could not find 'All Reviews' button for {product_link}: {e}")
            continue  # Skip to the next product if no reviews found

        # Remember the first reviews-page URL once. Building each page URL from
        # driver.current_url would stack parameters ("&page=2&page=3...")
        # because current_url already contains the previous page number.
        reviews_base_url = driver.current_url
        page_separator = '&' if '?' in reviews_base_url else '?'

        # Scrape reviews and ratings across multiple review pages
        for page in range(1, num_pages_reviews + 1):
            try:
                if page > 1:
                    load_page(driver, f"{reviews_base_url}{page_separator}page={page}")
                reviews, ratings = extract_reviews_and_ratings(driver)
            except TimeoutException:
                # No review element appeared in time: treat as "no more pages"
                reviews, ratings = [], []

            # Check if reviews are empty, and if so, stop scraping further pages
            if not reviews:
                print(f"No more reviews found on page {page}.")
                break

            # Append reviews and ratings to the all_data list along with the product link
            for review, rating in zip(reviews, ratings):
                all_data.append({
                    'Product_Link': product_link,
                    'Review': review,
                    'Rating': rating
                })

            time.sleep(5)  # Wait before loading the next reviews page
finally:
    # Always release the browser, even if scraping fails part-way through
    driver.quit()

# Convert the collected data into a DataFrame and save it as a CSV file.
# Explicit columns keep the CSV header intact even when nothing was scraped.
result_df = pd.DataFrame(all_data, columns=['Product_Link', 'Review', 'Rating'])

# Save to CSV
output_file = r"D:\Project\Flipkart SA\google_reviews_ratings_all_products.csv"
result_df.to_csv(output_file, index=False)

print(f"Scraping complete. Data saved to {output_file}")


Scraping product 1/10: https://www.flipkart.com/google-pixel-7a-charcoal-128-gb/p/itmb4d7b100b1a4d?pid=MOBGZCQMHGWDYZQ7&lid=LSTMOBGZCQMHGWDYZQ7XLJASQ&marketplace=FLIPKART&q=google+mobiles&store=tyy%2F4io&srno=s_1_1&otracker=AS_Query_HistoryAutoSuggest_1_14_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_14_na_na_na&iid=09f11e69-32b0-4b6a-923d-af81bc54bd88.MOBGZCQMHGWDYZQ7.SEARCH&ssid=6woo4r91pc0000001727628620394&qH=7961bfd86f1fa98c
Scraping product 2/10: https://www.flipkart.com/google-pixel-7a-snow-128-gb/p/itmb4d7b100b1a4d?pid=MOBGZCQMZXYXCZCH&lid=LSTMOBGZCQMZXYXCZCHTTFGG3&marketplace=FLIPKART&q=google+mobiles&store=tyy%2F4io&srno=s_1_2&otracker=AS_Query_HistoryAutoSuggest_1_14_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_14_na_na_na&iid=09f11e69-32b0-4b6a-923d-af81bc54bd88.MOBGZCQMZXYXCZCH.SEARCH&ssid=6woo4r91pc0000001727628620394&qH=7961bfd86f1fa98c
Scraping product 3/10: https://www.flipkart.com/google-pixel-7a-coral-128-gb/p/itmb4d7b100b1a4d?pid=MOBGT5F26QJYZUZS&lid=LSTMOBG

In [36]:
result_df

Unnamed: 0,Product_Link,Review,Rating
0,https://www.flipkart.com/google-pixel-7a-charc...,"Its extraordinary, something new, everything t...",4
1,https://www.flipkart.com/google-pixel-7a-charc...,Google need to fix the battery issue and heati...,5
2,https://www.flipkart.com/google-pixel-7a-charc...,Nice design and colour camer is at its peak as...,5
3,https://www.flipkart.com/google-pixel-7a-charc...,Nice Phone,4
4,https://www.flipkart.com/google-pixel-7a-charc...,"Very good phone photography, average mobile pe...",4
5,https://www.flipkart.com/google-pixel-7a-charc...,Camera Is Excellent.\nProcessor is good for li...,4
6,https://www.flipkart.com/google-pixel-7a-charc...,Migrated from iPhone 14 to Pixel7a. Amazing Pi...,5
7,https://www.flipkart.com/google-pixel-7a-charc...,"Reviewing after 6 days, purchased 2 Pixel 7a (...",5
8,https://www.flipkart.com/google-pixel-7a-charc...,A great camera phone. The colour science is pr...,5
9,https://www.flipkart.com/google-pixel-7a-charc...,"I'm using pixel since 4months, camera is aweso...",3


In [34]:
pd.set_option("display.max_rows", None)

In [1]:
import pandas as pd
df = pd.read_csv("D:\\Project\\Flipkart SA\\google_reviews_ratings_all_products.csv")

In [3]:
df1 = pd.read_csv("D:\\Project\\Flipkart SA\\flipkart_cleaned_google.csv")

In [5]:
df = pd.merge(df, df1, on = 'Product_Link', how='inner')

In [7]:
df.columns

Index(['Product_Link', 'Review', 'Rating', 'Product_Name', 'Product_Price'], dtype='object')

In [7]:
output_file = r"D:\Project\Flipkart SA\google_merged.csv"
df.to_csv(output_file, index=False)

In [13]:
import pandas as pd
# Raw string: "\P", "\F" and "\g" are invalid escape sequences in a normal
# string literal (warned about by Python, slated to become errors).
# The resulting path is unchanged.
df = pd.read_csv(r"D:\Project\Flipkart SA\google_merged.csv")

In [15]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Cleaning function
def clean_text(text):
    """Normalise one review: letters only, lowercase, no stop words, lemmatized.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` defined
    just above. A later cell defines another ``clean_text`` with a different
    signature, which replaces this one once that cell has run.

    Args:
        text: Raw review text. Non-string values (for example the NaN pandas
            produces for an empty CSV field) are returned unchanged, matching
            how ``correct_spelling`` treats them.

    Returns:
        The cleaned review as space-separated tokens, or ``text`` itself when
        it is not a string.
    """
    if not isinstance(text, str):
        return text

    # Replace curly apostrophe ’ with straight apostrophe '
    text = text.replace("’", "'")

    # Keep only ASCII letters, whitespace and apostrophes. This also removes
    # digits, bullets and emojis, so no separate digit-stripping pass is needed.
    text = re.sub(r"[^a-zA-Z\s']", '', text)

    # Drop the apostrophes themselves (inside, before or after a word),
    # e.g. "don't" -> "dont", "best'" -> "best"
    text = re.sub(r"\b'\b|'\B|\B'", '', text)

    # Lowercase the text
    text = text.lower()

    # Tokenize on whitespace
    tokens = text.split()

    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)

# Apply the cleaning function to each review
df['Review'] = df['Review'].apply(clean_text)


In [17]:
import re

# Emoji / pictograph code points, compiled once instead of on every call.
# The former catch-all range U+24C2..U+1F251 also matched CJK, Hangul and many
# other scripts; it is replaced by the symbol blocks inside that span.
_EMOJI_PATTERN = re.compile(
    "["
    "\u24C2"                 # circled M
    "\u25A0-\u27BF"          # geometric shapes, miscellaneous symbols, dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows (e.g. star)
    "\uFE00-\uFE0F"          # variation selectors (emoji presentation)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "]+",
    flags=re.UNICODE,
)


# Function to remove emojis
def remove_emojis(text):
    """Strip emoji and pictographic symbols from ``text``.

    Args:
        text: Review text. Non-string values (e.g. NaN/None for a missing
            review) are returned unchanged, as ``correct_spelling`` does.

    Returns:
        ``text`` with every run of matched emoji characters removed; ordinary
        letters of any script, digits, punctuation and whitespace are kept.
    """
    if not isinstance(text, str):
        return text

    return _EMOJI_PATTERN.sub('', text)  # Remove emojis

# Apply the function to the 'Review' column
df['Review'] = df['Review'].apply(remove_emojis)

In [19]:
from autocorrect import Speller

# Initialize the spell checker for English
spell = Speller(lang='en')  # Specify English language

# Function to correct spelling in a review
def correct_spelling(text):
    """Run the module-level ``spell`` checker over every word of a review.

    Args:
        text: Review text; anything that is not a ``str`` is handed back as is.

    Returns:
        The review with each whitespace-separated word passed through
        ``spell`` and re-joined with single spaces, or the original value for
        non-string inputs.
    """
    # Non-string inputs are passed through untouched
    if not isinstance(text, str):
        return text

    return ' '.join(spell(token) for token in text.split())

# Sample DataFrame creation (replace this with your actual DataFrame)
# df = pd.DataFrame({'Review': ['Ths is a smaple revie', 'Another revw', None, 'Amazing featur']})

# Apply the spelling correction function to the 'Review' column
df['Review'] = df['Review'].apply(correct_spelling)


In [21]:
import pandas as pd
import re

# Custom dictionary of words to remove or replace
custom_dict = {
    'costa': 'costar', 'beast': 'best', 'hr': 'hour', 'astutu': 'antutu', 'sha': 'should', 'realme': 'realme', 'wk': 'week', 
    "hadnt": "had not", 'oneplus': 'one plus', "im": "i am", 'w': 'watt',  "havent": "have not", 
    "hasnt": "has not", 'u': "you", 'r': "are", "mp": 'million pixels', "ai": "artificial intelligence", 
    "ui": "user interface", "doesnt": "does not", 'rambling': "rumbling", "o": "operating system", "io": "iphone operating system", 
    "sot": "special operation team", "le": "less", "fhd": "full high definition", "dont": "do not", "mb": "megabyte",
    "cam": "camera", "came": "camera", "avg": "average", "min": 'minutes',  "yea": "yeah", "lil": "little", 
    "costlier": "more expensive", "it's": "it is", "very": "very", "n": "and", "cant": "cannot", "dis": "this", "v": "we", 
    "hdr": 'high dynamic range', "didnt": "did not", "ive": "i have", "bezel": "bezel", "ur": "your", "wont": 'will not', 
    "hd": 'high definition', 'cleanui': "clean user interface", "tatics": "haptics", "sd": "secure digital", "gen": "generation", 
    "usp": 'unique selling proposition', 'degc': "degree celsius", "tatic": "haptic", "unbuilt": "inbuilt", 'xiomi': "xiaomi",
    'regreating': "regretting", 'fyi': "for your information", 'issuehrs': "issue hours", 'doomed': "zoomed", 'fps': "frames per second", 
    'ois': "optical image stabilization", 'theyll': "they will", 'ig': "instagram", 'bbd': "bigger better deal", 'cemra': "camera", 
    'fastly': "fast", "optimise": 'optimize', 'osum': 'awesome', 'vi': "vodafone", 'upi': "unified payments interface", 'dxomark': 'dxomark', 
    'eyeturner': "eye turner", 'banger': "banger", 're': "resolution", 'goddamn': "goddamn", 'aint': "am not", 'plesently': "pleasantly", 
    'thik': "think", "tooo": "too", "uisvery": 'user interface very', 'gif': "graphics interchange format", 'siz': "six", 
    'iphones': "iphone", 'youre': 'you are', 'doubtbut': "doubt but", 'phome': "phone", 'red': "redmi", 
    'okif': "okay if", 'pic': "picture", 'smatter': 'smarter', 'membrane': "ambrane", "holdnew": 'hold new', 'swine': "swipe", 
    'inshot': "inshot", 'pixelated': "pixel related", 'cameraai': "camera artificial intelligence", 'oppos': 'oppo', 'amaze': "amazing", 
    'daytoday': "day to day", 'offmy': "off my", 'laggy': "lag", 'victus': "victus", 'slowlike': "slow like", 'opp': "oppo", 
    'wholeday': "full day", 'hdfc': "hdfc", 'wil': "will", 'commendable': "recommendable", "kinda": 'kind of', 'baku': "vaku", 
    'beat': "best", 'surfed': "suffered", 'bgmi': "battle ground mobile india", 'pubg': "pubg", 'isnt': "is not", 
    'flickering': "flickering", 'least': "at least", 'doun': "down", 'thats': "that is", 'ill': "i will", 'bout': "about", 
    'overrated': "overrated", 'butter': "better", 'emi': "equated monthly installment", 'prefect': "perfect", 'ketone': "keyone", 
    'ie': "in other words", 'cuz': "because", 'mah': "milliampere hour", 'asus': "asus", "youve": "you have", 'ott': "over the top", 
    'oct': "october", 'ip': "iphone", 'nowhope': "now hope", 'eraserunblur': "eraser focus", "wifi": "wireless fidelity", 
    'ok': "okay", 'hiccup': "hiccup", 'slowness': "slowness", 'janso': "january so", 'mahmaybe': "milliampere hour may be", 
    'lovable': "lovable", 'etc': "et cetera", "oplus": "oneplus", "karma": "varma"
}

# List of words to remove
remove_list = ['k', 'science', 'x', 'nd', 'rd', 'le', 'st', 'p', 'pm', 'f', 'tho', 'h', 'th', 'gn', 'xr', 'xl', 'am', 'tg', 'p', 'z', 'cc', 'g', 'hz']

# Function to clean the text
def clean_text(text, custom_dict, remove_list):
    """Drop unwanted words and expand abbreviations in one review.

    This definition replaces the earlier one-argument ``clean_text`` from the
    text-normalisation cell once this cell has run.

    Args:
        text: Review text. Non-string values (e.g. NaN/None for a missing
            review) are returned unchanged, as ``correct_spelling`` does.
        custom_dict: Mapping of whole word -> replacement text. Entries are
            applied one after another in the mapping's iteration order,
            case-insensitively.
        remove_list: Whole words to delete, matched case-insensitively. May be
            empty.

    Returns:
        The cleaned text with runs of whitespace collapsed to single spaces and
        no leading/trailing whitespace, or ``text`` itself when it is not a
        string.
    """
    if not isinstance(text, str):
        return text

    cleaned_text = text

    # Remove specified words from remove_list (whole-word matches only)
    if remove_list:
        pattern_remove = r'\b(' + '|'.join(re.escape(word) for word in remove_list) + r')\b'
        cleaned_text = re.sub(pattern_remove, '', cleaned_text, flags=re.IGNORECASE)

    # Replace words according to custom_dict
    for word, replacement in custom_dict.items():
        cleaned_text = re.sub(r'\b' + re.escape(word) + r'\b', replacement, cleaned_text, flags=re.IGNORECASE)

    # Deleting a word leaves its surrounding spaces behind ("a  b"); collapse them
    return re.sub(r'\s+', ' ', cleaned_text).strip()

# Assuming 'df' is your DataFrame with a 'Review' column
df['Review'] = df['Review'].apply(lambda x: clean_text(x, custom_dict, remove_list))


In [23]:
output_file = r"D:\Project\Flipkart SA\google_text_cleaned.csv"
df.to_csv(output_file, index=False)