In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd

# URL of the website
url = "https://www.flipkart.com/search?q=oneplus+mobile&as=on&as-show=on&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=oneplus+mobile&requestId=c55baef2-eac6-433d-b53d-fe89a5a3ec4d&as-searchtext=one&sort=recency_desc&p%5B%5D=facets.price_range.from%3D20000&p%5B%5D=facets.price_range.to%3DMax"
def initialize_driver():
    driver = webdriver.Chrome()  # Ensure ChromeDriver is in your PATH
    driver.maximize_window()
    return driver

def load_page(driver, url):
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

# Function to scrape product names, links, and prices
def scrape_product_data(driver):
    product_names = [name.text for name in driver.find_elements(By.CLASS_NAME, 'KzDlHZ')]
    product_links = [link.get_attribute('href') for link in driver.find_elements(By.XPATH, '//a[@class="CGtC98"]')]
    product_prices = [price.text for price in driver.find_elements(By.CLASS_NAME, 'hl05eU')]  # Adjusted class name for prices
    
    # Return product data
    return product_names, product_links, product_prices

# Function to scrape multiple pages
def scrape_multiple_pages(driver, base_url, num_pages):
    all_product_names = []
    all_product_links = []
    all_product_prices = []
    
    for page in range(1, num_pages + 1):
        load_page(driver, f"{base_url}&page={page}")  # Update the URL to include the page number
        product_names, product_links, product_prices = scrape_product_data(driver)
        
        # Ensure the lists are of the same length before extending
        if len(product_names) == len(product_links) == len(product_prices):
            all_product_names.extend(product_names)
            all_product_links.extend(product_links)
            all_product_prices.extend(product_prices)
        else:
            print(f"Warning: Mismatched data on page {page}. Names: {len(product_names)}, Links: {len(product_links)}, Prices: {len(product_prices)}")

        time.sleep(5)  # Wait before loading the next page
    
    return all_product_names, all_product_links, all_product_prices

# Initialize WebDriver and scrape multiple pages
driver = initialize_driver()
all_product_names, all_product_links, all_product_prices = scrape_multiple_pages(driver, url, 2)  # Adjust number of pages as needed

# Close the driver
driver.quit()

# Create a DataFrame to store the results
df = pd.DataFrame({
    'Product_Name': all_product_names,
    'Product_Link': all_product_links,
    'Product_Price': all_product_prices  # Updated to include product prices
})

# Display or save the scraped data
df.head()  # Display the DataFrame
Output_path = "D:\\Project\\Flipkart SA\\flipkart_scrape_oneplus.csv"
df.to_csv(Output_path, index=False)
#df.to_csv('flipkart_scrape_redmi.csv', index=False)  # Save the scraped data to a CSV file

In [3]:
df = pd.read_csv("D:\\Project\\Flipkart SA\\flipkart_scrape_oneplus.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Product_Name   48 non-null     object
 1   Product_Link   48 non-null     object
 2   Product_Price  48 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


In [7]:
df.head()

Unnamed: 0,Product_Name,Product_Link,Product_Price
0,"OnePlus Nord 4 5G (Mercurial Silver, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-mer...,"₹35,263\n₹35,9992% off"
1,"OnePlus Nord 4 5G (Obsidian Midnight, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-obs...,"₹35,499\n₹35,9991% off"
2,"OnePlus Nord 4 5G (Obsidian Midnight, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-obs...,"₹32,880\n₹32,999"
3,"OnePlus Nord 4 5G (Mercurial Silver, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-mer...,"₹32,880\n₹32,999"
4,"OnePlus Nord 4 5G (Oasis Green, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-oas...,"₹31,990\n₹32,9993% off"


In [9]:
import re

# Function to extract and clean the price (remove rupee symbol and commas)
def extract_clean_price(price_string):
    # Find the rupee symbol followed by the price
    match = re.search(r'₹(\d[\d,]*)', price_string)
    if match:
        # Remove the rupee symbol and commas, and convert to an integer
        return int(match.group(1).replace(',', ''))
    return None

# Apply the function to the 'Price' column
df['Product_Price'] = df['Product_Price'].apply(extract_clean_price)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Product_Name   48 non-null     object 
 1   Product_Link   48 non-null     object 
 2   Product_Price  47 non-null     float64
dtypes: float64(1), object(2)
memory usage: 1.3+ KB


In [13]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [15]:
df

Unnamed: 0,Product_Name,Product_Link,Product_Price
0,"OnePlus Nord 4 5G (Mercurial Silver, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-mercurial-silver-256-gb/p/itmed83e7926e3e5?pid=MOBH3YA8H6CEDSGB&lid=LSTMOBH3YA8H6CEDSGBQCNMY9&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_1&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YA8H6CEDSGB.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,35263.0
1,"OnePlus Nord 4 5G (Obsidian Midnight, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-obsidian-midnight-256-gb/p/itmed83e7926e3e5?pid=MOBH3YA8PDNXWUGA&lid=LSTMOBH3YA8PDNXWUGAMBVMNK&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_2&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YA8PDNXWUGA.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,35499.0
2,"OnePlus Nord 4 5G (Obsidian Midnight, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-obsidian-midnight-256-gb/p/itmed83e7926e3e5?pid=MOBH3YA8RS8FVJZK&lid=LSTMOBH3YA8RS8FVJZKJVIKCR&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_3&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YA8RS8FVJZK.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,32880.0
3,"OnePlus Nord 4 5G (Mercurial Silver, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-mercurial-silver-256-gb/p/itmed83e7926e3e5?pid=MOBH3YA8UXSNRSXW&lid=LSTMOBH3YA8UXSNRSXWSKVQHH&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_4&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YA8UXSNRSXW.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,32880.0
4,"OnePlus Nord 4 5G (Oasis Green, 256 GB)",https://www.flipkart.com/oneplus-nord-4-5g-oasis-green-256-gb/p/itmed83e7926e3e5?pid=MOBH3YA8ZFTVSREP&lid=LSTMOBH3YA8ZFTVSREPRVVVBR&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_5&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YA8ZFTVSREP.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,31990.0
5,"OnePlus 12R (Sunset Dune, 256 GB)",https://www.flipkart.com/oneplus-12r-sunset-dune-256-gb/p/itm347349f7db2f2?pid=MOBH3YC32ETRHX2H&lid=LSTMOBH3YC32ETRHX2HL4N1HL&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_6&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YC32ETRHX2H.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,39836.0
6,"OnePlus Nord CE4 lite 5G (MEGA BLUE, 128 GB)",https://www.flipkart.com/oneplus-nord-ce4-lite-5g-mega-blue-128-gb/p/itm8fd5fdf300955?pid=MOBH25Z9CCTDDN3J&lid=LSTMOBH25Z9CCTDDN3JZJFJER&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_7&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH25Z9CCTDDN3J.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,20990.0
7,"OnePlus Nord CE4 lite 5G (SUPER SILVER, 128 GB)",https://www.flipkart.com/oneplus-nord-ce4-lite-5g-super-silver-128-gb/p/itm8fd5fdf300955?pid=MOBH25ZDPHNF38XJ&lid=LSTMOBH25ZDPHNF38XJHBWOSI&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_8&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH25ZDPHNF38XJ.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,20490.0
8,"OnePlus Nord CE4 lite 5G (ULTRA ORANGE, 128 GB)",https://www.flipkart.com/oneplus-nord-ce4-lite-5g-ultra-orange-128-gb/p/itm8fd5fdf300955?pid=MOBH25ZEZBY3ZGBF&lid=LSTMOBH25ZEZBY3ZGBFLJ8FCI&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_9&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH25ZEZBY3ZGBF.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,20377.0
9,"OnePlus Nord CE4 lite 5G (MEGA BLUE, 256 GB)",https://www.flipkart.com/oneplus-nord-ce4-lite-5g-mega-blue-256-gb/p/itm8fd5fdf300955?pid=MOBH25ZFCUBEFZ4X&lid=LSTMOBH25ZFCUBEFZ4XNFXP8N&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_10&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH25ZFCUBEFZ4X.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384,21717.0


In [17]:
df = df.dropna()

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, 0 to 47
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Product_Name   47 non-null     object
 1   Product_Link   47 non-null     object
 2   Product_Price  47 non-null     int32 
dtypes: int32(1), object(2)
memory usage: 1.3+ KB


In [21]:
df['Product_Price'] = df['Product_Price'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Product_Price'] = df['Product_Price'].astype(int)


In [25]:
df = df[df['Product_Price'] <= 40000]

In [27]:
Output_path = "D:\\Project\\Flipkart SA\\flipkart_cleaned_oneplus.csv"
df.to_csv(Output_path, index=False)

In [31]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# Load the CSV file containing product links
file_path = r"D:\Project\Flipkart SA\flipkart_cleaned_oneplus.csv"
df = pd.read_csv(file_path)

# Initialize the Chrome driver
driver = webdriver.Chrome()

# Function to extract reviews and ratings from a product page
def extract_reviews_and_ratings(driver):
    reviews = []
    ratings = []

    # Wait for the reviews section to load
    wait = WebDriverWait(driver, 10)
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ZmyHeo')))
    except TimeoutException:
        print("Timed out waiting for reviews to load.")
        return reviews, ratings  # Return empty lists if timed out

    # Extract reviews
    review_elements = driver.find_elements(By.CLASS_NAME, "ZmyHeo")
    for element in review_elements:
        try:
            # Click "Read More" if available
            read_more = element.find_elements(By.CLASS_NAME, "b4x-fr")
            if read_more:
                driver.execute_script("arguments[0].click();", read_more[0])
                time.sleep(1)  # Wait for the full review to load
            
            reviews.append(element.text)
        except StaleElementReferenceException:
            continue

    # Extract star ratings
    rating_elements = driver.find_elements(By.CLASS_NAME, "XQDdHH.Ga3i8K")
    for i in range(max(len(reviews), len(rating_elements))):
        # Append ratings or None if not available
        if i < len(rating_elements):
            ratings.append(rating_elements[i].text)
        else:
            ratings.append(None)  # Placeholder for missing ratings

    return reviews, ratings

# Function to load the page with the correct page number in the URL
def load_page(driver, url):
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ZmyHeo')))
    except TimeoutException:
        print("Timed out waiting for reviews page to load.")

# Scrape reviews and ratings for all product links
all_data = []

num_pages_reviews = 20  # Number of review pages to scrape per product

# Loop through each product link in the DataFrame
for index, product_link in df['Product_Link'].items():
    print(f"Scraping product {index + 1}/{len(df)}: {product_link}")
    
    # Scrape reviews for the current product
    driver.get(product_link)
    time.sleep(5)  # Wait for the product page to load

    # Click on the 'All Reviews' button if it exists
    try:
        wait = WebDriverWait(driver, 10)
        all_reviews_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, '_23J90q.RcXBOT')))
        all_reviews_button.click()
        time.sleep(5)  # Wait for the reviews page to load
    except TimeoutException:
        print(f"Warning: Could not find 'All Reviews' button for {product_link}. Continuing to next product.")
        continue  # Skip to the next product if no reviews found

    # Scrape reviews and ratings across multiple review pages
    for page in range(1, num_pages_reviews + 1):
        if page > 1:
            # Update the URL to navigate to the next page of reviews
            page_url = f"{driver.current_url}&page={page}"
            load_page(driver, page_url)
            time.sleep(8)

        reviews, ratings = extract_reviews_and_ratings(driver)
        
        # Check if reviews are empty, and if so, stop scraping further pages
        if not reviews:
            print(f"Warning: No more reviews found on page {page}")
            continue

        # Append reviews and ratings to the all_data list along with the product link
        for review, rating in zip(reviews, ratings):
            all_data.append({
                'Product_Link': product_link,
                'Review': review,
                'Rating': rating
            })

        time.sleep(5)  # Wait before loading the next reviews page

# Close the driver after scraping is complete
driver.quit()

# Convert the collected data into a DataFrame and save it as a CSV file
result_df = pd.DataFrame(all_data)

# Save to CSV
output_file = r"D:\Project\Flipkart SA\oneplus _reviews_ratings_all_products.csv"
result_df.to_csv(output_file, index=False)

print(f"Scraping complete. Data saved to {output_file}")

Scraping product 1/32: https://www.flipkart.com/oneplus-nord-4-5g-mercurial-silver-256-gb/p/itmed83e7926e3e5?pid=MOBH3YA8H6CEDSGB&lid=LSTMOBH3YA8H6CEDSGBQCNMY9&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_1&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YA8H6CEDSGB.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384
Scraping product 2/32: https://www.flipkart.com/oneplus-nord-4-5g-obsidian-midnight-256-gb/p/itmed83e7926e3e5?pid=MOBH3YA8PDNXWUGA&lid=LSTMOBH3YA8PDNXWUGAMBVMNK&marketplace=FLIPKART&q=oneplus+mobile&store=tyy%2F4io&srno=s_1_2&otracker=AS_Query_HistoryAutoSuggest_1_3_na_na_na&otracker1=AS_Query_HistoryAutoSuggest_1_3_na_na_na&iid=ae0583ff-6e51-42e5-a6b9-64a000c40173.MOBH3YA8PDNXWUGA.SEARCH&ssid=5mk0smu76o0000001727772580415&qH=9f1419b282422384
Scraping product 3/32: https://www.flipkart.com/oneplus-nord-4-5g-obsidian-midnight-256-gb/p/itmed83e7926e3

In [37]:
df = pd.read_csv("D:\Project\Flipkart SA\oneplus _reviews_ratings_all_products.csv")

In [39]:
df1 = pd.read_csv("D:\\Project\\Flipkart SA\\flipkart_cleaned_oneplus.csv")

In [41]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [43]:
df = pd.merge(df, df1, on = 'Product_Link', how='inner')

In [45]:
df.columns

Index(['Product_Link', 'Review', 'Rating', 'Product_Name', 'Product_Price'], dtype='object')

In [47]:
output_file = r"D:\Project\Flipkart SA\oneplus_merged.csv"
df.to_csv(output_file, index=False)

In [14]:
import pandas as pd
df = pd.read_csv("D:\Project\Flipkart SA\oneplus_merged.csv")

In [16]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Cleaning function
def clean_text(text):
    
    # Replace curly apostrophe ’ with straight apostrophe '
    text = text.replace("’", "'")
    
    # Remove bullet points and other unnecessary symbols, keeping only regular apostrophes
    text = re.sub(r"[^a-zA-Z\s']", '', text)

    # Remove unnecessary apostrophes: e.g., at the end of words like "best'"
    text = re.sub(r"\b'\b|'\B|\B'", '', text)
    
    # Lowercase the text
    text = text.lower()
    
    # Remove numbers (optional, keep this if you don't want numbers)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize the text
    tokens = text.split()
    
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Apply the cleaning function to each review
df['Review'] = df['Review'].apply(clean_text)


In [18]:
import re

# Function to remove emojis
def remove_emojis(text):
    # Regular expression to match emojis
    
    emoji_pattern = re.compile(
        "[\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002700-\U000027BF"  # Dingbats
        "\U000024C2-\U0001F251" 
        "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)  # Remove emojis

# Apply the function to the 'Review' column
df['Review'] = df['Review'].apply(remove_emojis)

In [20]:
from autocorrect import Speller

# Initialize the spell checker for English
spell = Speller(lang='en')  # Specify English language

# Function to correct spelling in a review
def correct_spelling(text):
    # Check if the input is a string
    if isinstance(text, str):
        return ' '.join([spell(word) for word in text.split()])
    return text  # Return as is for non-string inputs

# Sample DataFrame creation (replace this with your actual DataFrame)
# df = pd.DataFrame({'Review': ['Ths is a smaple revie', 'Another revw', None, 'Amazing featur']})

# Apply the spelling correction function to the 'Review' column
df['Review'] = df['Review'].apply(correct_spelling)

In [22]:
import pandas as pd
import re


# Dictionary of specific abbreviations and their corrections
abbreviation_dict = {
    'im': 'i am', 'miami': 'xiaomi', 'realm': 'realme', 'frm': 'from', 'mob':'mobile', 'approx': 'approximately', 'ok': "okay", 'oplus': 'oneplus',
    'amp': "ampere", "gen": "generation", 'youre': 'you are', 'etc': 'et cetera', 'onepluss': 'oneplus', 'vary':'very', 'barry': 'battery', 'doesnt': "does not",
    'dont': 'do not', 'lvl': 'level', 'v': "we", 'sup': 'super', 'pr': 'product', 'doesnt': "doesnt", 'ill': "i will", 'tactic': 'haptic', 'avg': 'average',
    'uv': 'ultra violet', 'ie': 'in other words', 'vry': 'very', 'easilllyyy': 'easily', 'wont': 'will not', 'yr': 'year', 'kkk': 'okay', 'dslr': 'digital single lens reflex',
    'flipcard': 'flipkart', 'didnt': 'did not', 'wiki': 'wikipedia', 'havent': 'have not', 'mobil': 'mobile', 'prod': 'production', 'wifi': 'wireless fidelity',
    'nfc': "near field communication", 'plz': "please", 'perpomence': "performance", 'phn': "phone", 'camara': 'camera', 'fps': 'frames per second',
    'sd gen': 'snapdragon generation', 'superb': 'super', 'professor': 'processor', 'sound college': 'sound quality', 'dam': 'damn', 'opp': 'oppo', 'aws': 'awesome',
    'vry gd': 'very good', 'supppoppp': 'super', 'prosesar': 'processor', 'approx hr sot': 'approximately hour screen on time', 'u': 'you', 'hr': 'hour',
    'prosper': 'processor', 'usa': 'usage', 'good pro': 'good product', 'gd': 'good', 'nice prod': 'nice product', 'p': 'performance', 'ui': 'user interface',
    'day us': 'day usage', 'prosesar': 'processor', 'battery beast': 'battery best', 'ease': 'easy', 'pub': 'pubg', 'osm': 'awesome', 'worth karma worthuuu': 'worth the money',
    'ois': 'optical image stabilization', 'aim happy': 'i am happy', 'pub cod': 'pubg call of duty', 'degree census': 'degree celsius', 'mint camera': 'main camera',
    'nd': 'and', 'hit issue': 'heating issue', 'ai': 'artificial intelligence', 'wil': 'will','tax': 'thanks', 'phone devils aswoome': 'phone device is awesome','hdr': 'high dynamic range',
    'fo': 'for', 'anthem': 'item', 'heat throttle': 'heating and throttling', 'ill update': 'i will update', 'op ti really': 'oneplus really','one plus ph': 'oneplus phone',
    'spr': 'super', 'apprehension': 'appreciation', 'max': 'maximum', 'kinda': 'kind of', 'medio': 'mediocre', 'btw': 'by the way', 'assam': 'awesome', 
    'ph': 'phone', 'bos': 'range boost', 'beast': 'best', 'dlr': 'digital single lens reflex', 'batter': 'better', 'cod': 'call of duty', 'nyc': 'nice',
    'unvilevebale item': 'unbelievable item', 'supper': 'super', 'op': 'oneplus','assume': 'awesome','osm dolly atoms': 'awesome dolby atmos', 
    'mic failed': 'microphone failed','premium paper': 'premium feel','ver': 'very','gonna': 'going to','pub': 'pubg','gun': 'good','jus': 'just',
    'extent': 'excellent','nice hoon': 'nice phone','gr': 'great','math': 'match','bt': 'but'
}

# Replace abbreviations in the Reviews column
for abbrev, full_form in abbreviation_dict.items():
    # Use word boundaries to ensure only whole words are replaced
    df['Review'] = df['Review'].str.replace(r'\b' + re.escape(abbrev) + r'\b', full_form, regex=True)

# Dictionary of specific phrases and their corrections
phrase_replacement_dict = {
    'pais wasooolllllllllllllllllllllllllllllllll': 'worth the money',
    'jordan phone hai hai logo ska koi competition nhi hai camera sabre best hai im using last day yes u guy wanna buy plz spend get gb version u get complete satisfaction device': 'jordan phone has no competition, camera is the best, i am using for the last day, yes you guys wanna buy please spend and get the gb version, you will get complete satisfaction from the device',
}

# Replace specific phrases in the Reviews column
for phrase, replacement in phrase_replacement_dict.items():
    df['Review'] = df['Review'].str.replace(phrase, replacement, regex=True)

# List of unwanted words
unwanted_words = ['euuuuuuuuuuuuuu', 'imei','xx','o', 'p', 'k', 'x', 'w', 'lt','st', 'isbn', 'gb', 'min', 'mah', 'x x x', 'makhan', 'le', 'bim', 'rd', 
                  'bc','r', 'n', 'nd', 'mm', 'fish', 'apple ka bar', 'mp', 'ppi', 'yrr', 'imei', 'g', 'ce', 'ip', 'lt', 'td', 'bmi', 'gt', 'lea', 'la',
                 'der', 'ir', 'j', 'sp', 'th', 'v', 'cg', 'wee', 'seg', 'gor', 'yh', 'knox', 'uh', 'afeeee', 'mh', 'ing', 'ip', 'vfd', 'rom', 'ir', 'aaeivel',
                 ]

# Remove unwanted words from the Reviews column
def remove_unwanted_words(text):
    pattern = r'\b(' + '|'.join(re.escape(word) for word in unwanted_words) + r')\b'
    return re.sub(pattern, '', text)

df['Review'] = df['Review'].apply(remove_unwanted_words)


In [24]:
output_file = r"D:\Project\Flipkart SA\oneplus_text_cleaned.csv"
df.to_csv(output_file, index=False)