In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

### Web Scraping from DARAZ 

In [5]:

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")  # Disable automation flag
driver = webdriver.Chrome(options=options)

# URL of the Daraz product page
url = 'https://www.daraz.lk/products/swisstek-multi-purpose-ladder-20ft-i113288341-s1024286031.html'
driver.get(url)

# Wait for the page to load
time.sleep(5)

# Scroll to load reviews (if needed)
for _ in range(3):  # Adjust the number of scrolls based on the number of reviews
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

# Wait for the reviews section to load
try:
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'item-content'))  # Wait for reviews to load
    )
except Exception as e:
    print("Reviews section not found:", e)
    driver.quit()
    exit()

# Find the reviews section
reviews = driver.find_elements(By.CLASS_NAME, 'item-content')  # Update class name if needed

# Extract review text (comments) only
review_data = []
for review in reviews:
    try:
        review_text = review.find_element(By.CLASS_NAME, 'content').text.strip()  # Extract review text
        review_data.append({
            'review_text': review_text
        })
    except Exception as e:
        # Skip reviews with missing data
        print(f"Error extracting review: {e}")
        continue

# Convert to DataFrame
df = pd.DataFrame(review_data)

# Save to CSV
df.to_csv('../Data/webscraped/daraz/reviews.csv', index=False)
print("Reviews saved to 'daraz_reviews.csv'")

# Close the browser
driver.quit()

Error extracting review: Message: no such element: Unable to locate element: {"method":"css selector","selector":".content"}
  (Session info: chrome=133.0.6943.142); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7AF0CC6A5+28789]
	(No symbol) [0x00007FF7AF035B20]
	(No symbol) [0x00007FF7AEEC8F9A]
	(No symbol) [0x00007FF7AEF1F346]
	(No symbol) [0x00007FF7AEF1F57C]
	(No symbol) [0x00007FF7AEF11CFC]
	(No symbol) [0x00007FF7AEF4736F]
	(No symbol) [0x00007FF7AEF11BC6]
	(No symbol) [0x00007FF7AEF47540]
	(No symbol) [0x00007FF7AEF6F7E3]
	(No symbol) [0x00007FF7AEF47103]
	(No symbol) [0x00007FF7AEF0FFC0]
	(No symbol) [0x00007FF7AEF11273]
	GetHandleVerifier [0x00007FF7AF411AED+3458237]
	GetHandleVerifier [0x00007FF7AF42829C+3550316]
	GetHandleVerifier [0x00007FF7AF41DB9D+3507565]
	GetHandleVerifier [0x00007FF7AF192C6A+841274]
	(No symbol) [0x00007FF7AF0409EF

### Some data webscrape from the daraz directly and some are entered manually

In [None]:
"""
## Combining seperate csv files into one file

import os
import pandas as pd
import numpy as np

# Define the folder where your CSV files are stored
folder_path = "../Data/webscraped"  #('../data/webscrape_reviews/reviews.csv', index=False)

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each file and process
for file in csv_files:
    # Construct full file path
    file_path = os.path.join(folder_path, file)

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Extract product type and size from filename (assuming format "productType_size.csv")
    product_type, size = file.replace(".csv", "").split("_")

    # Add new columns
    df["product_type"] = product_type
    df["size"] = size

    # Append to list
    dataframes.append(df)

# Combine all DataFrames
final_df = pd.concat(dataframes, ignore_index=True)

# Add a unique ID column
final_df["ID"] = range(1, len(final_df) + 1)

# Select the required columns
final_df = final_df[["ID", "product_type", "size","review_text","rating"]]


# Save to a new CSV file
final_df.to_csv("../Data/webscraped/combined_reviews.csv", index=False)

print("Dataset successfully created!")

"""

Dataset successfully created!


### Combining dataset and translate using Google Translate API

In [6]:
import os
from googletrans import Translator

# Initialize translator
translator = Translator()

def translate_text(text):
    """Detect language and translate Sinhala text into English."""
    try:
        detected_lang = translator.detect(text).lang
        if detected_lang == "si":  # Sinhala language code
            return translator.translate(text, src="si", dest="en").text
        return text  # Keep English text unchanged
    except:
        return text  # Return original text if error occurs

# Define folder path
folder_path = "../Data/webscraped"

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each file and process
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)

    # Extract product type and size from filename
    product_type, size = file.replace(".csv", "").split("_")

    # Add new columns
    df["product_type"] = product_type
    df["size"] = size

    # Translate review_text
    df["translated_review"] = df["review_text"].astype(str).apply(translate_text)

    # Append to list
    dataframes.append(df)

# Combine all DataFrames
final_df = pd.concat(dataframes, ignore_index=True)

# Add a unique ID column
final_df["ID"] = range(1, len(final_df) + 1)

# Select final columns
final_df = final_df[["ID", "product_type", "size", "review_text", "translated_review", "rating"]]

# Save to new CSV file
final_df.to_csv("../Data/google_API/translated_reviews.csv", index=False)

print("Translated dataset successfully created!")


Translated dataset successfully created!


### ML model

In [None]:
"""
from transformers import MarianMTModel, MarianTokenizer

# Load pre-trained translation model (Sinhala → English)
model_name = "Helsinki-NLP/opus-mt-si-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_sinhala(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**inputs)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Apply translation to Sinhala reviews
final_df["../Data/google_API/translated_review"] = final_df["review_text"].apply(translate_sinhala)

# Save the dataset
final_df.to_csv("translated_reviews.csv", index=False)

"""

'\nfrom transformers import MarianMTModel, MarianTokenizer\n\n# Load pre-trained translation model (Sinhala → English)\nmodel_name = "Helsinki-NLP/opus-mt-si-en"\ntokenizer = MarianTokenizer.from_pretrained(model_name)\nmodel = MarianMTModel.from_pretrained(model_name)\n\ndef translate_sinhala(text):\n    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)\n    translated_tokens = model.generate(**inputs)\n    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)\n\n# Apply translation to Sinhala reviews\nfinal_df["translated_review"] = final_df["review_text"].apply(translate_sinhala)\n\n# Save the dataset\nfinal_df.to_csv("translated_reviews.csv", index=False)\n\n'

In [7]:
import pandas as pd

# Load the translated dataset
df = pd.read_csv("../Data/google_API/translated_reviews.csv")

# Rename the translated column to 'text'
df.rename(columns={"translated_review": "text"}, inplace=True)

# Select required columns
final_df = df[["ID", "product_type", "size", "text", "rating"]]

# Save the new CSV file
final_df.to_csv("../Data/google_API/reviews.csv", index=False)

print("Review dataset successfully created!")


Review dataset successfully created!
