In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Scrape product records from the paginated category listing until
# TARGET_ITEM_COUNT items are collected, the site stops returning products,
# a request fails, or MAX_PAGES pages have been visited.
# NOTE(review): the CSS class names used below and the '₹' currency symbol
# should be confirmed against the live page markup -- TODO verify selectors.
CATEGORY_URL = "https://www.noon.com/uae-en/sports-and-outdoors/exercise-and-fitness/yoga-16328"
TARGET_ITEM_COUNT = 200
MAX_PAGES = 50            # hard upper bound so the loop can never run forever
REQUEST_TIMEOUT = 15      # seconds to wait for the server before giving up
MISSING_VALUE = "N/A"


def _text_or_na(parent, tag, css_class):
    """Return the text of the first `tag` with class `css_class` under `parent`, or "N/A"."""
    node = parent.find(tag, class_=css_class)
    return node.text if node is not None else MISSING_VALUE


def _price_or_na(parent, css_class):
    """Return the matching div's text without '₹' and ',' characters, or "N/A"."""
    node = parent.find('div', class_=css_class)
    if node is None:
        return MISSING_VALUE
    return node.text.replace('₹', '').replace(',', '')


# Initialize the list to store item data
items = []

page = 1
while len(items) < TARGET_ITEM_COUNT and page <= MAX_PAGES:
    url = f"{CATEGORY_URL}/{page}"
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # A network or HTTP error will not fix itself by asking for the next page.
        print(f"Request for page {page} failed: {exc}")
        break

    soup = BeautifulSoup(response.content, 'lxml')

    # Find all product divs
    product_divs = soup.find_all('div', '_2kHMtA')
    if not product_divs:
        # Without this guard an empty page would keep the loop spinning forever,
        # because `items` would never grow.
        print(f"No products found on page {page}; stopping.")
        break

    for product in product_divs:
        if len(items) >= TARGET_ITEM_COUNT:  # Stop once enough items are collected
            break

        try:
            link_tag = product.find('a', href=True)

            items.append({
                'datetime': _text_or_na(product, 'span', 'date-class'),
                'sku': product.get('data-sku', MISSING_VALUE),
                'name': _text_or_na(product, 'div', '_4rR01T'),
                'brand': _text_or_na(product, 'div', 'brand-class'),
                'average_rating': _text_or_na(product, 'div', '_3LWZlK'),
                'rating_count': _text_or_na(product, 'span', 'rating-count-class'),
                'old_price': _price_or_na(product, 'old-price-class'),
                'new_price': _price_or_na(product, '_30jeq3 _1_WHN1'),
                'rank': _text_or_na(product, 'span', 'rank-class'),
                # Only prefix the site root when a link exists, so a missing
                # link is stored as "N/A" instead of a malformed URL.
                'item_link': (
                    f"https://www.noon.com{link_tag['href']}"
                    if link_tag is not None else MISSING_VALUE
                ),
            })
        except Exception as e:
            print(f"Error processing product: {e}")

    print(f"Page {page} processed, total items collected: {len(items)}")
    page += 1
    time.sleep(2)  # Add delay to avoid overwhelming the server

# Persist the scraped records: one CSV row per item, one column per field.
csv_file = "noon_items.csv"
fieldnames = [
    'datetime',
    'sku',
    'name',
    'brand',
    'average_rating',
    'rating_count',
    'old_price',
    'new_price',
    'rank',
    'item_link',
]

# newline='' lets the csv module control line endings itself.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for record in items:
        writer.writerow(record)

print(f"Data saved to {csv_file}")


In [None]:
from bs4 import BeautifulSoup
import requests
import csv

# Scrape the product cards from a single category page into `products`.
# NOTE(review): the 'sc-...' class names look build-generated and may change
# between site deployments -- confirm against the current markup.


def _node_text(parent, tag, strip=False, **criteria):
    """Return the text of the first matching child of `parent`, or 'N/A'.

    `criteria` is forwarded to BeautifulSoup's `find` (e.g. `class_=...` or
    `attrs={...}`). When `strip` is true, surrounding whitespace is removed.
    """
    node = parent.find(tag, **criteria)
    if node is None:
        return 'N/A'
    return node.text.strip() if strip else node.text


# Initialize list for storing product details
products = []

# Sample URL (update with your URL)
url = "https://www.noon.com/uae-en/sports-and-outdoors/exercise-and-fitness/yoga-16328/"

# Fetch and parse the page; a failed request leaves an empty result set
# instead of hanging indefinitely or parsing an error page.
try:
    response = requests.get(url, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate all product containers
    product_containers = soup.find_all('div', class_='sc-57fe1f38-1')
except requests.RequestException as exc:
    print(f"Request failed: {exc}")
    product_containers = []

if not product_containers:
    print("No product containers found; the output will contain no rows.")

for product in product_containers:
    try:
        # A missing name element falls back to 'N/A' like every other field,
        # rather than discarding the whole product.
        name_tag = product.find('div', {'data-qa': 'product-name'})
        name = name_tag.get('title', 'N/A') if name_tag is not None else 'N/A'

        # The wrapper div may exist without an <img> (or without a src).
        image_box = product.find('div', class_='sc-d8caf424-2')
        image_tag = image_box.img if image_box is not None else None
        image_url = image_tag.get('src', 'N/A') if image_tag is not None else 'N/A'

        link_tag = product.find('a', href=True)

        # Append details to list
        products.append({
            'Name': name,
            'Price': _node_text(product, 'strong', class_='amount'),
            'Old Price': _node_text(product, 'span', class_='oldPrice'),
            'Discount': _node_text(product, 'span', class_='discount'),
            'Image URL': image_url,
            'Delivery Info': _node_text(product, 'div', strip=True, class_='sc-4d61bf64-3'),
            'Noon Express': _node_text(
                product, 'div', strip=True, attrs={'data-qa': 'product-noon-express'}
            ),
            # Only prefix the site root when a link exists, so a missing link
            # is stored as 'N/A' instead of a malformed URL.
            'Product Link': (
                f"https://www.noon.com{link_tag['href']}"
                if link_tag is not None else 'N/A'
            ),
        })
    except Exception as e:
        print(f"Error extracting product: {e}")

# Persist the scraped products: one CSV row per product, one column per field.
csv_file = "products.csv"
fieldnames = [
    'Name',
    'Price',
    'Old Price',
    'Discount',
    'Image URL',
    'Delivery Info',
    'Noon Express',
    'Product Link',
]

# newline='' lets the csv module control line endings itself.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for record in products:
        writer.writerow(record)

print(f"Data saved to {csv_file}")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

# Preprocess the tweet
def preprocess_tweet(tweet):
    """Mask user mentions and links in `tweet` with fixed placeholder tokens.

    The text is split on single spaces. Any piece that starts with '@' and is
    longer than one character becomes '@user'; any piece that starts with
    'http' becomes 'http'; everything else is kept unchanged. The pieces are
    then joined back together with single spaces.
    """
    def _mask(token):
        # A lone '@' is not treated as a mention and falls through untouched.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return ' '.join(_mask(token) for token in tweet.split(' '))

# Checkpoint identifier passed to both `from_pretrained` calls below.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

# Build the tokenizer and the sequence classifier from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Display names used for the score positions, in index order.
labels = [
    'Negative',
    'Neutral',
    'Positive',
]

def sentiment_analysis():
    """Prompt for a sentence, score it with the module-level model and print the result.

    Reads one line from standard input, masks mentions and links with
    `preprocess_tweet`, tokenizes the text, runs `model` on it and prints the
    softmax of the first row of the model's first output, one line per entry,
    next to the entry of `labels` at the same index. Returns nothing.
    """
    raw_text = input("Enter a tweet or sentence: ")

    # Tokenize the cleaned text into tensors the model accepts as keyword arguments.
    model_inputs = tokenizer(preprocess_tweet(raw_text), return_tensors='pt')
    raw_scores = model(**model_inputs)[0][0]

    # Detach from the autograd graph before converting to a NumPy array.
    probabilities = softmax(raw_scores.detach().numpy())

    print("\nSentiment Analysis Results:")
    for position, probability in enumerate(probabilities):
        print(f"{labels[position]}: {probability:.4f}")

# Start the interactive prompt only when this code runs as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    sentiment_analysis()


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

# Example text to classify.
tweet = 'yohoho! my name is Paarth and I am happy 😉'

# Preprocess the tweet: split on single spaces, replace mentions ('@' followed
# by at least one character) with '@user' and anything starting with 'http'
# with 'http', then join the pieces back together.
tweet_words = [
    '@user' if (len(token) > 1 and token.startswith('@'))
    else "http" if token.startswith('http')
    else token
    for token in tweet.split(' ')
]
tweet_proc = " ".join(tweet_words)

# Load the tokenizer and the classifier from the same checkpoint identifier.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Display names used for the score positions, in index order.
labels = ['Negative', 'Neutral', 'Positive']

# Sentiment analysis: tokenize, run the model, and turn the first row of its
# first output into probabilities with softmax.
encoded_tweet = tokenizer(tweet_proc, return_tensors='pt')
output = model(**encoded_tweet)

scores = softmax(output[0][0].detach().numpy())

# Print each label next to the score at the same index.
for position, score in enumerate(scores):
    print(labels[position], score)