<a href="https://colab.research.google.com/github/SrijaGuduru/SrijaGuduru/blob/main/2203A51743%3E_%3CBACH_25%3E_Lab10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob

# Download the NLTK data packages used by this notebook (nothing is
# pip-installed here; nltk itself must already be available).
# NOTE(review): 'punkt' is not referenced directly by the code in this cell —
# presumably it is needed by TextBlob; confirm before removing.
import nltk  # redundant: nltk is already imported at the top of the cell
nltk.download('punkt')
nltk.download('stopwords')
# Build the stop-word set once so membership tests in clean_text are O(1).
stop_words = set(stopwords.words('english'))

# Load dataset from Google Drive (this import only works inside Google Colab)
from google.colab import drive
# force_remount=True re-mounts the drive even if it is already mounted.
drive.mount('/content/drive', force_remount=True)

# Directory on the mounted drive that holds the dataset (note trailing slash:
# the file name is appended by plain string concatenation below).
path = '/content/drive/My Drive/Colab Notebooks/IRS Project/'
# Read the CSV as latin1-encoded, comma-separated text.
df = pd.read_csv(path + 'AMAZON_FASHION_5_part0.csv', encoding='latin1', delimiter=",")

# Overwrite the column labels positionally with the names used by the rest of
# the notebook ('reviewText' and 'overall' are read later).
# NOTE(review): this assumes the file has exactly five columns in this order —
# verify against the actual dataset, otherwise labels will be wrong or the
# assignment will fail on a length mismatch.
df.columns = ['asin', 'reviewText', 'overall', 'category', 'summary']

# Text Preprocessing function
def clean_text(text):
    """Normalize a raw review for sentiment scoring.

    The text is lower-cased, HTML tags are stripped, every character that is
    not an ASCII letter or whitespace is dropped, and English stop words
    (from the module-level ``stop_words`` set) are removed.

    Args:
        text: The raw review. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN, e.g. an empty cell in
            a loaded CSV) are treated as an empty review.

    Returns:
        The cleaned words joined by single spaces; ``''`` for a missing
        review.
    """
    # Without this guard a missing review is stringified to the literal word
    # 'none' / 'nan' and processed as if the customer had written it.
    # ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    # split() with no argument also collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every raw review, element by element, into a new column
df['cleaned_review'] = df['reviewText'].map(clean_text)

# Sentiment Analysis using TextBlob
def get_sentiment(text):
    """Label *text* as 'Positive', 'Negative' or 'Neutral'.

    The label is derived from TextBlob's polarity score: 0.1 or above is
    'Positive', -0.1 or below is 'Negative', anything in between is
    'Neutral'.
    """
    score = TextBlob(text).sentiment.polarity

    # Start from the middle class and move away from it only when the score
    # reaches one of the two thresholds.
    label = 'Neutral'
    if score >= 0.1:
        label = 'Positive'
    elif score <= -0.1:
        label = 'Negative'
    return label

# Predicted label for each cleaned review
df['sentiment'] = df['cleaned_review'].map(get_sentiment)

# Compare sentiment with actual ratings
def map_rating_to_sentiment(rating):
    """Convert a numeric star rating into a sentiment label.

    Args:
        rating: Numeric rating (int or float).

    Returns:
        'Positive' for ratings of 4 or more, 'Neutral' for ratings from 3 up
        to (but not including) 4, and 'Negative' otherwise.
    """
    if rating >= 4:
        return 'Positive'
    # Use a range rather than ``== 3`` so that a fractional rating such as
    # 3.5 is not labelled 'Negative' while 3 and 4 both rank above it.
    if rating >= 3:
        return 'Neutral'
    # Everything below 3 ends up here; so does NaN, because every comparison
    # against NaN is false.
    return 'Negative'

# Reference label derived from the star rating of each review
df['actual_sentiment'] = df['overall'].map(map_rating_to_sentiment)

# Accuracy = share of rows whose predicted label equals the rating-derived label
accuracy = df['sentiment'].eq(df['actual_sentiment']).mean()
print(f'Sentiment Analysis Accuracy: {accuracy * 100:.2f}%')

# Show the first few reviews next to both labels
print(df[['reviewText', 'sentiment', 'actual_sentiment']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Mounted at /content/drive
Sentiment Analysis Accuracy: 75.52%
                                          reviewText sentiment  \
0                           Great product and price!  Positive   
1      Waaay too small. Will use for futur children!  Negative   
2                    Stays vibrant after many washes  Positive   
3                    Stays vibrant after many washes  Positive   
4  My son really likes the pink. Ones which I was...   Neutral   

  actual_sentiment  
0         Positive  
1          Neutral  
2         Positive  
3         Positive  
4         Positive  
