In [26]:
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pymongo
from pymongo import MongoClient
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Fetch the NLTK resources used below (the VADER lexicon, the perceptron
# POS tagger used by nltk.pos_tag, and the stopword corpus), then build the
# shared SentimentIntensityAnalyzer. The downloads must run before the
# analyzer and the stopword list are created.
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
sia = SentimentIntensityAnalyzer()

# MongoDB connection; `collection` is the handle the functions below write to
client = MongoClient("mongodb://localhost:27017/")  # Replace with your MongoDB URI
db = client["Project"]  # Replace with your database name
collection = db["pizza_business_feature_classification"]  # Replace with your collection name

# Set of POS tags treated as non-opinion and filtered out
# (nouns NN/NNS/NNP/NNPS, pronouns PRP/PRP$, prepositions IN)
non_opinion_pos_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'IN'}

# English stopwords to filter out (common words like 'we', 'the', etc.),
# held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def filter_descriptors(descriptors):
    """
    Keep only the descriptors that may carry an opinion.

    A descriptor is dropped when its lower-cased form is in ``stop_words``
    or when NLTK, tagging the word on its own, assigns it one of the POS
    tags in ``non_opinion_pos_tags``. Exact duplicates are collapsed.

    Args:
        descriptors: Iterable of descriptor strings.

    Returns:
        A list of the surviving descriptors, each appearing once, in the
        order of their first occurrence in ``descriptors``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain
    # set() would make the order of the returned list vary between
    # interpreter runs because string hashing is randomized.
    unique_descriptors = dict.fromkeys(descriptors)

    # The stopword test comes first so the comparatively slow tagger is
    # only invoked for words that pass it.
    return [
        word
        for word in unique_descriptors
        if word.lower() not in stop_words
        and nltk.pos_tag([word])[0][1] not in non_opinion_pos_tags
    ]

def classify_descriptor_sentiment(descriptors):
    """
    Count how many descriptors VADER rates positive, negative or neutral.

    The descriptors are first passed through filter_descriptors; every
    descriptor that survives is scored on its own and bucketed by its
    compound score: >= 0.05 is positive, <= -0.05 is negative, anything
    in between is neutral.

    Returns a dict mapping 'positive', 'negative' and 'neutral' to the
    number of descriptors that fell into each bucket.
    """
    counts = {'positive': 0, 'negative': 0, 'neutral': 0}

    for descriptor in filter_descriptors(descriptors):
        compound = sia.polarity_scores(descriptor)['compound']

        # Pick the bucket first, then bump it once
        if compound >= 0.05:
            label = 'positive'
        elif compound <= -0.05:
            label = 'negative'
        else:
            label = 'neutral'
        counts[label] += 1

    return counts

def calculate_sentiment_score(sentiment_counts):
    """
    Convert sentiment counts into the share each sentiment has of the total.

    Formula (applied to each of positive, negative and neutral):
        score = count for that sentiment / sum of all counts

    Args:
        sentiment_counts: Mapping from sentiment label ('positive',
            'negative', 'neutral') to a count. A missing label is
            treated as a count of 0.

    Returns:
        A dict with the keys 'positive_score', 'negative_score' and
        'neutral_score', each a float. All three are 0.0 when the counts
        sum to zero.
    """
    labels = ('positive', 'negative', 'neutral')
    total = sum(sentiment_counts.values())

    # No descriptors at all: report zeros (as floats, so the value type
    # matches the non-empty case) instead of dividing by zero.
    if total == 0:
        return {f'{label}_score': 0.0 for label in labels}

    return {
        f'{label}_score': sentiment_counts.get(label, 0) / total
        for label in labels
    }

def normalize_scores(results):
    """
    Rescale each sentiment score by the largest value of that score found
    anywhere in ``results``, so scores are comparable across businesses.

    ``results`` is expected to be nested as
    ``{business: {category: {'positive_score': ..., 'negative_score': ...,
    'neutral_score': ...}}}``. A category entry that lacks any of the three
    keys is ignored both when searching for the maxima and when rescaling,
    and is left untouched.

    The score dicts are modified in place and the same ``results`` object
    is returned.
    """
    score_keys = ('positive_score', 'negative_score', 'neutral_score')

    # Collect the entries that carry all three scores once, so the maximum
    # search and the rescaling pass work on exactly the same set.
    complete_entries = []
    for categories in results.values():
        for sentiment in categories.values():
            if all(key in sentiment for key in score_keys):
                complete_entries.append(sentiment)

    # Maxima start at 0, so they stay 0 when there is nothing to scan or
    # no score of that type is positive.
    max_scores = dict.fromkeys(score_keys, 0)
    for sentiment in complete_entries:
        for key in score_keys:
            max_scores[key] = max(max_scores[key], sentiment[key])

    for sentiment in complete_entries:
        for key in score_keys:
            # Divide by 1 when the maximum is 0 to avoid ZeroDivisionError
            sentiment[key] /= max_scores[key] if max_scores[key] != 0 else 1

    return results


def adjust_polarity_scores(polarity_scores, neutral_weight_factor=0.5):
    """
    Down-weight the neutral score and share out what is left between the
    positive and negative scores.

    The neutral score is multiplied by ``neutral_weight_factor``. The
    remainder, ``1 - scaled neutral``, is split between positive and
    negative in proportion to their original scores. When positive and
    negative are both zero there is nothing to split: both come back as 0
    and neutral is just the scaled neutral score.

    Returns a new dict with the keys 'positive_score', 'negative_score'
    and 'neutral_score'; the input mapping is not modified.
    """
    scaled_neutral = polarity_scores["neutral_score"] * neutral_weight_factor
    pos = polarity_scores["positive_score"]
    neg = polarity_scores["negative_score"]

    opinion_total = pos + neg
    if opinion_total == 0:
        # No positive or negative signal to distribute the leftover over
        return {
            "positive_score": 0,
            "negative_score": 0,
            "neutral_score": scaled_neutral,
        }

    # Weight still available once the scaled neutral share is taken out
    leftover = 1 - scaled_neutral

    return {
        "positive_score": leftover * (pos / opinion_total),
        "negative_score": leftover * (neg / opinion_total),
        "neutral_score": scaled_neutral,
    }



def update_polarity_scores(business_data, neutral_weight_factor=0.5):
    """
    Compute adjusted polarity scores for every category of every business
    and store them in MongoDB.

    For each business document, every entry of ``feature_classification``
    has its ``descriptors`` classified, converted to scores and adjusted
    with ``neutral_weight_factor``. The results are written to
    ``feature_classification.<category>.polarity_scores`` on the document
    in ``collection`` that has the same ``_id``.

    Args:
        business_data: Iterable of business documents. Each must provide
            ``_id`` and ``feature_classification``, and every category
            entry must provide ``descriptors``.
        neutral_weight_factor: Multiplier for the neutral score, forwarded
            to adjust_polarity_scores.
    """
    for business in business_data:
        # Gather all category results first so each business needs a single
        # round-trip and its categories are written together.
        updates = {}
        for category, category_data in business['feature_classification'].items():
            sentiment_counts = classify_descriptor_sentiment(category_data['descriptors'])
            sentiment_scores = calculate_sentiment_score(sentiment_counts)
            updates[f"feature_classification.{category}.polarity_scores"] = (
                adjust_polarity_scores(sentiment_scores, neutral_weight_factor)
            )

        # An empty $set is rejected by older MongoDB servers and is a no-op
        # on newer ones, so skip the call when there is nothing to write.
        if updates:
            collection.update_one({"_id": business["_id"]}, {"$set": updates})


# Fetch every document in the collection in descending _id order
# (no limit is applied) and materialize them into a list
business_data = list(collection.find().sort("_id", -1))

# Compute and store adjusted polarity scores for the fetched businesses
update_polarity_scores(business_data, neutral_weight_factor=0.5)




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sam\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
