In [None]:
import nltk
from nltk import pos_tag, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
from itertools import combinations
import spacy
from pymongo import MongoClient

# Ensure NLTK resources are downloaded
nltk.download('averaged_perceptron_tagger')  # tagger model used by pos_tag
nltk.download('stopwords')  # word lists read through stopwords.words('english')
# WordNetLemmatizer looks words up in the WordNet corpus; without this
# download the first lemmatize() call fails with a LookupError.
nltk.download('wordnet')

# Initialize spaCy model
nlp = spacy.load('en_core_web_sm')  # Load spaCy's small English model

# Step 1: Extract Nouns from POS Tags
def extract_nouns(pos_tags):
    """Pick the noun tokens out of a sequence of ``(word, tag)`` pairs.

    Common nouns (``NN``, ``NNS``) are kept exactly as written, singular
    proper nouns (``NNP``) are lowercased, and every other tag is skipped.

    Returns:
        list: the selected words in their original order.
    """
    common_noun_tags = {'NN', 'NNS'}
    return [
        token if label in common_noun_tags else token.lower()
        for token, label in pos_tags
        if label in common_noun_tags or label == 'NNP'
    ]

# Step 2: Frequent Feature Extraction
def apriori_frequent_itemsets(baskets, min_support):
    """Find itemsets of one to three items that occur in enough baskets.

    Support is the number of baskets containing an itemset, so an item that
    is repeated inside one basket counts once for that basket. Itemsets are
    tuples in sorted order, which makes ``('crust', 'pizza')`` the single key
    for that pair no matter which word came first in the text.

    Args:
        baskets: iterable of iterables of hashable, mutually orderable items
            (e.g. lists of noun strings).
        min_support: minimum number of baskets an itemset must appear in.

    Returns:
        dict: maps each frequent itemset tuple to its support count.
    """
    # Deduplicate and order every basket once; both passes reuse the result.
    transactions = [sorted(set(basket)) for basket in baskets]

    # Pass 1: support of the single items.
    support = Counter()
    for items in transactions:
        support.update((item,) for item in items)
    frequent_items = {
        itemset[0] for itemset, count in support.items() if count >= min_support
    }

    # Pass 2: Apriori pruning -- a pair or triple can only reach min_support
    # if each of its items does, so candidates use frequent items only. This
    # avoids enumerating every combination of every long basket.
    for items in transactions:
        candidates = [item for item in items if item in frequent_items]
        for size in (2, 3):
            support.update(combinations(candidates, size))

    return {
        itemset: count for itemset, count in support.items() if count >= min_support
    }

# Step 3: Get Top Frequent Features
def get_top_frequent_features(itemsets, top_n=5):
    """Return the ``top_n`` highest-count entries of ``itemsets`` as a dict.

    Entries come out in descending count order; entries with equal counts
    keep the relative order they had in ``itemsets``.
    """
    ranked = list(itemsets.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return {itemset: count for itemset, count in ranked[:top_n]}

# Step 4: Extract Descriptors
def extract_descriptors(sentences, frequent_features):
    """Collect descriptor words for frequent features via dependency parsing.

    Every sentence is parsed with the module-level spaCy pipeline ``nlp``.
    A token whose dependency label is ``amod``, ``nsubj``, ``advmod`` or
    ``compound`` is recorded as a descriptor of its head when the head's text
    matches a word of some frequent itemset. An ``advmod`` token attached to
    an ``amod`` token is additionally recorded as a two-word phrase (e.g.
    "very delicious") against the word that the ``amod`` token modifies.

    Args:
        sentences: iterable of 4-item records; only the last item, the
            sentence text, is used here.
        frequent_features: iterable of itemsets (tuples of words), such as
            the keys of the dict returned by ``get_top_frequent_features``.

    Returns:
        defaultdict: keyed by the first word of the matched itemset. Each
        value is a dict with ``"count"`` (number of recorded descriptors),
        ``"examples"`` (the space-joined tokens of each contributing
        sentence) and ``"descriptors"`` (set of lowercased descriptors).
    """
    # Map every lowercased feature word to the key of the first itemset that
    # contains it, so each token needs one dict lookup instead of a scan over
    # all itemsets. setdefault keeps the first-match-wins behaviour.
    feature_key_by_word = {}
    for feature in frequent_features:
        for word in feature:
            feature_key_by_word.setdefault(word.lower(), feature[0])

    freq_features = defaultdict(lambda: {"count": 0, "examples": [], "descriptors": set()})

    def record(head_text, descriptor, example):
        """Attach ``descriptor`` to the feature matching ``head_text``, if any."""
        word = head_text.lower()
        if word not in feature_key_by_word:
            return
        entry = freq_features[feature_key_by_word[word]]
        entry["descriptors"].add(descriptor)
        entry["count"] += 1
        entry["examples"].append(example)

    for _sentence_id, _words, _pos_tags, sentence in sentences:
        doc = nlp(sentence)  # Parse the sentence using spaCy
        example = " ".join(token.text for token in doc)
        for token in doc:
            # Single-word descriptors: the token describes its syntactic head.
            if token.dep_ in ('amod', 'nsubj', 'advmod', 'compound'):
                record(token.head.text, token.text.lower(), example)

            # Capture multi-word phrases (e.g., "very delicious")
            if token.dep_ == 'advmod' and token.head.dep_ == 'amod':
                phrase = f"{token.text} {token.head.text}".lower()
                record(token.head.head.text, phrase, example)

    return freq_features

# Step 5: Filter Descriptors and Features
def clean_descriptors_and_features(freq_features):
    """Filter each feature's descriptors and drop features left with none.

    A descriptor is kept only if it is not a stopword, is longer than one
    character, is purely alphabetic, and is not tagged as a noun when passed
    to ``pos_tag`` on its own. Kept descriptors are lemmatized. Descriptors
    containing a space fail ``str.isalpha`` and are therefore removed.

    Args:
        freq_features: mapping of feature -> dict with ``"count"``,
            ``"examples"`` and ``"descriptors"`` entries.

    Returns:
        defaultdict: same shape as the input, restricted to features that
        still have at least one descriptor.
    """
    ignored_words = set(stopwords.words('english'))
    ignored_words.update(
        {"a", "an", "the", "of", "in", "on", "for", "to", "with", "by", "at", "as", "is", "it"}
    )
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    lemmatizer = WordNetLemmatizer()

    cleaned_features = defaultdict(lambda: {"count": 0, "examples": [], "descriptors": set()})

    for feature, details in freq_features.items():
        kept = set()
        for candidate in details["descriptors"]:
            # Stopwords, single letters and non-alphabetic strings go first.
            if candidate in ignored_words or len(candidate) <= 1 or not candidate.isalpha():
                continue
            # The word is tagged in isolation; nouns are not descriptors.
            tag = pos_tag([candidate])[0][1]
            if tag in noun_tags:
                continue
            kept.add(lemmatizer.lemmatize(candidate))

        if not kept:
            continue  # Only include features with valid descriptors

        entry = cleaned_features[feature]
        entry["descriptors"] = kept
        entry["count"] = details["count"]
        entry["examples"] = details["examples"]

    return cleaned_features

# Step 6: Combine Results with Filtering
def process_reviews(preprocessed_data, min_support=2, top_n=20):
    """Run the whole pipeline: nouns -> frequent itemsets -> descriptors -> cleanup.

    Args:
        preprocessed_data: sequence of 4-item records; the third item holds
            the ``(word, tag)`` pairs of one sentence.
        min_support: threshold passed to ``apriori_frequent_itemsets``.
        top_n: number of itemsets kept by ``get_top_frequent_features``.

    Returns:
        tuple: ``(top_features, cleaned_features)``.
    """
    noun_baskets = []
    for record in preprocessed_data:
        _, _, tags, _ = record
        noun_baskets.append(extract_nouns(tags))

    top_features = get_top_frequent_features(
        apriori_frequent_itemsets(noun_baskets, min_support),
        top_n,
    )
    described = extract_descriptors(preprocessed_data, top_features)
    return top_features, clean_descriptors_and_features(described)

# Step 7: Update MongoDB
client = MongoClient("mongodb://localhost:27017/")  # Update with your MongoDB connection details
db = client["Project"]  # Replace with your database name
collection = db["pizza_business_preprocess_reviews"]  # Replace with your collection name

# Each document is processed independently and its extracted features are
# written back onto that same document.
for document in collection.find():  # Retrieve all documents
    preprocessed_data = document.get("preprocessed_data") or []
    if not preprocessed_data:
        # Reported per document: other documents may still contain data.
        print(f"No preprocessed data found for document {document['_id']}.")
        continue

    business_id = document.get("business_id", "Unknown")  # Used for progress output only

    # Process reviews and clean features
    frequent_features, cleaned_features = process_reviews(preprocessed_data)

    # Prepare cleaned features with descriptors; sorting makes the stored
    # list order reproducible (set iteration order is not).
    extracted_features = [
        {"feature": feature, "descriptors": sorted(details["descriptors"])}
        for feature, details in cleaned_features.items()
    ]

    # Target the document by _id rather than business_id: a missing or
    # duplicated business_id would otherwise update a different document, or
    # upsert a stray {"business_id": "Unknown"} one. The document was just
    # read from the cursor, so no upsert is needed.
    collection.update_one(
        {"_id": document["_id"]},
        {"$set": {"extracted_features": extracted_features}},
    )
    print(business_id)