In [1]:
from pymongo import MongoClient

# Connect to the MongoDB server
client = MongoClient("mongodb://localhost:27017/")

# Access the required database and collections
db = client["Project"]
business_collection = db["Business"]
review_collection = db["Review"]
business_review_collection = db["pizza_business_review"]

# Select businesses where BOTH the categories and the name contain "pizza"
# (case-insensitive) -- the $and requires the two regex conditions together.
pizza_businesses = business_collection.find({
    "$and": [
        {"categories": {"$regex": "Pizza", "$options": "i"}},
        {"name": {"$regex": "Pizza", "$options": "i"}}
    ]
})


# Build one aggregated review document per matching business
for business in pizza_businesses:
    business_id = business["business_id"]
    business_name = business["name"]
    city = business.get("city", "")  # Handle missing city field gracefully
    state = business.get("state", "")  # Handle missing state field gracefully

    # Only the "text" field is read below, so project it to keep the cursor payload small
    reviews = review_collection.find({"business_id": business_id}, {"text": 1, "_id": 0})

    # Concatenate all review texts into a single space-separated paragraph,
    # skipping review documents that carry no "text" field
    review_texts = " ".join(review["text"] for review in reviews if "text" in review)

    # Create a document for the business_review collection
    business_review_doc = {
        "business_id": business_id,
        "business_name": business_name,
        "city": city,
        "state": state,
        "reviews": review_texts  # Single paragraph of reviews
    }

    # Upsert keyed on business_id so that re-running this cell refreshes the
    # existing document instead of inserting a duplicate for the same business
    business_review_collection.replace_one(
        {"business_id": business_id},
        business_review_doc,
        upsert=True,
    )

print("Business review data inserted successfully!")


Business review data inserted successfully!


In [2]:
from pymongo import MongoClient
import spacy
import re

# Load the "en_core_web_sm" SpaCy pipeline once; the tagging and parsing helpers below share it
nlp = spacy.load("en_core_web_sm")

# Open the MongoDB connection, then pick the source collection (aggregated reviews)
# and the destination collection (processed reviews) from the "Project" database
client = MongoClient("mongodb://localhost:27017/")
db = client["Project"]
pizza_business_review_collection = db["pizza_business_review"]
new_collection = db["pizza_business_processed_reviews"]

# Step 1: Define the contractions dictionary
# Maps a lowercase contraction (straight apostrophe) to its expanded form.
# Every key appears exactly once: a repeated key in a dict literal is silently
# overwritten by the later entry, which hides typos.
contractions_dict = {
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "it's": "it is",
    "i'm": "i am",
    "you're": "you are",
    "they're": "they are",
    "we're": "we are",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "there's": "there is",
    "there're": "there are",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "they'll": "they will",
    "we'll": "we will",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "couldn't": "could not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "mightn't": "might not",
    "mustn't": "must not",
    "let's": "let us",
    "that's": "that is",
    "what's": "what is",
    "who's": "who is",
    "here's": "here is",
}

# Function to expand contractions
def expand_contractions(text, contractions):
    """Replace every contraction found in ``text`` with its expanded form.

    Matching is case-insensitive and accepts both the straight (') and the
    typographic (U+2019) apostrophe, so "Don't", "don't" and "don\u2019t" are
    all expanded from a single lowercase "don't" entry.

    Args:
        text: The string to process. Falsy values are returned unchanged.
        contractions: Mapping of contraction -> expansion. Keys are compared
            case-insensitively.

    Returns:
        ``text`` with all whole-word contractions replaced. When the matched
        contraction starts with an uppercase letter, the first letter of the
        expansion is uppercased as well.
    """
    # An empty alternation would compile to r"\b()\b", match empty strings at
    # every word boundary and then fail the lookup -- bail out early instead.
    if not text or not contractions:
        return text

    curly = "\u2019"
    # Normalise keys once so the lookup in _replace is independent of the
    # case / apostrophe style used in the input text.
    lookup = {
        key.lower().replace(curly, "'"): value
        for key, value in contractions.items()
        if key
    }
    if not lookup:
        return text

    # Each apostrophe in a key may be matched by either apostrophe style.
    apostrophe_class = "['" + curly + "]"
    alternatives = (
        apostrophe_class.join(re.escape(part) for part in key.split("'"))
        # Longest first so a longer contraction is never shadowed by a shorter one.
        for key in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(r"\b(" + "|".join(alternatives) + r")\b", re.IGNORECASE)

    def _replace(match):
        found = match.group()
        expansion = lookup[found.lower().replace(curly, "'")]
        if found[0].isupper():
            expansion = expansion[:1].upper() + expansion[1:]
        return expansion

    return pattern.sub(_replace, text)

# Function to clean and preprocess the reviews
def preprocess_reviews(review):
    """Clean a raw review string for downstream NLP.

    Steps, in order: normalise typographic apostrophes, strip HTML tags, drop
    characters other than letters, digits, whitespace and ``. , ! ? '``,
    lowercase, expand contractions via the module-level ``contractions_dict``,
    and collapse whitespace.

    Contractions are expanded AFTER lowercasing: the dictionary keys are all
    lowercase, so expanding first would leave capitalised forms such as
    "Don't" or "I'm" untouched.

    Args:
        review: Raw review text. Falsy input yields an empty string.

    Returns:
        The cleaned, lowercased review text.
    """
    if not review:
        return ""

    # Step 1: Map the typographic apostrophe to a straight one; otherwise the
    # special-character filter below would delete it and turn "don\u2019t" into "dont".
    review = review.replace("\u2019", "'")

    # Step 2: Remove unnecessary characters and HTML tags
    review = re.sub(r"<.*?>", "", review)  # Remove HTML tags
    review = re.sub(r"[^a-zA-Z0-9\s.,!?']", "", review)  # Keep letters, numbers, essential punctuation

    # Step 3: Convert to lowercase
    review = review.lower()

    # Step 4: Expand contractions (apostrophes survived step 2, text is lowercase now)
    review = expand_contractions(review, contractions_dict)

    # Step 5: Remove excessive whitespace
    review = re.sub(r"\s+", " ", review).strip()

    return review

# POS tagging to identify nouns and adjectives
def pos_tagging(review):
    """Split the tokens of ``review`` into candidate features and opinions.

    The text is run through the module-level ``nlp`` pipeline; tokens whose
    coarse part-of-speech tag is ``NOUN`` are reported as features and tokens
    tagged ``ADJ`` as opinions, each list in document order.

    Args:
        review: Text to analyse.

    Returns:
        A dict with two lists of token strings: ``"features"`` and ``"opinions"``.
    """
    parsed = nlp(review)
    nouns = [tok.text for tok in parsed if tok.pos_ == "NOUN"]
    adjectives = [tok.text for tok in parsed if tok.pos_ == "ADJ"]
    return {"features": nouns, "opinions": adjectives}

# Function to extract features and opinions based on dependency parsing
def extract_dependencies(review):
    """Extract (feature, opinion) pairs from ``review`` using the dependency parse.

    Two patterns are recognised on the parse produced by the module-level
    ``nlp`` pipeline:

    * attributive -- a token attached as ``amod`` to a ``NOUN`` head
      ("great pizza") yields ``(head noun, modifier)``;
    * predicative -- an ``ADJ`` attached as ``acomp`` ("the pizza was great")
      is paired with every ``NOUN`` subject (``nsubj`` / ``nsubjpass``) of the
      same head.

    The predicative case must go through the shared head: the head of a
    subject token is the predicate, not a feature noun, so pairing a subject
    token directly with its own head does not give a (feature, opinion) pair.

    Args:
        review: Text to analyse.

    Returns:
        A list of ``(feature_text, opinion_text)`` tuples in document order.
    """
    doc = nlp(review)
    feature_opinion_pairs = []
    for token in doc:
        if token.dep_ == "amod" and token.head.pos_ == "NOUN":
            # Adjective modifier to noun (Feature, Opinion)
            feature_opinion_pairs.append((token.head.text, token.text))
        elif token.dep_ == "acomp" and token.pos_ == "ADJ":
            # Adjectival complement: the feature is the noun subject of the same head
            for sibling in token.head.children:
                if sibling.dep_ in ("nsubj", "nsubjpass") and sibling.pos_ == "NOUN":
                    feature_opinion_pairs.append((sibling.text, token.text))
    return feature_opinion_pairs

# Process each document in the pizza_business_review collection
for document in pizza_business_review_collection.find():
    # Documents without a "reviews" field have nothing to process
    if "reviews" not in document:
        continue

    # Preprocess the concatenated reviews
    cleaned_review = preprocess_reviews(document["reviews"])

    # Apply POS tagging to extract features and opinions
    pos_tagged_data = pos_tagging(cleaned_review)

    # Extract feature-opinion pairs using dependency parsing
    feature_opinion_pairs = extract_dependencies(cleaned_review)

    # Combine results into one structure
    extracted_data = {
        "features_and_opinions": pos_tagged_data,  # POS tagging results
        "feature_opinion_pairs": feature_opinion_pairs  # Dependency parsing results
    }

    # Create the new document structure to store in the new collection
    new_document = {
        "business_id": document["business_id"],
        "business_name": document["business_name"],
        "cleaned_review": cleaned_review,
        "extracted_data": extracted_data
    }

    # Upsert keyed on business_id so that re-running this cell overwrites the
    # previous result for a business instead of inserting a duplicate
    new_collection.replace_one(
        {"business_id": document["business_id"]},
        new_document,
        upsert=True,
    )

print("Processing complete. Data inserted into the new collection.")


Processing complete. Data inserted into the new collection.
