In [25]:
import nltk
from nltk.corpus import wordnet as wn
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pickle

# Download the WordNet corpus that wn.synsets() reads (run once; when the data
# is already present nltk just reports the package as up-to-date)
nltk.download('wordnet')

# Collect depth-decayed weights for the hypernym chain of every sense of a word
def get_weighted_hypernyms(word, decay_factor=0.8):
    """Return a mapping of hypernym synset name -> accumulated weight.

    For each WordNet synset of *word*, the chain of hypernyms is walked
    upwards, always following the first hypernym listed at each step. The
    immediate hypernym gets weight ``decay_factor ** 0``, the next one
    ``decay_factor ** 1``, and so on. The walk for a synset stops when no
    hypernyms remain, or right after recording the first synset whose name
    contains the substring "entity". Weights for the same hypernym reached
    from different synsets are summed.

    Args:
        word: The word to look up in WordNet.
        decay_factor: Base of the exponential decay applied per level.

    Returns:
        dict mapping synset names to summed weights; empty when the word has
        no synsets or none of them has a hypernym.
    """
    totals = {}

    for sense in wn.synsets(word):
        parents = sense.hypernyms()
        level = 0
        while parents:
            node = parents[0]  # only the first hypernym path is followed
            node_name = node.name()
            totals[node_name] = totals.get(node_name, 0) + decay_factor ** level
            if "entity" in node_name:
                break
            parents = node.hypernyms()
            level += 1

    return totals

# Annotated vocabulary: each word maps to the category label ('Food',
# 'Ambience' or 'Service') that it contributes to the training data below.
annotated_words = {'addition': 'Ambience', 'staff':'Service','afternoon': 'Ambience', 'amount': 'Ambience', 'are': 'Ambience', 'area': 'Ambience', 'atmosphere': 'Ambience', 'attitude': 'Service', 'attitudes': 'Service', 'awesome': 'Food', 'bacon': 'Food', 'bar': 'Ambience', 'bartenders': 'Service', 'basil': 'Food', 'basis': 'Service', 'baskets': 'Food', 'batter': 'Food', 'beef': 'Food', 'beer': 'Food', 'bell': 'Ambience', 'bit': 'Food', 'bite': 'Food', 'block': 'Ambience', 'body': 'Ambience', 'bonus': 'Ambience', 'booing': 'Ambience', 'booth': 'Ambience', 'bowl': 'Food', 'box': 'Ambience', 'bread': 'Food', 'breathing': 'Service', 'brewery': 'Ambience', 'burger': 'Food', 'business': 'Ambience', 'caesar': 'Food', 'card': 'Service', 'chain': 'Ambience', 'chains': 'Ambience', 'chances': 'Ambience', 'charm': 'Ambience', 'cheese': 'Food', 'chicken': 'Food', 'choice': 'Food', 'coatings': 'Food', 'combination': 'Food', 'combo': 'Food', 'confusion': 'Service', 'connoisseur': 'Ambience', 'consistency': 'Food', 'container': 'Food', 'cost': 'Service', 'craft': 'Food', 'craving': 'Food', 'crisp': 'Food', 'critique': 'Service', 'crowd': 'Ambience', 'crunch': 'Food', 'crust': 'Food', 'customer': 'Service', 'customers': 'Service', 'date': 'Ambience', 'day': 'Ambience', 'days': 'Ambience', 'deal': 'Service', 'decades': 'Ambience', 'decision': 'Service', 'delicious': 'Food', 'deliciousness': 'Food', 'delivery': 'Service', 'details': 'Service', 'dinner': 'Food', 'dirty': 'Ambience', 'dish': 'Food', 'dishes': 'Food', 'distance': 'Ambience', 'distancing': 'Service', 'door': 'Ambience', 'dough': 'Food', 'doughy': 'Food', 'downtown': 'Ambience', 'dressing': 'Food', 'drink': 'Food', 'drinks': 'Food', 'dump': 'Ambience', 'dust': 'Ambience', 'employees': 'Service', 'ending': 'Service', 'energy': 'Service', 'enthusiasm': 'Service', 'environment': 'Ambience', 'essence': 'Ambience', 'establishment': 'Ambience', 'establishments': 'Ambience', 'evening': 'Ambience', 'experience': 'Service', 
'experiences': 'Service', 'facilities': 'Ambience', 'family': 'Service', 'fan': 'Ambience', 'favorite': 'Food', 'female': 'Service', 'find': 'Service', 'flair': 'Service', 'flavor': 'Food', 'flavors': 'Food', 'floor': 'Ambience', 'food': 'Food', 'foods': 'Food', 'frame': 'Ambience', 'friends': 'Service', 'fries': 'Food', 'game': 'Ambience', 'garden': 'Food', 'garlic': 'Food', 'gem': 'Ambience', 'giardiniera': 'Food', 'girl': 'Service', 'girls': 'Service', 'grace': 'Service', 'grease': 'Food', 'greens': 'Food', 'group': 'Service', 'guest': 'Service', 'guy': 'Service', 'hand': 'Service', 'hill': 'Ambience', 'honesty': 'Service', 'hospitality': 'Service', 'hour': 'Ambience', 'hours': 'Service', 'hut': 'Ambience', 'impression': 'Service', 'individuals': 'Service', 'ingredient': 'Food', 'ingredients': 'Food', 'instances': 'Service', 'instructions': 'Service', 'investment': 'Service', 'irritation': 'Service', 'is': 'Ambience', 'island': 'Ambience', 'issue': 'Service', 'issues': 'Service', 'items': 'Food', 'jem': 'Ambience', 'job': 'Service', 'jobs': 'Service', 'joints': 'Food', 'kinds': 'Food', 'knots': 'Food', 'ladies': 'Service', 'lady': 'Service', 'layer': 'Food', 'lesson': 'Food', 'lettuce': 'Food', 'libation': 'Food', 'life': 'Service', 'lighting': 'Ambience', 'location': 'Ambience', 'locations': 'Ambience', 'looked': 'Ambience', 'looking': 'Ambience', 'mabry': 'Ambience', 'man': 'Service', 'management': 'Service', 'manager': 'Service', 'manner': 'Service', 'manning': 'Service', 'marty': 'Service', 'meal': 'Food', 'meals': 'Food', 'meat': 'Food', 'meatballs': 'Food', 'melty': 'Food', 'memories': 'Service', 'merit': 'Service', 'mess': 'Ambience', 'message': 'Service', 'min': 'Service', 'minute': 'Service', 'minutes': 'Service', 'mix': 'Food', 'money': 'Service', 'months': 'Service', 'mozza': 'Food', 'mushrooms': 'Food', 'name': 'Service', 'neber': 'Service', 'night': 'Ambience', 'nights': 'Ambience', 'notch': 'Ambience', 'note': 'Service', 'number': 'Service', 'ny': 
'Service', 'occasion': 'Service', 'oil': 'Food', 'olives': 'Food', 'one': 'Service', 'ones': 'Service', 'onion': 'Food', 'option': 'Food', 'options': 'Food', 'order': 'Service', 'ordering': 'Service', 'orders': 'Service', 'oregano': 'Food', 'owner': 'Service', 'parlor': 'Ambience', 'part': 'Service', 'patience': 'Service', 'people': 'Service', 'pepper': 'Food', 'pepperoni': 'Food', 'peppers': 'Food', 'person': 'Service', 'phone': 'Service', 'pic': 'Service', 'pie': 'Food', 'pieces': 'Food', 'pizza': 'Food', 'pizzas': 'Food', 'pizzerias': 'Food', 'place': 'Ambience', 'placement': 'Ambience', 'places': 'Ambience', 'plans': 'Service', 'plenty': 'Service', 'plus': 'Service', 'pockets': 'Service', 'pork': 'Food', 'posts': 'Service', 'practices': 'Service', 'price': 'Food', 'prices': 'Food', 'privilege': 'Service', 'problem': 'Service', 'procedures': 'Service', 'puff': 'Food', 'purchases': 'Service', 'quality': 'Food', 'rate': 'Service', 'rating': 'Service', 'raviolis': 'Food', 'reason': 'Service', 'regulations': 'Service', 'restaurants': 'Food', 'review': 'Service', 'reviewer': 'Service', 'reviews': 'Service', 'right': 'Food', 'room': 'Ambience', 'rustys': 'Ambience', 'salad': 'Food', 'sandwiches': 'Food', 'sauce': 'Food', 'sausage': 'Food', 'school': 'Service', 'secret': 'Food', 'section': 'Ambience', 'sections': 'Ambience', 'selection': 'Food', 'self': 'Service', 'server': 'Service', 'service': 'Service', 'shame': 'Service', 'shelf': 'Ambience', 'show': 'Ambience', 'sicilian': 'Food', 'side': 'Ambience', 'situation': 'Ambience', 'size': 'Ambience', 'slice': 'Food', 'slices': 'Food', 'snack': 'Food', 'space': 'Ambience', 'specials': 'Food', 'spicy': 'Food', 'steak': 'Food', 'sticky': 'Food', 'stone': 'Ambience', 'style': 'Ambience', 'suggestion': 'Service', 'taste': 'Food', 'texture': 'Food', 'toppings': 'Food', 'tomatoes': 'Food', 'tofu': 'Food', 'turkey': 'Food', 'uniform': 'Service', 'utensils': 'Service', 'viper': 'Service', 'vibe': 'Ambience', 'waitress': 
'Service', 'waiting': 'Service', 'waiter': 'Service', 'wine': 'Food', 'wings': 'Food', 'woman': 'Service', 'workers': 'Service', 'year': 'Service', 'young': 'Service', 'youth': 'Service'}

# Pair the weighted hypernyms of every annotated word with its category label
training_data = [
    (get_weighted_hypernyms(term), category)
    for term, category in annotated_words.items()
]

# Render each hypernym dict as one space-separated string of "name:weight"
# tokens (weight rounded to two decimals) so it can be fed to a text vectorizer
features = [
    " ".join(f"{name}:{weight:.2f}" for name, weight in hypernym_weights.items())
    for hypernym_weights, _ in training_data
]
labels = [category for _, category in training_data]

# Encode the category names as integer class ids
label_map = {"Food": 0, "Ambience": 1, "Service": 2}
y = np.array([label_map[category] for category in labels])

# Whitespace tokenizer handed to CountVectorizer
def custom_tokenizer(text):
    """Split *text* on runs of whitespace and return the list of tokens.

    Each token is kept intact, so a "name:weight" token is never broken up.
    """
    tokens = text.split()
    return tokens

# Convert features into a bag-of-words model; every whitespace-separated
# "name:weight" string is one vocabulary entry
vectorizer = CountVectorizer(tokenizer=custom_tokenizer)
X = vectorizer.fit_transform(features)

# Handle class imbalance by computing 'balanced' class weights, keyed by the
# actual class ids rather than by position
classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=classes, y=y)
class_weights_dict = {int(cls): weight for cls, weight in zip(classes, class_weights)}

# MultinomialNB has no class_weight parameter, so the class weights are applied
# as per-sample weights: each training row gets the weight of its own class
sample_weights = np.array([class_weights_dict[int(cls)] for cls in y])

# Train the Multinomial Naive Bayes model with class weights
model = MultinomialNB()
model.fit(X, y, sample_weight=sample_weights)

# Save the model and vectorizer
# NOTE: the vectorizer pickle refers to custom_tokenizer by name, so that
# function must be defined before the pickle is loaded again.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer have been saved to 'model.pkl' and 'vectorizer.pkl'.")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model and vectorizer have been saved to 'model.pkl' and 'vectorizer.pkl'.


In [7]:
# Restore the trained classifier from disk.
# NOTE: pickle can execute code while loading; only load files this notebook wrote.
with open('model.pkl', 'rb') as model_fh:
    loaded_model = pickle.load(model_fh)

# Restore the fitted vectorizer (custom_tokenizer must already be defined,
# because the pickle refers to it by name).
with open('vectorizer.pkl', 'rb') as vectorizer_fh:
    loaded_vectorizer = pickle.load(vectorizer_fh)

print("Model and vectorizer loaded successfully.")


Model and vectorizer loaded successfully.


# inserting into collection

In [28]:
from pymongo import MongoClient
import pickle

# Reload the persisted classifier and vectorizer for this cell.
# NOTE: pickle can execute code while loading; only load files this notebook wrote.
with open('model.pkl', 'rb') as model_fh:
    loaded_model = pickle.load(model_fh)

with open('vectorizer.pkl', 'rb') as vectorizer_fh:
    loaded_vectorizer = pickle.load(vectorizer_fh)

print("Model and vectorizer loaded successfully.")

# Function to classify features
def classify_business_features(business_id, business_name, extracted_features, model, vectorizer):
    """Group a business's extracted features under Food / Service / Ambience.

    Every word of each feature is expanded into weighted hypernyms via
    ``get_weighted_hypernyms``; the weights of all words are summed, rendered
    as "name:weight" tokens, vectorized and passed to ``model.predict``.
    Features for which no hypernyms are found are skipped.

    Args:
        business_id: Identifier copied unchanged into the result.
        business_name: Name copied unchanged into the result.
        extracted_features: Iterable of dicts, each holding a "feature" string
            and optionally a "descriptors" list. ``None`` is treated as empty.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        dict with "business_id", "business_name" and "feature_classification";
        the latter maps each category to its "features" and "descriptors" lists.

    Raises:
        KeyError: If an entry of *extracted_features* has no "feature" key.
    """
    # Integer class id -> category name (inverse of the training label encoding);
    # built once instead of on every loop iteration
    label_map = {0: "Food", 1: "Ambience", 2: "Service"}

    classifications = {
        "Food": {"features": [], "descriptors": []},
        "Service": {"features": [], "descriptors": []},
        "Ambience": {"features": [], "descriptors": []},
    }

    # A stored null field arrives here as None rather than as a missing key
    for feature_pair in extracted_features or []:
        feature = feature_pair["feature"]
        # Tolerate an explicit None as well as a missing key
        descriptors = feature_pair.get("descriptors") or []

        # Extract hypernyms and sum their weights over all words of the feature
        combined_hypernyms = {}
        for word in feature.split():
            for name, weight in get_weighted_hypernyms(word).items():
                combined_hypernyms[name] = combined_hypernyms.get(name, 0) + weight

        if not combined_hypernyms:
            continue  # nothing to build a feature vector from

        weighted_features = [f"{name}:{weight:.2f}" for name, weight in combined_hypernyms.items()]
        feature_vector = vectorizer.transform([" ".join(weighted_features)])
        predicted_label = model.predict(feature_vector)[0]

        # Map predicted label to category; unmapped labels are dropped
        category = label_map.get(predicted_label, "Unknown")
        if category != "Unknown":
            classifications[category]["features"].append(feature)
            classifications[category]["descriptors"].extend(descriptors)

    # Return the classification schema
    return {
        "business_id": business_id,
        "business_name": business_name,
        "feature_classification": classifications,
    }

# MongoDB connection
client = MongoClient('mongodb://localhost:27017/')
db = client['Project']  # Replace with your database name
source_collection = db['pizza_business_preprocess_reviews']  # Source collection
target_collection = db['pizza_business_feature_classification']  # Target collection

# Fetch only the fields the classifier needs from every source document
projection = {'business_id': 1, 'business_name': 1, 'extracted_features': 1}
cursor = source_collection.find({}, projection)

# Classify each record's features and store the result as a new document
for record in cursor:
    classified_data = classify_business_features(
        record.get("business_id"),
        record.get("business_name"),
        record.get("extracted_features", []),
        loaded_model,
        loaded_vectorizer,
    )
    target_collection.insert_one(classified_data)

print("Feature classification and insertion completed.")


Model and vectorizer loaded successfully.
Feature classification and insertion completed.
