Sentiment Enhanced Food Search Engine - 'Favorfeel'

Data Preprocessing
1. Text Preprocessing:
2. Convert text to lowercase.
3. Remove punctuation and special characters.
4. Tokenization: Split text into words.
5. Remove stop words.
6. Lemmatization or stemming.

In [73]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def text_preprocessing(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation and special characters
    tokens = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

# Load dataset
# from google.colab import drive
# drive.mount('/content/drive')
# data = pd.read_csv('/content/drive/MyDrive/Implementation 2024 Final Year Project/output_1.csv')
data = pd.read_csv('output_1.csv')

# Apply text preprocessing to 'Review' column
data['Comment'] = data['Comment'].apply(text_preprocessing)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [74]:
# print preprocessed data

print(data['Comment'].tail())


4922    wifi yesambiance 7 10taste meter 6 5 10price e...
4923    love first bite since 1st visit place everytim...
4924    one best place burger love place taste staff p...
4925    sauce biggies us spectacular burger worth mone...
4926    exquisite place burger ambience unique kid fri...
Name: Comment, dtype: object


# Build Vocabulary:
Create a vocabulary from the tokenized words in the reviews.

In [75]:
from collections import Counter

# Build vocabulary
all_words = ' '.join(data['Comment']).split()
word_counts = Counter(all_words)
vocabulary = list(word_counts.keys())

# Vectorization:
Convert text into numerical format using techniques like Bag of Words or TF-IDF.

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer(vocabulary=vocabulary)
X = tfidf_vectorizer.fit_transform(data['Comment'])
y = data['Rating']

In [77]:
X

<4927x9086 sparse matrix of type '<class 'numpy.float64'>'
	with 109640 stored elements in Compressed Sparse Row format>

In [78]:
y

0       5.0
1       3.0
2       1.0
3       4.0
4       1.0
       ... 
4922    4.0
4923    5.0
4924    5.0
4925    5.0
4926    5.0
Name: Rating, Length: 4927, dtype: float64

# Handle Imbalanced Dataset:
Since the dataset is imbalanced, we can use techniques like oversampling or undersampling.

In [79]:
print(y.dtype)

float64


In [80]:
print(y.tail())

4922    4.0
4923    5.0
4924    5.0
4925    5.0
4926    5.0
Name: Rating, dtype: float64


In [81]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [82]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)

In [83]:
# from imblearn.over_sampling import RandomOverSampler

# # Handle imbalanced dataset using RandomOverSampler
# ros = RandomOverSampler(random_state=42)
# X_resampled, y_resampled = ros.fit_resample(X, y)

# Model Building

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
%pip install seaborn
import seaborn as sns

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)




In [85]:
x_test

<2376x9086 sparse matrix of type '<class 'numpy.float64'>'
	with 76470 stored elements in Compressed Sparse Row format>

In [86]:
x_test

<2376x9086 sparse matrix of type '<class 'numpy.float64'>'
	with 76470 stored elements in Compressed Sparse Row format>

In [87]:
from sklearn import svm
from sklearn.metrics import roc_auc_score

# Create the SVC model with probability=True
model = svm.SVC(probability=True)

# Fit the model
model.fit(x_train, y_train)

# ROC/AUC Curve
y_probs = model.predict_proba(x_test)
macro_roc_auc_ovr = roc_auc_score(y_test, y_probs, multi_class="ovr", average="macro")
print(f"ROC AUC Score (Macro): {macro_roc_auc_ovr}")

ROC AUC Score (Macro): 0.9891571844771048


Get Predictions

To Get predictions, we're using most accurate model, Support Vector Machine (SVM)


In [88]:
# Predict on new datanew_comment = "The food was delicious and the service was great!"

new_comment = "The Cheese Kotthu is a standout, arguably the best in Colombo."
new_comment_preprocessed = text_preprocessing(new_comment)
new_comment_vectorized = tfidf_vectorizer.transform([new_comment_preprocessed])
predictions = {}
for model_name, model in models.items():
    predictions[model_name] = model.predict(new_comment_vectorized)
for model_name, prediction in predictions.items():
    print(f"Predictions for {model_name}: {le.inverse_transform(prediction)}")

Predictions for Naive Bayes: [5.]
Predictions for Logistic Regression: [5.]
Predictions for Decision Tree: [5.]
Predictions for Random Forest: [5.]
Predictions for Support Vector Machine: [5.]


In [89]:
# prompt: Give the code to get predictions

# Predict on new datanew_comment = "The food was delicious and the service was great!"
new_comment_preprocessed = text_preprocessing(new_comment)
new_comment_vectorized = tfidf_vectorizer.transform([new_comment_preprocessed])

prediction = model.predict(new_comment_vectorized)
print(f"Predicted Rating: {le.inverse_transform(prediction)}")


Predicted Rating: [5.]


In [90]:
%pip install joblib
import joblib

# Save the SVM model as a pickle file
joblib.dump(model, 'svm_model.pkl')




['svm_model.pkl']

### Define Variables

In [91]:
data_csv_file = 'data.csv'

### Import Required Libraries

In [92]:
import pandas as pd
import json

### Read CSV


In [93]:
df = pd.read_csv(data_csv_file,encoding="latin1",usecols=range(5))
df.head()

Unnamed: 0,Restaurant Name,Location,Foods,Keywords,Comments
0,BARS cafe,"6.898030321645603, 79.8700224285028",Rice and Curry,"rice, rice and curry, Rice, Curry with rice, C...",I have been buying their rice and curry for a ...
1,BARS cafe,"6.898030321645603, 79.8700224285028",Kottu,"kottu, kothtu, kotthu, kottu roti, Kottu, KOTT...",Food is good but expensive
2,BARS cafe,"6.898030321645603, 79.8700224285028",Kottu,"kottu, kothtu, kotthu, kottu roti, Kottu, KOTT...",Worst food. Ordered kottu. It smelled awful.
3,BARS cafe,"6.898030321645603, 79.8700224285028",Kottu,"kottu, kothtu, kotthu, kottu roti, Kottu, KOTT...",They have a delicious range of kottus like gri...
4,BARS cafe,"6.898030321645603, 79.8700224285028",Egg Fried Rice,"Egg, Fried, Rice, Fried Rice, Egg fried rice","I had the worst experience ever, I ordered egg..."


### Load ML Model


In [94]:
svm_model = None
with open('svm_model.pkl', 'rb') as f:
    svm_model = joblib.load(f)

### Build Basic JSON

In [95]:
foods=[]
id=0
for index, row in df.iterrows():
  food_name = row[2]
  keywords = row[3]

  # Add Food If Not Exists
  already_has_food_name = False
  for food in foods:
    if food_name == food["food_name"]:
      already_has_food_name=True
      break

  if not already_has_food_name:
    foods.append({
        "food_id":id,
        "food_name":food_name,
        "keywords":[word.strip() for word in str(keywords).split(",")],
        "shops":[]
    })
    id+=1


  # Select Specific Food
  current_food = None
  for food in foods:
    if food_name == food["food_name"]:
      current_food = food
      break

  # Get Current Shop Name
  shop_name = row[0]
  coordinates = row[1]


  # Add Shop If Not Exists
  already_has_shop_name_in_food = False
  for shop in current_food["shops"]:
    if shop_name == shop["shop_name"]:
      already_has_shop_name_in_food=True
      break

  if not already_has_shop_name_in_food:
    current_food["shops"].append({
        "shop_name":shop_name,
        "coordinates":[float(loc.strip()) for loc in str(coordinates).split(",")],
        "comments":[]
    })

  # Add Comments To Shop
  new_comment_preprocessed = text_preprocessing(row[4])
  new_comment_vectorized = tfidf_vectorizer.transform([new_comment_preprocessed])
  prediction = svm_model.predict(new_comment_vectorized)
  sentimental_rating = le.inverse_transform(prediction)
  for shop in current_food["shops"]:
    if shop_name == shop["shop_name"]:
      shop["comments"].append(
          {
              "text":row[4],
              "sentimental_rating": sentimental_rating[0]
          }
      )


### Use Model And Build JSON With Overall Sentimental Ratings



In [96]:
for food in foods:
  for shop in food["shops"]:
    ratings = []
    for comment in shop["comments"]:
      ratings.append(comment["sentimental_rating"])
    shop["overall_sentimental_rating"]= np.mean(ratings)


print(json.dumps(foods))
with open("output.json", 'w') as json_file:
    json.dump(foods, json_file)

[{"food_id": 0, "food_name": "Rice and Curry", "keywords": ["rice", "rice and curry", "Rice", "Curry with rice", "Curry", "bath"], "shops": [{"shop_name": "BARS cafe", "coordinates": [6.898030321645603, 79.8700224285028], "comments": [{"text": "I have been buying their rice and curry for a while, and the food has always been delicious. They currently sell at 550, which is reasonable compared to other shops in the area. Only card payments are accepted.", "sentimental_rating": 5.0}], "overall_sentimental_rating": 5.0}, {"shop_name": "Hotel De Plaza", "coordinates": [6.9101334265701615, 79.85121759508941], "comments": [{"text": "Good place to hv non veg dinner.   Cost wise very worth.", "sentimental_rating": 4.0}], "overall_sentimental_rating": 4.0}, {"shop_name": "Palmyrah Restaurant", "coordinates": [6.911921541358405, 79.85039971705802], "comments": [{"text": "There is a superb selection of rice, curries, sambols and of course appetizers.  In addition, there are mouthwatering desserts.