In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np

In [2]:
# Read the two raw CSV exports: one row per review, and the hotel table.
reviews_data = pd.read_csv('../Data/Raw/Hotels/Hotel_Reviews.csv')
hotel_data = pd.read_csv('../Data/Raw/Hotels/Hotel.csv')
# Inner-join the tables on their shared 'id' column (reviews on the left);
# only ids present in both files survive the join.
merged_data = reviews_data.merge(hotel_data, on='id', how='inner')

In [3]:
# Hold out the very first merged row as a single test example (a Series);
# every remaining row forms the training frame.
test = merged_data.iloc[0]
train = merged_data.iloc[1:]

In [4]:
# Cosine-similarity task.
# Learn a word-level TF-IDF vocabulary from the training reviews and encode
# them as a sparse document-term matrix (one row per review).
vectorizer = TfidfVectorizer(analyzer='word')
tfidf_matrix = vectorizer.fit_transform(train.loc[:, 'user_review'].values)

# All-pairs similarity of the training reviews with themselves.
# NOTE(review): this returns a dense n_reviews x n_reviews array. A later
# output shows row index 27142, so n_reviews >= 27143 and the array holds
# over 700 million entries. No visible cell reads `cosine_similarities` --
# confirm it is needed before keeping this line.
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [5]:
# Persist the fitted vectorizer next to the notebook so the learned
# vocabulary and IDF weights can be reused without refitting.
with open('vectorizer.pkl', 'wb') as fh:
    pickle.dump(vectorizer, fh)

In [15]:
# Free-text review used as the lookup query (kept verbatim).
testing_data = "Simply the BEST This was my wife and my second visit in just over a year and simply put its the best. Our hosts Fred and Rowena as usual were cordial and caring, respectful when need be and lively conversationests when invited. The acomidations are spotless and comfortable, a home away from."

# Restore the fitted vectorizer from disk.
# pickle.load can execute arbitrary code: only unpickle files this notebook
# wrote itself.
with open('vectorizer.pkl', 'rb') as fh:
    loaded_vectorizer = pickle.load(fh)

# Encode the query with the already-learned vocabulary (transform only, no
# refit), giving a matrix with a single row.
new_tfidf_matrix = loaded_vectorizer.transform([testing_data])

# Score the query against every row of the training TF-IDF matrix.
cosine_similarities_new = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

In [16]:
# Sort each query row's similarities in ascending order and keep the last
# three column positions, i.e. the three highest scores (best match last).
most_similar_indices = cosine_similarities_new.argsort(axis=1)[:, -3:]

In [13]:
# Display the positions selected by the argsort cell; argsort is ascending,
# so within each row the highest-similarity index comes last.
most_similar_indices

array([[ 5523, 27142, 10288]])

In [17]:

# For each query text, look up the selected training rows and print them
# together with their similarity scores.
for row, query in enumerate([testing_data]):
    top_idx = most_similar_indices[row]
    matched_rows = train.values[top_idx]
    scores = cosine_similarities_new[row, top_idx]
    print(
        f"New Entry: {query}",
        f"Most Similar Entry: {matched_rows}",
        f"Cosine Similarity: {scores}",
        sep="\n",
    )
    

New Entry: Simply the BEST This was my wife and my second visit in just over a year and simply put its the best. Our hosts Fred and Rowena as usual were cordial and caring, respectful when need be and lively conversationests when invited. The acomidations are spotless and comfortable, a home away from.
Most Similar Entry: [[18 5 'jacstar351' 'Simply the Best!!! L.' "L'Hermitage Hotel" 5.0
  'Excellent'
  "['Pool', 'Fitness Centre with Gym / Workout Room', 'Free High Speed Internet (WiFi)', 'Bar/Lounge', 'Room service', 'Business Centre with Internet Access', 'Concierge', 'Conference Facilities', 'Dry Cleaning', 'Heated pool', 'Hot Tub', 'Laundry Service', 'Meeting rooms', 'Multilingual Staff', 'Non-smoking hotel', 'Outdoor pool', 'Pets Allowed ( Dog / Pet Friendly )', 'Wheelchair Access', 'Air conditioning', 'Refrigerator in room', 'Accessible rooms', 'Family Rooms', 'Kitchenette', 'Non-smoking rooms', 'Suites']"
  '788 Richards Street Vancouver British Columbia' 'Canada' ' ' ' ' ' ']


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np

# Repeat the similarity search on a 5,000-review random sample of the
# pre-merged dataset.
data = pd.read_csv('Raw/merged.csv')

# Draw the sample WITHOUT replacement so no review lands in the corpus twice
# (np.random.randint samples with replacement and produced duplicate rows).
# The size is capped so a file with fewer than 5,000 rows still works, and
# the global seed keeps the draw reproducible.
np.random.seed(29)
index = np.random.choice(len(data), size=min(5000, len(data)), replace=False)

dat = data.iloc[index]

#split into training and testing
train = dat.iloc[1:]
test = dat.iloc[0]

#cosine similarity task
# Initialize and fit the TF-IDF vectorizer, using feature made of words
vectorizer = TfidfVectorizer(analyzer='word')
tfidf_matrix = vectorizer.fit_transform(train['user_review'].values)

# Calculate cosine similarities for the original dataset
cosine_similarities = cosine_similarity(tfidf_matrix)

# Re-encode the query with the vectorizer fitted just above and score it
# against THIS sample's matrix. Reusing `cosine_similarities_new` from the
# earlier cell would yield column positions of the previous training set,
# which do not correspond to rows of the re-sampled `train` (and can exceed
# its length). `testing_data` is the query string defined in an earlier cell.
new_tfidf_matrix = vectorizer.transform([testing_data])
cosine_similarities_new = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

# Position (within `train`) of the single best match for each query row.
most_similar_indices = np.argmax(cosine_similarities_new, axis=1)

# Display the most similar entries from the original dataset for each entry in the new dataset
for i, new_entry in enumerate([testing_data]):
    most_similar_index = most_similar_indices[i]
    most_similar_entry = train.values[most_similar_index]
    print(f"New Entry: {new_entry}")
    print(f"Most Similar Entry: {most_similar_entry}")
    print(f"Cosine Similarity: {cosine_similarities_new[i, most_similar_index]}")
    