In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np

In [32]:
data = pd.read_csv('Raw/merged.csv')
data.shape
np.random.seed(29)
index = np.random.randint(low=0, high=len(data), size=5000)

In [33]:
dat = data.iloc[index]
# Combine features into a single feature for TF-IDF
dat['combined_features'] = dat['hotel_experience'] + ' ' + dat['user_experience'] + ' ' + dat['user_review']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat['combined_features'] = dat['hotel_experience'] + ' ' + dat['user_experience'] + ' ' + dat['user_review']


In [36]:
# Drop rows with missing values in the combined_features column
dat = dat.dropna(subset=['combined_features'])

In [37]:
#split into training and testing
train = dat.iloc[1: len(dat)]
test = dat.iloc[0]

In [38]:
#cosine similarity task
# Initialize and fit the TF-IDF vectorizer, using feature made of words
vectorizer = TfidfVectorizer(analyzer='word')
tfidf_matrix = vectorizer.fit_transform(train['combined_features'].values)

# Calculate cosine similarities for the original dataset
cosine_similarities = cosine_similarity(tfidf_matrix)

In [39]:
#Save the trained vectorizer for future use
with open('vectorizer_test.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [46]:
# Load the previously trained vectorizer
with open('vectorizer_test.pkl', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)


testing_data = 'Lovely place'
# Fetch the corresponding hotel_experience and user_experience values
new_hotel_experience = 'Excellent'
new_user_experience = 'outstanding'

# Create the combined feature for the new entry
new_entry_combined = f"{new_hotel_experience} {new_user_experience} {testing_data}"

# Transform new text reviews into TF-IDF vectors using the existing vocabulary
new_tfidf_matrix = loaded_vectorizer.transform([testing_data])

# Calculate cosine similarities between new and old data
cosine_similarities_new = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

In [47]:
most_similar_indices = np.argmax(cosine_similarities_new, axis=1)

In [48]:
#most_similar_indices = np.argmax(cosine_similarities_new, axis=1)

# Display the most similar entries from the original dataset for each entry in the new dataset
for i, new_entry in enumerate([testing_data]):
    most_similar_index = most_similar_indices[i]
    most_similar_entry = train.values[most_similar_index]
    print(f"New Entry: {new_entry}")
    print(f"Most Similar Entry: {most_similar_entry}")
    print(f"Cosine Similarity: {cosine_similarities_new[i, most_similar_index]}")
    

New Entry: Lovely place
Most Similar Entry: [18 "L'Hermitage Hotel" '5' 'Excellent'
 "['pool', 'fitness centre with gym / workout room', 'free high speed internet (wifi)', 'bar/lounge', 'room service', 'business centre with internet access', 'concierge', 'conference facilities', 'dry cleaning', 'heated pool', 'hot tub', 'laundry service', 'meeting rooms', 'multilingual staff', 'non-smoking hotel', 'outdoor pool', 'pets allowed ( dog / pet friendly )', 'wheelchair access', 'air conditioning', 'refrigerator in room', 'accessible rooms', 'family rooms', 'kitchenette', 'non-smoking rooms', 'suites']"
 '788 Richards Street Vancouver British Columbia' 'Canada' 5 'Outstanding'
 'Govinda R'
 'Lovely Place Its a lovely place to enjoy with the family, in one of the best areas of Vancouver, close to Library, with the best restaurants around, the BC Place and the rogers arena, the best choice for any sports fan.'
 'Nov. 2018' ' November 2018'
 'Excellent Outstanding Lovely Place Its a lovely place