In [24]:
#import pandas and numpy
import pandas as pd
import numpy as np

In [25]:
dataset = pd.read_csv("dataset.csv")

In [26]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Destination  30 non-null     object
 1   Description  30 non-null     object
dtypes: object(2)
memory usage: 608.0+ bytes


In [27]:
#We can compute the similarity between travel descriptions using TfidfVectorizer

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
dataset["Description"] = dataset["Description"].fillna("")

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(dataset["Description"])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(30, 124)

In [28]:
#We will use this matrix to calculate the similarity score with linear kernel, cosine similarity and sigmoid score.
#content Based Filtering/ and collabrative filtering

# Import linear_kernel, cosine_similarity, and sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the cosine similarity matrix
linear = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
sig_score = sigmoid_kernel(tfidf_matrix, tfidf_matrix)

In [33]:
import pickle

with open("linear_model.pkl" , 'wb') as f:
    pickle.dump(linear, f)

In [29]:
#This matrix represents each product category's similarity score with every other product category.

#We need to define a function that takes the product name as an input and outputs a list of the 10 most similar products.
#For this we need a reverse mapping of products and DataFrame indices. This means we need a mechanism to identify the index of 
#a product in our DataFrame.

#Construct a reverse map of indices and product names
indices = pd.Series(dataset.index, index=dataset["Destination"])

In [30]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

choices = list(indices.index)

In [31]:
extracted = process.extract("lego", choices, limit=1)
extracted[0][0]

'Trincomalee Beach'

In [10]:
# Function that takes in prompt as input and outputs most similar product
def rec_lin(user_input, linear=linear):
    
    # use fuzzywuzzy to grab the product with name closest to user input
    extracted = process.extract(user_input, choices, limit=1)
    destination = extracted[0][0]
    
    # Get the index of the product that matches the product name
    idx = indices[destination]

    # Get the pairwise similarity scores
    sim_scores = list(enumerate(linear[idx]))

    # Sort the products based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar products
    sim_scores = sim_scores[1:11]

    # Get the product indices
    product_indices = [i[0] for i in sim_scores]

    df_return = dataset[["Destination"]].loc[product_indices]
    # Return the top 10 most similar products
    return df_return

In [23]:
rec_lin("I like elephants")

Unnamed: 0,Destination
1,Yala National Park
10,Uda walawa National Park
8,Wasgamuwa national Park
16,Minneriya National Park
0,Adam's Peak
2,Abalangoda Beach
3,Arugam Bay
4,Batticaloa Beach
5,Bundala
7,Weligama Beach


In [21]:
import pickle 
pickle.dumps(linear)
pickle.dumps(sig_score)
pickle.dumps(cosine_similarity)

b'\x80\x04\x952\x00\x00\x00\x00\x00\x00\x00\x8c\x18sklearn.metrics.pairwise\x94\x8c\x11cosine_similarity\x94\x93\x94.'