In [6]:
import pandas as pd
import re
import unicodedata
import nltk
import string
nltk.download('punkt') # got an error earlier, this is how it was resolved.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load the Jamaica tourism posts from the CSV one directory above the working directory.
ja_df = pd.read_csv(filepath_or_buffer='../jamaica_tourism_data.csv')
# Uncomment to lift pandas' column/row display limits while inspecting the frame:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
print(ja_df.head(n=5))

                                               title  \
0          Read this if you’re traveling to Negril!!   
1                                 Negril Trip Report   
2                          Wonderful trip to Jamaica   
3                                     Negril beaches   
4  What I wish I new about getting around in Jamaica   

                                            selftext  score  num_comments  \
0  I recently stayed at Coco La Palm in Negril, a...     72            29   
1  (Leaving this report in hopes of helping someo...     35            30   
2                                         Beautiful.     34             4   
3                                                NaN     29             5   
4  This is what I learned about getting around in...     27             9   

                                                 url  
0  https://www.reddit.com/r/JamaicaTourism/commen...  
1  https://www.reddit.com/r/JamaicaTourism/commen...  
2             https://www.reddit.co

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tahjg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Show full cell contents instead of truncating long strings in displayed frames.
pd.set_option("display.max_colwidth", None)
# Manual inspection helpers, left disabled:
#ja_df.title.tolist()
#ja_df.selftext

# Cleaning Data

In [18]:
# Clean the text so that we don't feed dirty data to the model; reduce noise as best as possible.

# Translation table for ASCII punctuation: every mark becomes a space so that words
# joined only by punctuation ("beach/pool", "bars,clubs") stay separate tokens.
# Apostrophes are deleted instead, so contractions stay one word ("don't" -> "dont").
_PUNCT_TABLE = {ord(ch): ' ' for ch in string.punctuation}
_PUNCT_TABLE[ord("'")] = None

# Matches http:// and https:// links as well as bare "www." links.
_URL_RE = re.compile(r'(?:https?://|\bwww\.)\S+', re.IGNORECASE)

# Unicode hyphens/dashes (U+2010..U+2015) have no ASCII decomposition, so the ASCII
# fold below would silently delete them and glue the neighbouring words together.
_DASH_RE = re.compile('[\u2010-\u2015]')


def clean_text(text):
    """Normalize a raw post title/body into lowercase, ASCII-only, punctuation-free text.

    Steps, in order: NFKD-normalize, strip URLs, turn Unicode dashes into spaces,
    drop remaining non-ASCII characters (accents, curly quotes, emoji), replace ASCII
    punctuation with spaces (apostrophes are removed), lowercase, and collapse every
    run of whitespace (including line breaks) into a single space.

    Args:
        text: The raw value of a text cell. Missing values (NaN/None/pd.NA) are
            accepted; other non-string scalars are converted with ``str()``.

    Returns:
        str: The cleaned text, or "" for missing input.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # e.g. a cell pandas parsed as a number; the original crashed on these.
        text = str(text)

    # Decompose accented/compatibility characters so their ASCII base survives the fold.
    text = unicodedata.normalize('NFKD', text)
    # Remove URLs before punctuation handling so they disappear as a whole.
    text = _URL_RE.sub(' ', text)
    text = _DASH_RE.sub(' ', text)
    # ASCII fold: anything still non-ASCII (combining accents, curly quotes, emoji) is dropped.
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = text.translate(_PUNCT_TABLE)

    # split()/join collapses all whitespace, including newlines, and trims both ends.
    return ' '.join(text.lower().split())

In [19]:
# Tokenization is the process of splitting text into smaller units (words, sentences, or subwords).
# This helps the model understand individual words rather than treating the whole text as one big chunk.

# Cache for the English stop-word set. It is filled on the first call rather than at
# definition time because the stopwords corpus is only downloaded in a later cell.
_ENGLISH_STOP_WORDS = None


def standardize_text(text, stop_words=None):
    """Tokenize text into lowercase word tokens and drop stop words.

    Args:
        text: The (already cleaned) string to tokenize.
        stop_words: Optional collection of lowercase words to remove. When omitted,
            NLTK's English stop-word list is used; it is loaded once and reused on
            later calls instead of being rebuilt for every row.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    # Lowercase before tokenizing so the stop-word comparison is case-insensitive.
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word not in stop_words]
    

In [20]:
# Stemming collapses each word to its root form so that related words are treated alike,
# letting the model generalize instead of seeing "running" and "run" as two different words.
def stem_tokens(tokens):
    """Return a new list holding the Porter stem of every token, in the original order."""
    porter = PorterStemmer()
    stemmed = []
    for token in tokens:
        stemmed.append(porter.stem(token))
    return stemmed


In [21]:
# Count missing values per column (this cell sits before the dropna cell).
ja_df.isnull().sum()

title           0
selftext        0
score           0
num_comments    0
url             0
dtype: int64

In [22]:
# Inspect the dtype pandas inferred for each column.
ja_df.dtypes

title           object
selftext        object
score            int64
num_comments     int64
url             object
dtype: object

In [23]:
#ja_df[ja_df.selftext.isna()]

In [24]:
# Drop posts that have no body text (NaN selftext). The index is not reset here,
# so row labels may have gaps afterwards; use positional access (.iloc) downstream.
ja_df = ja_df.dropna(subset=['selftext'])

In [25]:
# Re-check missing values per column after dropping rows without selftext.
ja_df.isna().sum()

title           0
selftext        0
score           0
num_comments    0
url             0
dtype: int64

In [26]:
# Fetch the NLTK data packages used by the preprocessing helpers. Re-running is
# harmless: nltk.download() just reports "already up-to-date" for installed packages.
import nltk
nltk.download('punkt')      # tokenizer data for word_tokenize
nltk.download('punkt_tab')  # tokenizer data for word_tokenize (presumably required by newer NLTK releases - confirm)
nltk.download('stopwords')  # word lists read by stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tahjg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tahjg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tahjg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Clean both free-text columns in place, then derive token columns from the cleaned body.
for text_column in ('selftext', 'title'):
    ja_df[text_column] = ja_df[text_column].apply(clean_text)

# 'tokens' = stop-word-filtered words of the body; 'clean_tokens' = their stems.
ja_df['tokens'] = ja_df['selftext'].apply(standardize_text)
ja_df['clean_tokens'] = ja_df['tokens'].apply(stem_tokens)


In [28]:
# Write the cleaned frame (including the token columns) to the current working
# directory; index=False keeps the row index out of the file.
ja_df.to_csv('cleaned_jamaica_tourism_data.csv', index=False)


In [305]:
#ja_df.selftext

# Anything after this cell is for experimenting, testing, and learning. Please ignore it.

# Creating Embeddings

In [243]:
from transformers import BertTokenizer, BertModel
import torch
# Turning title and comments into embeddings
# Load pre-trained BERT model and tokenizer ('bert-base-uncased' checkpoint).
# Both are module-level names that get_bert_embeddings below reads directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    """Embed `text` with the module-level BERT model.

    The text is tokenized (with truncation and padding enabled), run through the
    model without gradient tracking, and the last hidden layer is averaged over
    the token dimension. The squeezed result is returned as a numpy array.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average across tokens (dim=1), then drop singleton dimensions.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every title and every post body; each cell of the new
# '<column>_embeddings' columns holds the array returned for that row.
for source_column in ('title', 'selftext'):
    ja_df[f'{source_column}_embeddings'] = ja_df[source_column].apply(get_bert_embeddings)


In [245]:
# List the columns currently on the frame.
# NOTE(review): the recorded output includes 'tokens_string', which no visible cell
# creates - it presumably came from a deleted or out-of-order cell; confirm.
ja_df.columns

Index(['title', 'selftext', 'score', 'num_comments', 'url', 'tokens',
       'clean_tokens', 'title_embeddings', 'selftext_embeddings',
       'tokens_string'],
      dtype='object')

In [None]:
# Shapes of the embedding columns. Each column stores one array per row, so
# .values.shape reports only the number of rows, not the embedding width.
print(ja_df['title_embeddings'].values.shape)
print(ja_df['selftext_embeddings'].values.shape)
#print(dense_tfidf_matrix.shape)

(686,)
(686,)
(686, 4851)


In [257]:
# View each embeddings column as a single-column 2-D array of shape (n_rows, 1).
# NOTE(review): every cell still holds a whole embedding array, so this is not a
# numeric (n_rows, hidden_size) matrix; np.stack over the column would produce
# one if that is what the later concatenation needs - confirm intent.
title_embeddings_reshaped = ja_df['title_embeddings'].to_numpy().reshape(-1, 1)
selftext_embeddings_reshaped = ja_df['selftext_embeddings'].to_numpy().reshape(-1, 1)


In [None]:
# Confirm the reshaped arrays have one column each.
print(title_embeddings_reshaped.shape)
print(selftext_embeddings_reshaped.shape)
#print(dense_tfidf_matrix.shape)

(686, 1)
(686, 1)
(686, 4851)


In [None]:
import numpy as np

# Concatenate the embeddings for title, comments, and tokens
#combined_embeddings = np.concatenate([title_embeddings_reshaped, 
                                    #  selftext_embeddings_reshaped, 
                                    #  dense_tfidf_matrix], axis=1)


In [None]:
#print(combined_embeddings.shape)


(686, 4853)


In [None]:
# Ensure the embeddings are in numpy array format
#combined_embeddings = np.array(combined_embeddings)
## Didn't bother using the combined_embeddings. Maybe another time. The code below is where I created the embeddings I actually used.

# Make recommendations

In [361]:
import ollama
import numpy as np
import pandas as pd
import faiss
from transformers import BertTokenizer, BertModel
import torch

# Initialize BERT for embeddings ('bert-base-uncased' checkpoint).
# `tokenizer` and `model` are module-level names read by get_bert_embeddings below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    """Embed text with BERT by mean-pooling the last hidden layer over real tokens.

    Padding positions are excluded from the average via the attention mask, so a
    padded batch (list of strings) gets the same vector per text as embedding each
    string alone. For a single string nothing is padded and the result equals the
    plain mean over tokens.

    Args:
        text: A string, or a list of strings to embed as one padded batch.
            Inputs longer than 512 tokens are truncated.

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions squeezed
        out, i.e. a 1-D vector for a single string.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # (batch, tokens) -> (batch, tokens, 1) so it broadcasts over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()


# Generate embeddings for all posts
print("Generating embeddings...")
# Title and body are joined into one string per post; only its first 1000
# characters are embedded (truncates long posts).
ja_df['embedding'] = (
    ja_df['title'] + " " + ja_df['selftext'].fillna('')
).apply(lambda post_text: get_bert_embeddings(post_text[:1000]))

# Build FAISS index: stack the per-row vectors into one float32 matrix and add
# them to a flat L2 index, so index positions line up with ja_df row positions.
embeddings = np.stack(ja_df['embedding'].to_list()).astype('float32')
index = faiss.IndexFlatL2(embeddings.shape[-1])
index.add(embeddings)

def generate_llama_recommendation(query, top_k=3):
    """Answer a traveler's question using the most similar posts as LLM context.

    The query is embedded with get_bert_embeddings, the FAISS `index` is searched
    for the nearest posts, and their titles plus the first 500 characters of each
    body are placed in a prompt sent to the 'llama3.2' model through ollama.

    Args:
        query: The traveler's question.
        top_k: Maximum number of posts to retrieve as context (must be >= 1).
            If the index holds fewer posts, all of them are used.

    Returns:
        str: The text generated by the model.

    Raises:
        ValueError: If top_k is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # Get similar posts using FAISS
    query_embedding = get_bert_embeddings(query).astype('float32').reshape(1, -1)

    # Never ask for more neighbours than the index holds: FAISS fills the missing
    # slots with label -1, and ja_df.iloc[-1] would silently return the LAST post.
    k = min(top_k, index.ntotal)
    if k > 0:
        _, indices = index.search(query_embedding, k)
        hits = [int(idx) for idx in indices[0] if idx >= 0]
    else:
        hits = []

    # Prepare context for LLM (index positions map to ja_df row positions).
    context = "\n\n".join(
        f"Title: {ja_df.iloc[idx]['title']}\nContent: {ja_df.iloc[idx]['selftext'][:500]}"
        for idx in hits
    )
    
    # Generate with Llama 3
    prompt = f"""
    You are a Jamaica travel expert. Based on these posts, suggest 3 best options to answer the traveler's question.
    Start off wiht a greeting by saying "Wah Gwaan?"
     Whenever you're finished, say that this was developed by Tahj Gordon.
    Traveler's question: {query}
    
    Relevant posts:
    {context}
    
    Provide recommendations in this format:
    1. [Place/Activity]: [Brief description - why it's good for what they asked]
    2. [Place/Activity]: [Description]
    3. [Place/Activity]: [Description]
    
    Add practical tips if available.
    """
    
    response = ollama.generate(
        model='llama3.2',
        prompt=prompt,
        options={'temperature': 0.7}
    )
    
    return response['response']



Generating embeddings...


In [362]:
# Example run: retrieve similar posts for the question and print the model's answer.
query = "How will I drive in Jamaica?"
print(generate_llama_recommendation(query))


Wah Gwaan?

Based on your interest in driving around Jamaica, I've got three top recommendations to get you behind the wheel:

1. **Rent a car with an automatic transmission**: This is the most convenient option for navigating Jamaica's roads, especially if you're not comfortable with manual transmissions. Many major rental companies like Hertz, Avis, and Budget offer cars with automatics.
2. **Book a private driver or chauffeur**: If you prefer not to drive yourself, consider booking a private driver for your 5-day trip. This option allows you to sit back, relax, and enjoy the scenery while someone else handles the driving. You can find private drivers through companies like Island Car Rental or local tourism boards.
3. **Use public transportation**: Jamaica's public transportation system is affordable and accessible. For shorter trips, buses (called "buses" locally) are a great way to get around. You can also take taxis or ride-hailing services like Uber.

Practical tips:

* Make sur