In [1]:
import pandas as pd

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(keywords1, keywords2):
    """
    Calculates the cosine similarity between two sets of keywords using TF-IDF.

    Args:
        keywords1: The keywords for the first website, either as a list of
            strings or as one already-joined string.
        keywords2: The keywords for the second website, in either form.

    Returns:
        The cosine similarity score (a float between 0 and 1). Returns 0.0
        when either side contains no keywords at all.
    """

    def _as_document(keywords):
        # A bare string is already a whole document. Joining it would put a
        # space between every character, and TfidfVectorizer's default token
        # pattern ignores single-character tokens.
        if isinstance(keywords, str):
            return keywords
        return " ".join(keywords)

    # One document per website.
    documents = [_as_document(keywords1), _as_document(keywords2)]

    # A blank document shares no terms with anything, so the score is 0.
    # Returning early also avoids the "empty vocabulary" ValueError that
    # fit_transform raises when there is nothing to tokenize.
    if any(not document.strip() for document in documents):
        return 0.0

    # Tokenize, count term frequencies and weight them by IDF in one step.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Row 0 is the first website, row 1 the second; the slices keep each row
    # 2D as cosine_similarity expects, and [0][0] unwraps the 1x1 result.
    return float(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0])


# Score the two keyword sets with TF-IDF cosine similarity and print the result.
# NOTE(review): `your_keywords` and `competitor_keywords` are not assigned in any
# cell visible here — confirm they are defined before this cell runs.
similarity = calculate_cosine_similarity(your_keywords, competitor_keywords)
print(f"Cosine Similarity: {similarity}")




Cosine Similarity: 0.11234277891542774
Cosine Similarity (cleaned): 0.13328575937887677


attractions, dubai, city, travel, guide, trip, desert, visit, world, best, enjoy, restaurants, experience, book, traditional, activities, free, things, malls, find, need, hotels, offers, take, international

dubai restaurants al burj restaurant best khalifa water dining shopping creek difc visa summer arab activities food people things home park polo family festival near

In [7]:
# Comma-separated keyword string; a later cell passes it as the {keywords} input
# when invoking the prompt chain.
keywords = 'attractions, dubai, city, travel, guide, trip, desert, visit, world, best, enjoy, restaurants, experience, book, traditional, activities, free, things, malls, find, need, hotels, offers, take, international'

In [3]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [4]:
# Read the Gemini API key from the process environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # os.environ only accepts str values, so assigning a missing (None) key below
    # would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to your .env file before running this cell."
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model shared by the prompt chains defined in later cells.
llm = ChatGoogleGenerativeAI(model = "gemini-2.0-flash")

In [22]:
# Prompt asking the model to write a hypothetical document from a keyword string;
# {keywords} is the only template variable and is filled in at invoke time.
template = """ 
You are an AI assitant, your job is to read input keywords from SEO sites, then generate a Hypothetical document from the given 
keywords {keywords}
Note: give a straight forward answer without adding extra words, started immediately with the Hypothetical document
"""
# Output parser placed at the end of the chain (the cell output below is a plain string).
output_parser = StrOutputParser()
prompt = PromptTemplate.from_template(template)
# Compose prompt -> llm -> parser with the `|` operator; later cells reuse `chain`.
chain = prompt | llm | output_parser

# Run the chain on `keywords`; the bare `response` on the last line displays it.
response = chain.invoke({"keywords":keywords})
response

"## Dubai: Your Ultimate Travel Guide\n\n**Discover the Best of Dubai: Attractions, Activities & Experiences**\n\n**Planning Your Trip to Dubai:**\n\n*   **Why Visit Dubai?** A world-class city offering a unique blend of traditional culture and modern marvels.\n*   **Best Time to Travel:** Enjoy Dubai's pleasant weather during the winter months.\n*   **Booking Your Trip:** Find the best hotels and flight offers for your budget.\n\n**Must-See Attractions:**\n\n*   **Iconic Landmarks:** Burj Khalifa, Dubai Frame, The Palm Jumeirah.\n*   **Cultural Experiences:** Explore traditional souks, visit historical neighborhoods.\n*   **Desert Adventures:** Take a thrilling desert safari and experience Bedouin culture.\n\n**Things To Do in Dubai:**\n\n*   **Activities for Everyone:** From water parks to theme parks, Dubai has it all.\n*   **Free Things to Do:** Explore beaches, parks, and public art installations.\n*   **Shopping Paradise:** Discover world-class malls and international brands.\n* 

In [18]:
# Space-separated keyword string used as the {keywords} input for the second
# chain invocation.
keywords1 = 'dubai restaurants al burj restaurant best khalifa water dining shopping creek difc visa summer arab activities food people things home park polo family festival near'

In [23]:

# Generate the hypothetical document for `keywords1`, reusing the notebook-level
# `chain`; the bare `response1` on the last line displays the result.
response1 = chain.invoke({"keywords":keywords1})
response1

"## Dubai: Your Guide to Food, Fun, and Festivities\n\n**Restaurants:**\n\n*   **Al Burj Restaurant:** Experience unparalleled dining with breathtaking Khalifa views.\n*   **Best Restaurants:** Discover Dubai's diverse culinary scene, from traditional Arab food to international flavors.\n*   **Water Dining:** Enjoy unique dining experiences on the Creek or along the coast.\n*   **DIFC Restaurants:** Explore upscale dining options in Dubai's financial hub.\n\n**Activities & Things To Do:**\n\n*   **Summer Activities:** Beat the heat with indoor attractions and water parks.\n*   **Arab Activities:** Immerse yourself in local culture with traditional experiences.\n*   **Things to Do:** Explore Dubai's iconic landmarks and hidden gems.\n*   **Family Activities:** Create lasting memories with family-friendly attractions.\n*   **Festival:** Catch the latest Dubai shopping festival or food festival.\n*   **Near Me:** Find the best activities and restaurants near you.\n*   **Park:** Relax and 

In [24]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# reuse a model instead of constructing it again on every comparison.
_SBERT_MODELS = {}


def calculate_paragraph_similarity(paragraph1, paragraph2, model_name="all-MiniLM-L6-v2"):
    """
    Calculates the cosine similarity between two paragraphs using Sentence Transformers.

    Args:
        paragraph1: The first paragraph (a string).
        paragraph2: The second paragraph (a string).
        model_name: Name of the pre-trained Sentence Transformer model to use.
            Defaults to "all-MiniLM-L6-v2"; other models are listed at
            https://www.sbert.net/docs/pretrained_models.html

    Returns:
        The cosine similarity score as a float. Cosine similarity lies in
        [-1, 1]; values closer to 1 mean the paragraphs are more similar.
    """

    # Load the model once per name and keep it for later calls
    # (the weights may need to be downloaded on the first run).
    model = _SBERT_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SBERT_MODELS[model_name] = model

    # Encode the paragraphs into vectors.
    embedding1 = model.encode(paragraph1)
    embedding2 = model.encode(paragraph2)

    # cosine_similarity expects 2D inputs, so turn each vector into a single row.
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)

    # The result is a 1x1 matrix; unwrap it into a plain Python float.
    return float(cosine_similarity(embedding1, embedding2)[0][0])



# Compare the two LLM-generated documents by embedding similarity and print the score.
# Note: this rebinds the notebook-level `similarity` assigned by the TF-IDF cell.
similarity = calculate_paragraph_similarity(response, response1)
print(f"Paragraph Similarity (SBERT): {similarity}")

Paragraph Similarity (SBERT): 0.9074810743331909


In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Read the Gemini API key from the process environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # os.environ only accepts str values, so assigning a missing (None) key below
    # would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to your .env file before running this cell."
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model read as a global by the function defined below.
llm = ChatGoogleGenerativeAI(model = "gemini-2.0-flash")

def analyze_intent(keywords):
    """Ask the notebook-level `llm` to turn a set of keywords into one web search query.

    Args:
        keywords: Value substituted into the prompt's {keywords} slot.

    Returns:
        The model's reply after it has passed through StrOutputParser.
    """
    query_instructions = """ 
        You are a specialized query generator.  Given the following keywords, your sole task is to formulate a concise and effective search query for finding information on the web.  Return ONLY the search query itself.

        Keywords: {keywords}
    """
    # Prompt -> model -> parser, composed with the `|` operator.
    pipeline = PromptTemplate.from_template(query_instructions) | llm | StrOutputParser()
    return pipeline.invoke({"keywords": keywords})

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sample space-separated keyword string; the bare call on the last line makes the
# generated search query the cell's displayed output.
keywords1 = 'dubai restaurants al burj restaurant best khalifa water dining shopping creek difc visa summer arab activities food people things home park polo family festival near'
analyze_intent(keywords1)

'Dubai restaurants Burj Khalifa Creek DIFC summer activities visa family'

In [7]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Read the Gemini API key from the process environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # os.environ only accepts str values, so assigning a missing (None) key below
    # would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to your .env file before running this cell."
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model read as a global by the function defined below.
llm = ChatGoogleGenerativeAI(model = "gemini-2.0-flash")

def analyze_intent(keywords):
    """Ask the notebook-level `llm` for the primary intent of a page, given its keywords.

    Args:
        keywords: Value substituted into the prompt's {keywords} slot.

    Returns:
        The model's reply after it has passed through StrOutputParser.
    """
    intent_instructions = """
        You are a specialized intent analyzer. Given the following keywords extracted from a website, 
        your SOLE TASK is to identify the PRIMARY INTENT of the webpage.

        Keywords: {keywords}
    """
    intent_prompt = PromptTemplate.from_template(intent_instructions)
    # Prompt -> model -> parser, composed with the `|` operator.
    pipeline = intent_prompt | llm | StrOutputParser()
    return pipeline.invoke({"keywords": keywords})

In [8]:
# Run the intent-analysis prompt on `keywords1` (assigned in an earlier cell);
# as the cell's last expression, the returned string is displayed as output.
analyze_intent(keywords1)

'The primary intent of the webpage is to provide information and recommendations related to **tourism and leisure activities in Dubai, with a focus on dining and attractions like the Burj Khalifa.**'

In [11]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Read the Gemini API key from the process environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # os.environ only accepts str values, so assigning a missing (None) key below
    # would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to your .env file before running this cell."
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model read as a global by the function defined below.
llm = ChatGoogleGenerativeAI(model = "gemini-2.0-flash")

def analyze_intent(keywords):
    """Ask the notebook-level `llm` for one broad, balanced web search query covering the keywords.

    Args:
        keywords: Value substituted into the prompt's {keywords} slot.

    Returns:
        The model's reply after it has passed through StrOutputParser.
    """
    balanced_query_instructions = """ 
        You are a specialized query generator. Given the following keywords, 
        your sole task is to formulate a concise and effective search query for finding comprehensive information on the web,
        covering a broad range of topics mentioned.  Do not overly focus on one specific topic; aim for a balanced and general query.
        Return ONLY the search query itself.

        Keywords: {keywords}
    """
    # Build the three stages, then join them with the `|` operator.
    stages = (
        PromptTemplate.from_template(balanced_query_instructions),
        llm,
        StrOutputParser(),
    )
    pipeline = stages[0] | stages[1] | stages[2]
    return pipeline.invoke({"keywords": keywords})

# Example Usage (Testing):
keywords = "dubai restaurants al burj restaurant best khalifa water dining shopping creek difc visa summer arab activities food people things home park polo family festival near"
query = analyze_intent(keywords)
print(f"Generated Search Query: {query}")

Generated Search Query: Dubai travel guide: restaurants, activities, shopping, and lifestyle


In [15]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Read the Gemini API key from the process environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # os.environ only accepts str values, so assigning a missing (None) key below
    # would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to your .env file before running this cell."
    )
os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model read as a global by the function defined below.
llm = ChatGoogleGenerativeAI(model = "gemini-2.0-flash")

def find_missing_topic(keywords):
    """Ask the notebook-level `llm` which topics a page may be missing, given its keywords.

    Args:
        keywords: Value substituted into the prompt's {keywords} slot.

    Returns:
        The model's reply after it has passed through StrOutputParser.
    """
    gap_instructions = """
        You are a specialized topic analyzer. Given the following keywords extracted from a website, 
        your SOLE TASK is to identify the POTENTIAL MISSING TOPICS of a webpage. 
        Return ONLY the identified POTENTIAL MISSING TOPICS.

        Keywords: {keywords}
    """
    gap_prompt = PromptTemplate.from_template(gap_instructions)
    # Prompt -> model -> parser, composed with the `|` operator.
    return (gap_prompt | llm | StrOutputParser()).invoke({"keywords": keywords})

In [17]:
# Comma-separated keyword string (rebinds the notebook-level `keywords1`); the bare
# call on the last line makes the model's reply the cell's displayed output.
keywords1 = 'dubai, restaurants, al, burj, restaurant, best, khalifa, water, dining, shopping, creek, difc, visa, summer, arab, activities, food, people, things, home, park, polo, family, festival, near'
find_missing_topic(keywords1)

'*   **Specific Cuisine Types:** (e.g., Indian, Italian, Chinese, Seafood, Emirati) - The keywords mention "food" but lack specifics.\n*   **Price Ranges:** (e.g., Budget-friendly, Mid-range, Fine Dining) - Helps users filter based on their budget.\n*   **Restaurant Ambiance/Atmosphere:** (e.g., Romantic, Lively, Casual, Business)\n*   **Dress Code:** Knowing the expected attire is useful for planning.\n*   **Reservation Information:** How to book a table, cancellation policies.\n*   **Menu Highlights/Signature Dishes:** Enticing descriptions of what to eat.\n*   **Dietary Options:** (e.g., Vegetarian, Vegan, Gluten-Free, Halal)\n*   **Deals and Promotions:** (e.g., Happy Hour, Brunch Deals, Set Menus)\n*   **Customer Reviews/Ratings:** Social proof and different perspectives.\n*   **Transportation/Parking:** How to get to the restaurants, parking availability.\n*   **Specific Locations/Areas:** Beyond just "Dubai," more granular location details.\n*   **Events and Entertainment:** Liv