In [1]:
# Vector Database for Keywords
# This notebook creates a vector database for keyword search and similarity matching

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
from typing import List, Tuple
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Load and prepare the keywords data
def load_keywords_data(file_path: str) -> pd.DataFrame:
    """Load keywords data from CSV file"""
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} keywords from {file_path}")
        print(f"Columns: {list(df.columns)}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Load the keywords data
keywords_df = load_keywords_data('keywords_data.csv')
keywords_df.head()


Loaded 80157 keywords from keywords_data.csv
Columns: ['keyword', 'search_volume', 'competition', 'low_top_of_page_bid', 'high_top_of_page_bid', 'cpc', '2025_09', '2025_08', '2025_07', '2025_06', '2025_05', '2025_04', '2025_03', '2025_02', '2025_01', '2024_12', '2024_11', '2024_10', 'growth_slope', 'growth_r2', 'growth_consistency', 'growth_stability', 'sustained_growth_score', 'yoy_trend_%', '3month_trend_%']


Unnamed: 0,keyword,search_volume,competition,low_top_of_page_bid,high_top_of_page_bid,cpc,2025_09,2025_08,2025_07,2025_06,...,2024_12,2024_11,2024_10,growth_slope,growth_r2,growth_consistency,growth_stability,sustained_growth_score,yoy_trend_%,3month_trend_%
0,growth marketing agency,4400.0,8.0,0.25,1.49,1.83,6600.0,22200.0,1600.0,2400.0,...,1600.0,1900.0,1600.0,-568.714363,0.599418,0.090909,0.141573,-0.000784,-75.757576,0.0
1,caviar wholesale,390.0,92.0,0.59,3.17,2.5,590.0,390.0,320.0,260.0,...,720.0,390.0,590.0,22.65253,0.010215,0.454545,0.832011,0.000218,0.0,-18.055556
2,earbuds,1220000.0,100.0,0.03,0.92,0.32,1220000.0,1220000.0,1220000.0,1220000.0,...,1500000.0,1500000.0,1500000.0,30097.227314,0.563118,0.272727,1.0,0.003683,22.95082,0.0
3,therapy rates,6600.0,51.0,1.45,9.61,7.86,6600.0,6600.0,6600.0,6600.0,...,8100.0,9900.0,8100.0,217.091674,0.869349,0.454545,0.991116,0.011803,22.727273,0.0
4,classic tale,2400.0,8.0,0.05,0.45,0.06,2400.0,1900.0,1900.0,1900.0,...,2400.0,2400.0,2900.0,64.718032,0.241549,0.545455,1.0,0.003935,20.833333,20.833333


In [3]:
# Initialize the sentence transformer model
def initialize_model(model_name: str = 'all-MiniLM-L6-v2'):
    """Initialize the sentence transformer model for embeddings"""
    try:
        model = SentenceTransformer(model_name)
        print(f"Model '{model_name}' loaded successfully!")
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Initialize the model
model = initialize_model()
# Rename keyword column to Keyword
keywords_df = keywords_df.rename(columns={'keyword': 'Keyword'})
print("Renamed 'keyword' column to 'Keyword'")



Model 'all-MiniLM-L6-v2' loaded successfully!
Renamed 'keyword' column to 'Keyword'


In [4]:
# Vector Database Class
class KeywordVectorDatabase:
    """Vector database for keyword search and similarity matching"""
    
    def __init__(self, model, keywords_df: pd.DataFrame):
        self.model = model
        self.keywords_df = keywords_df
        self.keywords = keywords_df['Keyword'].tolist()
        self.embeddings = None
        self.database_path = '/home/valentin/Home/Pioneers/InnovationMachine/vector_database.pkl'
        
    def create_embeddings(self, save_to_disk: bool = True):
        """Create embeddings for all keywords"""
        print("Creating embeddings for keywords...")
        try:
            # Create embeddings for all keywords
            self.embeddings = self.model.encode(self.keywords)
            print(f"Created embeddings with shape: {self.embeddings.shape}")
            
            if save_to_disk:
                self.save_database()
                
            return self.embeddings
        except Exception as e:
            print(f"Error creating embeddings: {e}")
            return None
    
    def save_database(self):
        """Save the vector database to disk"""
        try:
            database = {
                'keywords': self.keywords,
                'embeddings': self.embeddings,
                'keywords_df': self.keywords_df
            }
            with open(self.database_path, 'wb') as f:
                pickle.dump(database, f)
            print(f"Database saved to {self.database_path}")
        except Exception as e:
            print(f"Error saving database: {e}")
    
    def load_database(self):
        """Load the vector database from disk"""
        try:
            if os.path.exists(self.database_path):
                with open(self.database_path, 'rb') as f:
                    database = pickle.load(f)
                self.keywords = database['keywords']
                self.embeddings = database['embeddings']
                self.keywords_df = database['keywords_df']
                print(f"Database loaded from {self.database_path}")
                return True
            else:
                print("No existing database found")
                return False
        except Exception as e:
            print(f"Error loading database: {e}")
            return False
    
    def search_similar_keywords(self, query: str, n: int = 10) -> pd.DataFrame:
        """Search for n most similar keywords to the query"""
        try:
            if self.embeddings is None:
                print("No embeddings found. Please create embeddings first.")
                return pd.DataFrame()
            
            # Create embedding for the query
            query_embedding = self.model.encode([query])
            
            # Calculate cosine similarities
            similarities = cosine_similarity(query_embedding, self.embeddings)[0]
            
            # Get top n most similar keywords
            top_indices = np.argsort(similarities)[::-1][:n]
            
            # Create results dataframe
            results = []
            for idx in top_indices:
                keyword_data = self.keywords_df.iloc[idx].copy()
                keyword_data['similarity_score'] = similarities[idx]
                results.append(keyword_data)
            
            results_df = pd.DataFrame(results)
            return results_df
            
        except Exception as e:
            print(f"Error searching keywords: {e}")
            return pd.DataFrame()

# Initialize the vector database
vector_db = KeywordVectorDatabase(model, keywords_df)


In [5]:
# Create or load the vector database
def setup_vector_database():
    """Setup the vector database - create embeddings or load from disk"""
    # Try to load existing database first
    if vector_db.load_database():
        print("Vector database loaded from disk")
        return vector_db

    print("Creating new vector database...")
    vector_db.create_embeddings()
    
    return vector_db

# Setup the database
db = setup_vector_database()


No existing database found
Creating new vector database...
Creating embeddings for keywords...
Created embeddings with shape: (80157, 384)
Database saved to /home/valentin/Home/Pioneers/InnovationMachine/vector_database.pkl


In [11]:
# Main function to retrieve most relevant keywords
def get_most_relevant_keywords(text_query: str, n: int = 10) -> pd.DataFrame:
    """
    Retrieve the n most relevant keywords for a given text string
    
    Args:
        text_query (str): The text string to search for similar keywords
        n (int): Number of most relevant keywords to return (default: 10)
    
    Returns:
        pd.DataFrame: DataFrame containing the most relevant keywords with similarity scores
    """
    try:
        results = db.search_similar_keywords(text_query, n)
        if not results.empty:
            print(f"Found {len(results)} most relevant keywords for: '{text_query}'")
            # Display key columns for better readability
            display_cols = ['keyword', 'similarity_score', 'search_volume', 'competition', 'cpc']
            available_cols = [col for col in display_cols if col in results.columns]
            print(f"\nTop {n} most relevant keywords:")
            print(results[available_cols].to_string(index=False))
        else:
            print("No results found")
        return results
    except Exception as e:
        print(f"Error retrieving keywords: {e}")
        return pd.DataFrame()


In [12]:
# Test queries
test_queries = [
    "artificial intelligence machine learning",
    "health nutrition fitness",
    "business marketing strategy",
    "technology innovation startup",
    "data analytics insights"
]

for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 50)
    results = get_most_relevant_keywords(query, n=5)
    print()



Query: 'artificial intelligence machine learning'
--------------------------------------------------
Found 5 most relevant keywords for: 'artificial intelligence machine learning'

Top 5 most relevant keywords:
 similarity_score  search_volume  competition  cpc
         0.832354          260.0         13.0 4.98
         0.766250         2400.0         13.0 2.25
         0.739939         1900.0          7.0 6.49
         0.721266         1600.0         15.0 8.19
         0.657029         3600.0         14.0 2.46


Query: 'health nutrition fitness'
--------------------------------------------------
Found 5 most relevant keywords for: 'health nutrition fitness'

Top 5 most relevant keywords:
 similarity_score  search_volume  competition   cpc
         0.919409         4400.0         79.0  1.79
         0.730558        33100.0          4.0  2.05
         0.723464           20.0         42.0 19.57
         0.717951         2900.0         10.0  1.43
         0.708035         1300.0         

In [13]:
# Additional utility functions
def get_keyword_info(keyword: str) -> dict:
    """Get detailed information about a specific keyword"""
    try:
        keyword_data = db.keywords_df[db.keywords_df['keyword'] == keyword]
        if not keyword_data.empty:
            return keyword_data.iloc[0].to_dict()
        else:
            return {"error": f"Keyword '{keyword}' not found"}
    except Exception as e:
        return {"error": f"Error retrieving keyword info: {e}"}

def get_database_stats() -> dict:
    """Get statistics about the vector database"""
    try:
        stats = {
            "total_keywords": len(db.keywords),
            "embedding_dimension": db.embeddings.shape[1] if db.embeddings is not None else 0,
            "database_size_mb": os.path.getsize(db.database_path) / (1024 * 1024) if os.path.exists(db.database_path) else 0
        }
        return stats
    except Exception as e:
        return {"error": f"Error getting stats: {e}"}

# Display database statistics
print("Vector Database Statistics:")
print("=" * 30)
stats = get_database_stats()
for key, value in stats.items():
    print(f"{key}: {value}")


Vector Database Statistics:
total_keywords: 80157
embedding_dimension: 384
database_size_mb: 134.032696723938


In [14]:
# Example usage and documentation
print("\n" + "="*60)
print("VECTOR DATABASE USAGE EXAMPLES")
print("="*60)

print("""
# Basic usage:
results = get_most_relevant_keywords("your search query", n=10)

# Get specific keyword information:
keyword_info = get_keyword_info("specific keyword")

# Get database statistics:
stats = get_database_stats()

# The main function returns a pandas DataFrame with:
# - keyword: The keyword text
# - similarity_score: Cosine similarity score (0-1)
# - search_volume: Search volume data
# - competition: Competition level
# - cpc: Cost per click
# - All other columns from the original keywords data
""")

print("\nVector database setup complete! ðŸš€")
print("You can now use get_most_relevant_keywords(query, n) to find similar keywords.")



VECTOR DATABASE USAGE EXAMPLES

# Basic usage:
results = get_most_relevant_keywords("your search query", n=10)

# Get specific keyword information:
keyword_info = get_keyword_info("specific keyword")

# Get database statistics:
stats = get_database_stats()

# The main function returns a pandas DataFrame with:
# - keyword: The keyword text
# - similarity_score: Cosine similarity score (0-1)
# - search_volume: Search volume data
# - competition: Competition level
# - cpc: Cost per click
# - All other columns from the original keywords data


Vector database setup complete! ðŸš€
You can now use get_most_relevant_keywords(query, n) to find similar keywords.


In [16]:
query = """
cat nutrition
"""

get_most_relevant_keywords(query, n=100).sort_values(by='search_volume', ascending=False).head(20)

Found 100 most relevant keywords for: '
cat nutrition
'

Top 100 most relevant keywords:
 similarity_score  search_volume  competition   cpc
         0.784641          880.0         14.0  0.26
         0.705592        74000.0         16.0  1.97
         0.668814       110000.0         97.0  1.61
         0.659846          170.0         12.0  0.63
         0.650054         1000.0         45.0  2.42
         0.636870         1600.0         84.0  1.62
         0.614300         9900.0        100.0  2.39
         0.599223         1600.0         16.0  2.08
         0.592155          480.0        100.0  2.46
         0.589018       135000.0         77.0  4.47
         0.576854        14800.0         44.0  0.43
         0.557217        74000.0         64.0  9.31
         0.555190          590.0        100.0  0.70
         0.540150         1900.0         98.0  0.74
         0.540135         6600.0         39.0  0.81
         0.539305        33100.0          3.0  0.56
         0.538581        60

Unnamed: 0,Keyword,search_volume,competition,low_top_of_page_bid,high_top_of_page_bid,cpc,2025_09,2025_08,2025_07,2025_06,...,2024_11,2024_10,growth_slope,growth_r2,growth_consistency,growth_stability,sustained_growth_score,yoy_trend_%,3month_trend_%,similarity_score
64764,cat care routines,246000.0,100.0,0.1,0.87,0.53,246000.0,368000.0,450000.0,368000.0,...,246000.0,201000.0,-9303.567338,0.390302,0.272727,0.616312,-0.002146,-18.292683,-33.222591,0.512585
21637,cat agility steps,201000.0,100.0,0.4,2.46,0.99,201000.0,201000.0,165000.0,165000.0,...,246000.0,246000.0,5535.805429,0.497693,0.636364,1.0,0.008772,22.38806,22.38806,0.488597
6142,nutrient supplements,201000.0,71.0,0.06,0.44,0.75,165000.0,201000.0,246000.0,165000.0,...,368000.0,165000.0,958.622506,0.02149,0.454545,0.785912,3.6e-05,0.0,22.222222,0.527618
18087,cat beds,135000.0,100.0,0.18,1.07,0.65,110000.0,110000.0,110000.0,110000.0,...,165000.0,135000.0,5334.284024,0.538491,0.454545,0.986309,0.011091,22.727273,-18.181818,0.487922
20026,pet food,135000.0,77.0,0.29,9.92,4.47,110000.0,135000.0,90500.0,135000.0,...,165000.0,135000.0,3356.343007,0.871544,0.636364,0.959641,0.013375,22.727273,-18.181818,0.589018
74967,nutritional data platform,135000.0,100.0,0.4,4.25,1.6,165000.0,135000.0,135000.0,110000.0,...,110000.0,135000.0,-1040.000009,0.629054,0.272727,0.741483,-0.001024,-18.181818,22.727273,0.461914
23168,kitten food,110000.0,97.0,0.51,4.4,1.61,135000.0,135000.0,165000.0,165000.0,...,60500.0,90500.0,-8517.855008,0.737284,0.363636,0.49219,-0.009588,-32.962963,49.586777,0.668814
60024,animal feeding equipment,90500.0,90.0,0.04,0.95,0.19,90500.0,90500.0,74000.0,74000.0,...,90500.0,90500.0,959.794098,0.095958,0.363636,0.951131,0.000388,0.0,0.0,0.455414
45212,essential nutrients,90500.0,100.0,0.12,1.28,0.19,90500.0,90500.0,90500.0,74000.0,...,110000.0,110000.0,2113.77068,0.157586,0.636364,1.0,0.002384,21.546961,21.546961,0.457215
49178,organic nutrients,74000.0,80.0,0.15,0.88,0.7,3600.0,3600.0,9900.0,49500.0,...,2400.0,2400.0,-6603.963818,0.246354,0.363636,0.220926,-0.002037,-33.333333,26.315789,0.449238
