In [28]:
# Vector Database for Keywords
# This notebook creates a vector database for keyword search and similarity matching

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
from typing import List, Tuple
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


Libraries imported successfully!


In [35]:
# Load and prepare the keywords data
def load_keywords_data(file_path: str) -> pd.DataFrame:
    """Load keywords data from CSV file"""
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} keywords from {file_path}")
        print(f"Columns: {list(df.columns)}")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Load the keywords data
keywords_df = load_keywords_data('unique_keywords.csv')
keywords_df.head()


Loaded 56130 keywords from unique_keywords.csv
Columns: ['Keyword']


Unnamed: 0,Keyword
0,cat brain booster
1,corporate startup partnerships
2,banjo for sale
3,student isolation
4,funeral monument pricing


In [36]:
# Initialize the sentence transformer model
def initialize_model(model_name: str = 'all-MiniLM-L6-v2'):
    """Initialize the sentence transformer model for embeddings"""
    try:
        model = SentenceTransformer(model_name)
        print(f"Model '{model_name}' loaded successfully!")
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Initialize the model
model = initialize_model()


Model 'all-MiniLM-L6-v2' loaded successfully!


In [37]:
# Vector Database Class
class KeywordVectorDatabase:
    """Vector database for keyword search and similarity matching"""
    
    def __init__(self, model, keywords_df: pd.DataFrame):
        self.model = model
        self.keywords_df = keywords_df
        self.keywords = keywords_df['Keyword'].tolist()
        self.embeddings = None
        self.database_path = '/home/valentin/Home/Pioneers/InnovationMachine/vector_database.pkl'
        
    def create_embeddings(self, save_to_disk: bool = True):
        """Create embeddings for all keywords"""
        print("Creating embeddings for keywords...")
        try:
            # Create embeddings for all keywords
            self.embeddings = self.model.encode(self.keywords)
            print(f"Created embeddings with shape: {self.embeddings.shape}")
            
            if save_to_disk:
                self.save_database()
                
            return self.embeddings
        except Exception as e:
            print(f"Error creating embeddings: {e}")
            return None
    
    def save_database(self):
        """Save the vector database to disk"""
        try:
            database = {
                'keywords': self.keywords,
                'embeddings': self.embeddings,
                'keywords_df': self.keywords_df
            }
            with open(self.database_path, 'wb') as f:
                pickle.dump(database, f)
            print(f"Database saved to {self.database_path}")
        except Exception as e:
            print(f"Error saving database: {e}")
    
    def load_database(self):
        """Load the vector database from disk"""
        try:
            if os.path.exists(self.database_path):
                with open(self.database_path, 'rb') as f:
                    database = pickle.load(f)
                self.keywords = database['keywords']
                self.embeddings = database['embeddings']
                self.keywords_df = database['keywords_df']
                print(f"Database loaded from {self.database_path}")
                return True
            else:
                print("No existing database found")
                return False
        except Exception as e:
            print(f"Error loading database: {e}")
            return False
    
    def search_similar_keywords(self, query: str, n: int = 10) -> pd.DataFrame:
        """Search for n most similar keywords to the query"""
        try:
            if self.embeddings is None:
                print("No embeddings found. Please create embeddings first.")
                return pd.DataFrame()
            
            # Create embedding for the query
            query_embedding = self.model.encode([query])
            
            # Calculate cosine similarities
            similarities = cosine_similarity(query_embedding, self.embeddings)[0]
            
            # Get top n most similar keywords
            top_indices = np.argsort(similarities)[::-1][:n]
            
            # Create results dataframe
            results = []
            for idx in top_indices:
                keyword_data = self.keywords_df.iloc[idx].copy()
                keyword_data['similarity_score'] = similarities[idx]
                results.append(keyword_data)
            
            results_df = pd.DataFrame(results)
            return results_df
            
        except Exception as e:
            print(f"Error searching keywords: {e}")
            return pd.DataFrame()

# Initialize the vector database
vector_db = KeywordVectorDatabase(model, keywords_df)


In [38]:
# Create or load the vector database
def setup_vector_database():
    """Setup the vector database - create embeddings or load from disk"""
    # Try to load existing database first
    print("Creating new vector database...")
    vector_db.create_embeddings()
    
    return vector_db

# Setup the database
db = setup_vector_database()


Creating new vector database...
Creating embeddings for keywords...
Created embeddings with shape: (56130, 384)
Database saved to /home/valentin/Home/Pioneers/InnovationMachine/vector_database.pkl


In [39]:
# Main function to retrieve most relevant keywords
def get_most_relevant_keywords(text_query: str, n: int = 10) -> pd.DataFrame:
    """
    Retrieve the n most relevant keywords for a given text string
    
    Args:
        text_query (str): The text string to search for similar keywords
        n (int): Number of most relevant keywords to return (default: 10)
    
    Returns:
        pd.DataFrame: DataFrame containing the most relevant keywords with similarity scores
    """
    try:
        results = db.search_similar_keywords(text_query, n)
        if not results.empty:
            print(f"Found {len(results)} most relevant keywords for: '{text_query}'")
            # Display key columns for better readability
            display_cols = ['keyword', 'similarity_score', 'search_volume', 'competition', 'cpc']
            available_cols = [col for col in display_cols if col in results.columns]
            print(f"\nTop {n} most relevant keywords:")
            print(results[available_cols].to_string(index=False))
        else:
            print("No results found")
        return results
    except Exception as e:
        print(f"Error retrieving keywords: {e}")
        return pd.DataFrame()

# Test the function with some sample queries
print("Testing the vector database with sample queries...")
print("=" * 60)


Testing the vector database with sample queries...


In [40]:
# Test queries
test_queries = [
    "artificial intelligence machine learning",
    "health nutrition fitness",
    "business marketing strategy",
    "technology innovation startup",
    "data analytics insights"
]

for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 50)
    results = get_most_relevant_keywords(query, n=5)
    print()



Query: 'artificial intelligence machine learning'
--------------------------------------------------
Found 5 most relevant keywords for: 'artificial intelligence machine learning'

Top 5 most relevant keywords:
 similarity_score
         0.832354
         0.721266
         0.672322
         0.657029
         0.652755


Query: 'health nutrition fitness'
--------------------------------------------------
Found 5 most relevant keywords for: 'health nutrition fitness'

Top 5 most relevant keywords:
 similarity_score
         0.730558
         0.723464
         0.717951
         0.713488
         0.702746


Query: 'business marketing strategy'
--------------------------------------------------
Found 5 most relevant keywords for: 'business marketing strategy'

Top 5 most relevant keywords:
 similarity_score
         0.925557
         0.835108
         0.788846
         0.784925
         0.771821


Query: 'technology innovation startup'
--------------------------------------------------
Foun

In [41]:
# Additional utility functions
def get_keyword_info(keyword: str) -> dict:
    """Get detailed information about a specific keyword"""
    try:
        keyword_data = db.keywords_df[db.keywords_df['keyword'] == keyword]
        if not keyword_data.empty:
            return keyword_data.iloc[0].to_dict()
        else:
            return {"error": f"Keyword '{keyword}' not found"}
    except Exception as e:
        return {"error": f"Error retrieving keyword info: {e}"}

def get_database_stats() -> dict:
    """Get statistics about the vector database"""
    try:
        stats = {
            "total_keywords": len(db.keywords),
            "embedding_dimension": db.embeddings.shape[1] if db.embeddings is not None else 0,
            "database_size_mb": os.path.getsize(db.database_path) / (1024 * 1024) if os.path.exists(db.database_path) else 0
        }
        return stats
    except Exception as e:
        return {"error": f"Error getting stats: {e}"}

# Display database statistics
print("Vector Database Statistics:")
print("=" * 30)
stats = get_database_stats()
for key, value in stats.items():
    print(f"{key}: {value}")


Vector Database Statistics:
total_keywords: 56130
embedding_dimension: 384
database_size_mb: 83.64491939544678


In [42]:
# Example usage and documentation
print("\n" + "="*60)
print("VECTOR DATABASE USAGE EXAMPLES")
print("="*60)

print("""
# Basic usage:
results = get_most_relevant_keywords("your search query", n=10)

# Get specific keyword information:
keyword_info = get_keyword_info("specific keyword")

# Get database statistics:
stats = get_database_stats()

# The main function returns a pandas DataFrame with:
# - keyword: The keyword text
# - similarity_score: Cosine similarity score (0-1)
# - search_volume: Search volume data
# - competition: Competition level
# - cpc: Cost per click
# - All other columns from the original keywords data
""")

print("\nVector database setup complete! ðŸš€")
print("You can now use get_most_relevant_keywords(query, n) to find similar keywords.")



VECTOR DATABASE USAGE EXAMPLES

# Basic usage:
results = get_most_relevant_keywords("your search query", n=10)

# Get specific keyword information:
keyword_info = get_keyword_info("specific keyword")

# Get database statistics:
stats = get_database_stats()

# The main function returns a pandas DataFrame with:
# - keyword: The keyword text
# - similarity_score: Cosine similarity score (0-1)
# - search_volume: Search volume data
# - competition: Competition level
# - cpc: Cost per click
# - All other columns from the original keywords data


Vector database setup complete! ðŸš€
You can now use get_most_relevant_keywords(query, n) to find similar keywords.


In [43]:
query = """
Build next apple watch
"""

get_most_relevant_keywords(query, n=10)

Found 10 most relevant keywords for: '
Build next apple watch
'

Top 10 most relevant keywords:
 similarity_score
         0.607947
         0.605693
         0.582254
         0.571385
         0.555391
         0.550626
         0.544932
         0.543916
         0.542133
         0.541360


Unnamed: 0,Keyword,similarity_score
20672,buy smartwatch,0.607947
39307,generation smartwatch,0.605693
36142,display for watches,0.582254
27003,custom watchmaker,0.571385
2351,watch repair,0.555391
16356,race smartwatch,0.550626
27083,gps running watch,0.544932
42761,running watch,0.543916
9164,running watch price,0.542133
25702,running watch sale,0.54136
