In [4]:
def load_word_list(words_list):
    """
    Build a lookup set from the supplied dictionary words.

    :param words_list: List of words that make up the dictionary
    :return: Set holding each distinct word once, for fast membership tests
    """
    return {*words_list}

def is_two_word_combination(input_string, word_set):
    """
    Report whether the input can be cut once into two entries of the word set.

    The input is split on whitespace. For every boundary between adjacent
    words, the text before and the text after the boundary (each re-joined
    with single spaces) are looked up in ``word_set``.

    :param input_string: String to check
    :param word_set: Set of valid words
    :return: True if some boundary yields two parts that are both in the set,
             otherwise False
    """
    tokens = input_string.split()
    token_count = len(tokens)

    # A single word (or nothing at all) has no boundary to cut at.
    if token_count < 2:
        return False

    return any(
        ' '.join(tokens[:cut]) in word_set and ' '.join(tokens[cut:]) in word_set
        for cut in range(1, token_count)
    )

# Example usage
# Swap this sample for your own list of roughly 1000 entries
words_list = [
    "hello",
    "world",
    "python",
    "programming",
    "code",
    "machine",
    "learning",
    "artificial",
    "intelligence",
    "data",
    "science",
    # ... your full list of 1000 words/phrases
]

# Turn the list into a set once so that later lookups are fast
word_set = load_word_list(words_list)

# Test the function
def test_word_combination_checker():
    """
    Print the two-word check result for a handful of sample inputs.

    Looks the samples up in the module-level ``word_set`` and prints one
    ``'<input>': <result>`` line per sample.
    """
    samples = (
        "machine learning",         # both words are in the sample list
        "artificial intelligence",  # both words are in the sample list
        "hello world",              # both words are in the sample list
        "python code",              # result depends on the list in use
        "random phrase",            # neither word is in the sample list
        "single",                   # one word only, nothing to split
    )

    for sample in samples:
        verdict = is_two_word_combination(sample, word_set)
        print(f"'{sample}': {verdict}")

# Run the sample checks; prints one result line per input
test_word_combination_checker()

'machine learning': True
'artificial intelligence': True
'hello world': True
'python code': True
'random phrase': False
'single': False


In [12]:
def load_word_list(words_list):
    """
    Collect the dictionary entries into a set.

    :param words_list: List of words or phrases that form the dictionary
    :return: Set containing every distinct entry, for fast membership checks
    """
    entries = set()
    entries.update(words_list)
    return entries

def is_word_combination(input_string, word_set):
    """
    Check if the input string is a concatenation of entries from the word set.

    The input is split on whitespace and treated as a sequence of words. It is
    accepted when that sequence can be cut into one or more consecutive chunks
    such that every chunk, re-joined with single spaces, is a member of
    ``word_set``. A single chunk counts, so an input that is itself an entry
    returns True.

    :param input_string: String to check
    :param word_set: Set of valid words/phrases
    :return: True if the whole input can be covered by entries of the set;
             False otherwise, including for empty or whitespace-only input
    """
    words = input_string.split()
    word_count = len(words)

    # Nothing to match: without this guard an empty input would be reported
    # as a valid combination of zero entries.
    if word_count == 0:
        return False

    # can_finish[i] is True when words[i:] can be fully covered by entries.
    # The table is filled from the end backwards, so each suffix is solved
    # once and no recursion is needed (long inputs cannot hit the
    # interpreter's recursion limit).
    can_finish = [False] * word_count + [True]
    for start in range(word_count - 1, -1, -1):
        can_finish[start] = any(
            # Check the cheap flag first: the phrase is only built when the
            # remainder after it is already known to be coverable.
            can_finish[end] and ' '.join(words[start:end]) in word_set
            for end in range(start + 1, word_count + 1)
        )

    return can_finish[0]

# Example usage
def test_word_combination_checker():
    """
    Print the combination check result for a set of sample inputs.

    Builds a small dictionary that holds single words as well as multi-word
    phrases, then prints one ``'<input>': <result>`` line per sample.
    """
    vocabulary = load_word_list([
        # single words together with the phrases built from them
        "machine", "learning", "machine learning",
        "artificial", "intelligence", "artificial intelligence",
        "deep", "neural", "network", "deep neural network",
        "natural", "language", "processing",
        "natural language processing",
        "data", "science", "data science",
    ])

    samples = (
        "machine learning",                          # phrase entry
        "artificial intelligence",                   # phrase entry
        "deep neural network",                       # three-word phrase entry
        "machine learning artificial intelligence",  # two phrases back to back
        "data science processing",                   # phrase followed by a word
        "random phrase",                             # no word is an entry
        "single",                                    # one word, not an entry
        "deep neural network processing",            # phrase followed by a word
        "deep neural network",
        "data",                                      # one word that is an entry
        "deep neural network data",
        "data deep neural network",
        "data network",                              # two single-word entries
    )

    for sample in samples:
        outcome = is_word_combination(sample, vocabulary)
        print(f"'{sample}': {outcome}")

# Run the sample checks; prints one result line per input
test_word_combination_checker()

'machine learning': True
'artificial intelligence': True
'deep neural network': True
'machine learning artificial intelligence': True
'data science processing': True
'random phrase': False
'single': False
'deep neural network processing': True
'deep neural network': True
'data': True
'deep neural network data': True
'data deep neural network': True
'data network': True


In [34]:
import csv

def read_column_from_csv(file_path, column_name='Title Name'):
    """
    Read every value of one column from a CSV file that has a header row.

    :param file_path: Path to the CSV file
    :param column_name: Header name of the column to extract
    :return: List of the column's values, in file order
    :raises KeyError: If a data row has no column named ``column_name``
    :raises OSError: If the file cannot be opened
    """
    # newline='' leaves line-ending handling to the csv module, so line
    # breaks inside quoted fields come back exactly as written.
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise stay attached to the first header name.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
        return [row[column_name] for row in csv.DictReader(csvfile)]

# Load the title column of the local CSV file into a module-level list
file_path = 'final.csv'
title_names = read_column_from_csv(file_path)

# Show the loaded titles
print(title_names)



In [32]:
def load_word_list(words_list):
    """
    Turn the dictionary entries into a set.

    :param words_list: List of words or phrases that form the dictionary
    :return: Set with one copy of every entry, for fast membership checks
    """
    return set().union(words_list)

def is_word_combination(input_string, word_set):
    """
    Check if the input string is a concatenation of entries from the word set.

    The input is split on whitespace and treated as a sequence of words. It is
    accepted when that sequence can be cut into one or more consecutive chunks
    such that every chunk, re-joined with single spaces, is a member of
    ``word_set``. A single chunk counts, so an input that is itself an entry
    returns True.

    :param input_string: String to check
    :param word_set: Set of valid words/phrases
    :return: True if the whole input can be covered by entries of the set;
             False otherwise, including for empty or whitespace-only input
    """
    words = input_string.split()
    word_count = len(words)

    # Nothing to match: without this guard an empty input would be reported
    # as a valid combination of zero entries.
    if word_count == 0:
        return False

    # can_finish[i] is True when words[i:] can be fully covered by entries.
    # The table is filled from the end backwards, so each suffix is solved
    # once and no recursion is needed (long inputs cannot hit the
    # interpreter's recursion limit).
    can_finish = [False] * word_count + [True]
    for start in range(word_count - 1, -1, -1):
        can_finish[start] = any(
            # Check the cheap flag first: the phrase is only built when the
            # remainder after it is already known to be coverable.
            can_finish[end] and ' '.join(words[start:end]) in word_set
            for end in range(start + 1, word_count + 1)
        )

    return can_finish[0]

# Example usage
def test_word_combination_checker():
    """
    Print the combination check result for a set of sample titles.

    Uses the module-level ``title_names`` list as the dictionary and prints
    one ``'<input>': <result>`` line per sample.
    """
    known_titles = load_word_list(title_names)

    samples = (
        "JAN JAGRAN YOGIC SCIENCES",
        "DAINIK JAGRAN",
        "DAINIK JAGRAN JAN JAGRAN YOGIC SCIENCES",
        "JAN JAGRAN YOGIC SCIENCES DAINIK JAGRAN",
        "DAINIK YOGIC SCIENCES",
        "DAINIK JAN JAGRAN YOGIC SCIENCES",
        "YOGIC SCIENCES",
    )

    for sample in samples:
        outcome = is_word_combination(sample, known_titles)
        print(f"'{sample}': {outcome}")

# Run the sample checks; prints one result line per input
test_word_combination_checker()

'JAN JAGRAN YOGIC SCIENCES': True
'DAINIK JAGRAN': True
'DAINIK JAGRAN JAN JAGRAN YOGIC SCIENCES': True
'JAN JAGRAN YOGIC SCIENCES DAINIK JAGRAN': True
'DAINIK YOGIC SCIENCES': False
'DAINIK JAN JAGRAN YOGIC SCIENCES': False
'YOGIC SCIENCES': False


### Local CSV version

In [40]:
import csv

def read_column_from_csv(file_path, column_name='Title Name'):
    """
    Read a specific column from a CSV file that has a header row.

    :param file_path: Path to the CSV file
    :param column_name: Name of the column to extract
    :return: List of values from the specified column, in file order
    :raises KeyError: If a data row has no column named ``column_name``
    :raises OSError: If the file cannot be opened
    """
    # newline='' leaves line-ending handling to the csv module, so line
    # breaks inside quoted fields come back exactly as written.
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise stay attached to the first header name.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        return [row[column_name] for row in csv_reader]

def load_word_list(words_list):
    """
    Build the lookup set used by the combination checker.

    :param words_list: List of words or phrases that form the dictionary
    :return: Set containing each distinct entry, for fast membership checks
    """
    lookup = {*words_list}
    return lookup

def is_word_combination(input_string, word_set):
    """
    Check if the input string is a concatenation of entries from the word set.

    The input is split on whitespace and treated as a sequence of words. It is
    accepted when that sequence can be cut into one or more consecutive chunks
    such that every chunk, re-joined with single spaces, is a member of
    ``word_set``. A single chunk counts, so an input that is itself an entry
    returns True.

    :param input_string: String to check
    :param word_set: Set of valid words/phrases
    :return: True if the whole input can be covered by entries of the set;
             False otherwise, including for empty or whitespace-only input
    """
    words = input_string.split()
    word_count = len(words)

    # Nothing to match: without this guard an empty input would be reported
    # as a valid combination of zero entries.
    if word_count == 0:
        return False

    # can_finish[i] is True when words[i:] can be fully covered by entries.
    # The table is filled from the end backwards, so each suffix is solved
    # once and no recursion is needed (long inputs cannot hit the
    # interpreter's recursion limit).
    can_finish = [False] * word_count + [True]
    for start in range(word_count - 1, -1, -1):
        can_finish[start] = any(
            # Check the cheap flag first: the phrase is only built when the
            # remainder after it is already known to be coverable.
            can_finish[end] and ' '.join(words[start:end]) in word_set
            for end in range(start + 1, word_count + 1)
        )

    return can_finish[0]

def test_word_combination_checker(file_path='final.csv'):
    """
    Print the combination check result for sample titles, using the titles
    stored in a CSV file as the dictionary.

    :param file_path: Path to the CSV file containing titles
    """
    # The CSV's title column becomes the lookup set
    known_titles = load_word_list(read_column_from_csv(file_path))

    samples = (
        "JAN JAGRAN YOGIC SCIENCES",
        "DAINIK JAGRAN",
        "DAINIK JAGRAN JAN JAGRAN YOGIC SCIENCES",
        "JAN JAGRAN YOGIC SCIENCES DAINIK JAGRAN",
        "DAINIK YOGIC SCIENCES",
        "DAINIK JAN JAGRAN YOGIC SCIENCES",
        "YOGIC SCIENCES",
        "HINDUSTAN TIMES",
        "TECHNOLOGY TODAY",
        "TECHNOLOGY TODAY HINDUSTAN TIMES",
        "HINDUSTAN TIMES TECHNOLOGY TODAY",
        "HINDUSTAN TECHNOLOGY TODAY",
    )

    print("Word Combination Test Results:")
    for sample in samples:
        outcome = is_word_combination(sample, known_titles)
        print(f"'{sample}': {outcome}")

# Run the sample checks only when this code is executed as the main module
# (they are skipped when it is imported)
if __name__ == "__main__":
    test_word_combination_checker()

Word Combination Test Results:
'JAN JAGRAN YOGIC SCIENCES': True
'DAINIK JAGRAN': True
'DAINIK JAGRAN JAN JAGRAN YOGIC SCIENCES': True
'JAN JAGRAN YOGIC SCIENCES DAINIK JAGRAN': True
'DAINIK YOGIC SCIENCES': False
'DAINIK JAN JAGRAN YOGIC SCIENCES': False
'YOGIC SCIENCES': False
'HINDUSTAN TIMES': True
'TECHNOLOGY TODAY': True
'TECHNOLOGY TODAY HINDUSTAN TIMES': True
'HINDUSTAN TIMES TECHNOLOGY TODAY': True
'HINDUSTAN TECHNOLOGY TODAY': False


### Vector DB version

In [None]:
import sqlalchemy as sa
from sqlalchemy import create_engine, text
from typing import List, Set

def fetch_column_from_database(
    connection_string: str,
    table_name: str,
    column_name: str = 'Title Name'
) -> List[str]:
    """
    Fetch the distinct non-NULL values of one column from a database table.

    Table and column names are quoted with the connected dialect's identifier
    rules before they are placed in the query. Note that SQLAlchemy's quoting
    rules also quote names containing upper-case letters, which makes them
    case-sensitive on databases that fold unquoted names.

    :param connection_string: SQLAlchemy database connection string
    :param table_name: Name of the table to query; a dotted name is treated
                       as ``schema.table`` and each part is quoted separately
    :param column_name: Name of the column to extract
    :return: List of distinct values from the specified column; an empty list
             if anything fails (the error is printed, not raised)
    """
    engine = None
    try:
        # Create engine
        engine = create_engine(connection_string)

        # Identifiers cannot be sent as bound parameters, so they are quoted
        # with the dialect's own rules instead of being pasted in raw.
        # Without this the default column 'Title Name' (it contains a space)
        # does not form a valid column reference, and a crafted name could
        # inject SQL.
        quote = engine.dialect.identifier_preparer.quote
        quoted_column = quote(column_name)
        quoted_table = '.'.join(quote(part) for part in table_name.split('.'))
        query = text(f"SELECT DISTINCT {quoted_column} FROM {quoted_table}")

        # Establish connection and read the values while it is still open
        with engine.connect() as connection:
            result = connection.execute(query)
            return [row[0] for row in result if row[0] is not None]

    except Exception as e:
        # Best-effort helper: report the problem and fall back to no titles
        print(f"Error fetching data from database: {e}")
        return []

    finally:
        # Release the connection pool owned by the engine created above
        if engine is not None:
            engine.dispose()

def load_word_list(words_list: List[str]) -> Set[str]:
    """
    Build the lookup set used by the combination checker.

    :param words_list: List of words or phrases that form the dictionary
    :return: Set containing each distinct entry, for fast membership checks
    """
    unique_entries: Set[str] = set()
    unique_entries.update(words_list)
    return unique_entries

def is_word_combination(input_string: str, word_set: Set[str]) -> bool:
    """
    Check if the input string is a concatenation of entries from the word set.

    The input is split on whitespace and treated as a sequence of words. It is
    accepted when that sequence can be cut into one or more consecutive chunks
    such that every chunk, re-joined with single spaces, is a member of
    ``word_set``. A single chunk counts, so an input that is itself an entry
    returns True.

    :param input_string: String to check
    :param word_set: Set of valid words/phrases
    :return: True if the whole input can be covered by entries of the set;
             False otherwise, including for empty or whitespace-only input
    """
    words = input_string.split()
    word_count = len(words)

    # Nothing to match: without this guard an empty input would be reported
    # as a valid combination of zero entries.
    if word_count == 0:
        return False

    # can_finish[i] is True when words[i:] can be fully covered by entries.
    # The table is filled from the end backwards, so each suffix is solved
    # once and no recursion is needed (long inputs cannot hit the
    # interpreter's recursion limit).
    can_finish: List[bool] = [False] * word_count + [True]
    for start in range(word_count - 1, -1, -1):
        can_finish[start] = any(
            # Check the cheap flag first: the phrase is only built when the
            # remainder after it is already known to be coverable.
            can_finish[end] and ' '.join(words[start:end]) in word_set
            for end in range(start + 1, word_count + 1)
        )

    return can_finish[0]

def test_word_combination_checker(
    connection_string: str,
    table_name: str,
    column_name: str = 'Title Name'
):
    """
    Print the combination check result for sample titles, using the titles
    stored in a database column as the dictionary.

    :param connection_string: SQLAlchemy database connection string
    :param table_name: Name of the table to query
    :param column_name: Name of the column to extract
    """
    # The database column becomes the lookup set
    known_titles = load_word_list(
        fetch_column_from_database(connection_string, table_name, column_name)
    )

    samples = (
        "JAN JAGRAN YOGIC SCIENCES",
        "DAINIK JAGRAN",
        "DAINIK JAGRAN JAN JAGRAN YOGIC SCIENCES",
        "JAN JAGRAN YOGIC SCIENCES DAINIK JAGRAN",
        "DAINIK YOGIC SCIENCES",
        "DAINIK JAN JAGRAN YOGIC SCIENCES",
        "YOGIC SCIENCES",
    )

    print("Word Combination Test Results:")
    for sample in samples:
        outcome = is_word_combination(sample, known_titles)
        print(f"'{sample}': {outcome}")

# Example usage: runs only when this code is executed as the main module
if __name__ == "__main__":
    # The three values below are placeholders; replace them with your actual
    # database connection string and table details before running.
    # Connection string examples:
    # PostgreSQL: "postgresql://username:password@host:port/database"
    # MySQL: "mysql+pymysql://username:password@host:port/database"
    # SQLite: "sqlite:///path/to/database.db"
    CONNECTION_STRING = "your_database_connection_string_here"
    TABLE_NAME = "your_table_name"
    COLUMN_NAME = "Title Name"  # Optional, defaults to 'Title Name'
    
    # Fetch the titles and print the check result for each sample input
    test_word_combination_checker(
        CONNECTION_STRING, 
        TABLE_NAME, 
        COLUMN_NAME
    )

In [None]:
import numpy as np
from typing import List, Set, Tuple, Optional

class VectorWordCombinationChecker:
    """
    Check whether a string can be cut into phrases that each closely match a
    stored title, using cosine similarity between vectors.

    ``encode_text`` is a placeholder and must be replaced (or overridden in a
    subclass) before ``find_similar_phrases`` or ``is_word_combination`` can
    be used.
    """

    def __init__(self, vector_db_connection):
        """
        Initialize the word combination checker with vector database connection.

        :param vector_db_connection: Connection to your vector database
        """
        self.vector_db = vector_db_connection
        self.titles = []
        self.title_vectors = []
        # Cut-off used by is_word_combination() when the caller passes none
        self.similarity_threshold = 0.9  # Adjust based on your data

    def fetch_titles_and_vectors(
        self,
        collection_name: str,
        title_field: str = 'title',
        vector_field: str = 'vector'
    ):
        """
        Fetch titles and their vectors from the vector database and store
        them on the instance (``self.titles`` / ``self.title_vectors``).

        :param collection_name: Name of the collection/table in vector DB
        :param title_field: Field name for titles
        :param vector_field: Field name for vectors
        """
        # Placeholder for vector DB query - replace with your specific vector DB method
        results = self.vector_db.query(
            collection_name=collection_name,
            query_vector=None,  # No specific query, fetch all
            include=[title_field, vector_field]
        )

        # Walk the results exactly once: if the query hands back a one-shot
        # iterator, a second pass would find it already exhausted and the
        # vectors would come out empty.
        titles = []
        vectors = []
        for result in results:
            titles.append(result[title_field])
            vectors.append(result[vector_field])

        self.titles = titles
        self.title_vectors = vectors

    def compute_cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """
        Compute cosine similarity between two vectors.

        :param vec1: First vector
        :param vec2: Second vector
        :return: Cosine similarity score; 0.0 if either vector has zero length
        """
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

        # A zero-length vector has no direction: report "no similarity"
        # instead of dividing by zero and returning NaN.
        if norm_product == 0:
            return 0.0

        return float(np.dot(vec1, vec2) / norm_product)

    def find_similar_phrases(
        self,
        input_string: str,
        max_combinations: int = 5
    ) -> List[Tuple[str, float]]:
        """
        Find the stored titles most similar to an input string.

        :param input_string: String to find similar titles for
        :param max_combinations: Maximum number of titles to return
        :return: List of ``(title, similarity)`` pairs, best match first
        """
        # Compute vector for input string - replace with your vector encoding method
        input_vector = self.encode_text(input_string)

        # Score every stored title against the input
        scored_titles = [
            (title, self.compute_cosine_similarity(input_vector, vec))
            for title, vec in zip(self.titles, self.title_vectors)
        ]

        # Best matches first
        scored_titles.sort(key=lambda pair: pair[1], reverse=True)

        return scored_titles[:max_combinations]

    def is_word_combination(
        self,
        input_string: str,
        similarity_threshold: Optional[float] = None
    ) -> bool:
        """
        Check if the input string can be formed from database titles.

        The input is split on whitespace. It is accepted when the word
        sequence can be cut into one or more consecutive phrases such that
        each phrase has at least one stored title whose similarity reaches
        the threshold.

        :param input_string: String to check
        :param similarity_threshold: Minimum similarity to consider a match;
                                     ``None`` (the default) uses
                                     ``self.similarity_threshold``
        :return: True if the whole input can be covered by matching phrases;
                 False otherwise, including for empty or whitespace-only input
        """
        if similarity_threshold is None:
            similarity_threshold = self.similarity_threshold

        # Split into words
        words = input_string.split()
        word_count = len(words)

        # Nothing to match: an empty input is not a combination of titles
        if word_count == 0:
            return False

        def matches_a_title(phrase: str) -> bool:
            # True when some returned title scores at or above the threshold
            return any(
                score >= similarity_threshold
                for _, score in self.find_similar_phrases(phrase)
            )

        # can_finish[i] is True when words[i:] can be fully covered.
        # Filled from the end backwards so no recursion is needed.
        can_finish = [False] * word_count + [True]
        for start in range(word_count - 1, -1, -1):
            can_finish[start] = any(
                # Check the cheap flag first so a phrase is only encoded and
                # scored when the remainder after it is already coverable.
                can_finish[end] and matches_a_title(' '.join(words[start:end]))
                for end in range(start + 1, word_count + 1)
            )

        return can_finish[0]

    def encode_text(self, text: str) -> np.ndarray:
        """
        Encode text into a vector.
        Replace this method with your specific text-to-vector encoding.

        :param text: Input text
        :return: Vector representation
        :raises NotImplementedError: Always, until the method is replaced
        """
        # Placeholder - replace with your vector encoding method
        # This could be a pre-trained embedding model like BERT, Word2Vec, etc.
        raise NotImplementedError("Replace with your text encoding method")

def test_vector_word_combination():
    """
    Demonstrate the vector-based checker on a set of sample titles.

    Prints the check result and the most similar stored titles for every
    sample. The database connection below is a placeholder that has to be
    replaced with a real one.
    """
    # Replace with your actual vector DB connection
    vector_db_connection = None  # Your vector DB connection
    checker = VectorWordCombinationChecker(vector_db_connection)

    # Load the stored titles and vectors into the checker
    checker.fetch_titles_and_vectors(
        collection_name='your_collection_name',
        title_field='title',
        vector_field='vector'
    )

    samples = (
        "JAN JAGRAN YOGIC SCIENCES",
        "DAINIK JAGRAN",
        "DAINIK JAGRAN JAN JAGRAN YOGIC SCIENCES",
        "JAN JAGRAN YOGIC SCIENCES DAINIK JAGRAN",
        "DAINIK YOGIC SCIENCES",
        "DAINIK JAN JAGRAN YOGIC SCIENCES",
        "YOGIC SCIENCES",
    )

    print("Vector Word Combination Test Results:")
    for sample in samples:
        verdict = checker.is_word_combination(sample)
        print(f"'{sample}': {verdict}")

        # Closest stored titles, shown for debugging
        neighbours = checker.find_similar_phrases(sample)
        print("Similar Phrases:", neighbours)

# Run the demonstration only when this code is executed as the main module
if __name__ == "__main__":
    test_vector_word_combination()