In [1]:
from langchain.tools import tool
from duckduckgo_search import DDGS
import re

@tool
def discover_infosys_competitors(target: str = "Virgin Media") -> list:
    """
    Discovers companies that have collaborated with the target company and are potential competitors to Infosys.

    Args:
        target: The company to investigate for collaborations (default is 'Virgin Media').

    Returns:
        A list of dictionaries containing 'company_name', 'link', and 'snippet'.
    """
    queries = [
        f'"{target}" partners with',
        f'"{target}" collaboration with',
         f'"{target}" strategic partnership'
    ]
    potential_competitors = set()
    results = []

    with DDGS() as ddgs:
        for query in queries:
            for r in ddgs.text(query, region="wt-wt", safesearch="off", max_results=2):
                snippet = r.get("body", "")
                # Simple regex to extract capitalized words (company names)
                companies = re.findall(r'\b[A-Z][a-zA-Z&.\s]{2,}\b', snippet)
                for company in companies:
                    company = company.strip()
                    if company.lower() not in [target.lower(), "Infosys".lower()] and company not in potential_competitors:
                        potential_competitors.add(company)
                        results.append({
                            "company_name": company,
                            "link": r.get("href", ""),
                            "snippet": snippet
                        })
    return results

In [2]:
result_list = discover_infosys_competitors.invoke({"target": "Virgin Media"})

In [5]:
len(result_list)

15

In [31]:
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import WebBaseLoader
import os
from dotenv import load_dotenv 
load_dotenv()
import re
import time

def analyze_collaborators_with_llm(result_list: list, api_key: str) -> list:
    """
    Analyzes if the companies in the result list are actually collaborating with Virgin Media
    by loading and analyzing the web page content.
    
    Args:
        result_list: A list of dictionaries with 'company_name', 'link', and 'snippet'.
        api_key: Your Groq API key.
    
    Returns:
        A filtered list of companies with confirmed collaborations and details.
    """
    # Initialize the chat model using Groq

    assign_api_key = os.getenv("COMPETION_BOT_API_KEY")

    model = init_chat_model(
        model="qwen-qwq-32b",
        model_provider="groq",
        temperature=0.5,
        max_tokens=2000,
        api_key=assign_api_key
    )
    
    # Define the prompt template
    prompt = PromptTemplate.from_template(
        """You are an expert business analyst researching partnerships between companies.

        I need you to analyze this web page content to identify if there are any companies that are collaborating 
        with Virgin Media (as partners, vendors, service providers, etc.).

        Web Page Content:
        {context}

        Please follow these steps:
        1. Identify any company names mentioned in the content that appear to be partners or collaborators with Virgin Media
        2. For each company, determine if there's clear evidence they're working with Virgin Media
        3. If possible, identify what type of work they're doing with Virgin Media
        4. Note whether it's a current or past collaboration

        Format your response exactly like this:
        IDENTIFIED COMPANIES:
        - [Company Name 1]: [Current/Past] collaboration. [Brief description of work]
        - [Company Name 2]: [Current/Past] collaboration. [Brief description of work]
        
        If no companies are clearly identified as collaborators, respond with "No clear collaborators identified."
        """
    )
    
    # Create the LLM chain
    chains = create_stuff_documents_chain(model, prompt)
    
    confirmed_collaborators = []
    processed_urls = set()  # To avoid processing the same URL multiple times
    
    for source in result_list:
        url = source["link"]
        
        # Skip if we've already processed this URL
        if url in processed_urls:
            continue
            
        processed_urls.add(url)
        
        try:
            print(f"Loading content from: {url}")
            
            # Load the web page content
            loader = WebBaseLoader(url)
            documents = loader.load()
            full_text = documents[0].page_content
            
            # Use first 4000 characters to keep within token limits
            truncated_text = full_text[:4000]
            
            # Run the LLM chain
            print("Analyzing content with LLM...")
            response = chains.invoke({"context": documents })
            
            # Parse the results if companies were found
            if "No clear collaborators identified" not in response:
                # Extract company information using regex
                company_pattern = r'- (.*?)(?=\n|\Z)'
                companies_raw = re.findall(company_pattern, response)
                
                for company_info in companies_raw:
                    # Extract company name and details
                    name_match = re.match(r'(.*?):', company_info)
                    if name_match:
                        company_name = name_match.group(1).strip()
                        
                        # Check if it's current or past
                        if "Current" in company_info:
                            relationship_status = "Current"
                        elif "Past" in company_info:
                            relationship_status = "Past"
                        else:
                            relationship_status = "Unknown"
                        
                        # Extract description of work
                        work_description = re.sub(r'^.*?(Current|Past|Unknown).*?\.', '', company_info).strip()
                        
                        confirmed_collaborators.append({
                            "company_name": company_name,
                            "relationship_status": relationship_status,
                            "work_description": work_description,
                            "link": url,
                            "evidence": truncated_text[:200] + "..." # Short preview of evidence
                        })
                        
                        print(f"Found collaborator: {company_name} - {relationship_status}")
            else:
                print("No collaborators found on this page")
                
            # Be kind to APIs and web servers
            time.sleep(2)
            
        except Exception as e:
            print(f"Error processing {url}: {e}")
    
    return confirmed_collaborators

In [32]:
assign_api_key = os.getenv("COMPETION_BOT_API_KEY")
collaborators = analyze_collaborators_with_llm(result_list, assign_api_key)

Loading content from: https://www.virginmedia.com/partner-with-us
Analyzing content with LLM...
Error processing https://www.virginmedia.com/partner-with-us: Error code: 413 - {'error': {'message': 'Request too large for model `qwen-qwq-32b` in organization `org_01jgdznc1zfj393pp1khsg55jx` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 17657, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Loading content from: https://www.virginmediapartners.com/vmmap
Analyzing content with LLM...
Found collaborator: **O2** - Current
Loading content from: https://www.virginmedia.com/corporate/sustainability/stories/creating-a-best-practice-charity-partnership
Analyzing content with LLM...
Found collaborator: **Scope** - Past
Found collaborator: **O2** - Past
Loading content from: https://www.virgin.com/about-virgin/latest/virgin-media-t

In [33]:
print(f"\nTotal confirmed collaborators: {len(collaborators)}")
for collab in collaborators:
        print(f"\nCompany: {collab['company_name']}")
        print(f"Status: {collab['relationship_status']}")
        print(f"Work: {collab['work_description']}")
        print(f"Source: {collab['link']}")


Total confirmed collaborators: 11

Company: **O2**
Status: Current
Work: O2 is partnered with Virgin Media to provide bundled products that include an O2 SIM, as indicated by shared use of customer data outlined in their joint privacy policies.
Source: https://www.virginmediapartners.com/vmmap

Company: **Scope**
Status: Past
Work: Strategic partnership focused on disability employment initiatives, including the Support to Work service and the #WorkWithMe campaign, aiming to improve employment opportunities for disabled individuals.
Source: https://www.virginmedia.com/corporate/sustainability/stories/creating-a-best-practice-charity-partnership

Company: **O2**
Status: Past
Work: Mentioned in the web page’s menu under "Shop O2 Plans," but not explicitly described as a partner in the partnership context discussed in the article. Likely a service provider or affiliate for mobile plans, not a charitable or strategic partner like Scope.
Source: https://www.virginmedia.com/corporate/sustai

In [35]:
company_names = [(item['company_name'], item['relationship_status']) for item in collaborators]

In [36]:
company_names

[('**O2**', 'Current'),
 ('**Scope**', 'Past'),
 ('**O2**', 'Past'),
 ('Spotify', 'Past'),
 ('8x8', 'Current'),
 ('**Sky**', 'Current'),
 ('**NBCUniversal**', 'Current'),
 ('**Channel 5**', 'Current'),
 ('**Fox International Channels**', 'Current'),
 ('**A+E Networks**', 'Current'),
 ('**Viacom**', 'Current')]