# Automated Company Research 🧊

##### 💡 **Research Areas:** Rapid Prototyping, Generative AI, Information Retrieval, Langchain Basics, AI-assisted Market Research, Entrepreneurial Hacking.

#### This is a simple prototype for automated company research. We wanted a tool to boost our client outreach process, either by saving time or by helping us come up with better ideas. We built this prototype to gather valuable information about companies from the public internet, so that we can understand their business and develop better solutions for them.

## 🔥 Powered by LangChain, Jina AI & Google Gemini!

<div style="display:flex; align-items:center; padding: 50px;">
<p style="margin-right:10px;">
    <img height="300px" style="width:auto;" width="200px" src="https://avatars.githubusercontent.com/u/192148546?s=400&u=95d76fbb02e6c09671d87c9155f17ca1e4ef8f21&v=4"> 
</p>
</div>


In [8]:
import os

requirements_installed = False
max_retries = 3
retries = 0


def install_requirements():
    """Install the packages listed in ``requirements.txt``.

    ``pip`` is run through the interpreter executing this code
    (``sys.executable -m pip``), so the packages are installed into the
    environment this process imports from rather than into whichever
    ``pip`` happens to be first on ``PATH``.

    Module-level state:
        requirements_installed: set to ``True`` after a successful install;
            later calls then return immediately.
        retries, max_retries: retries used so far and the retry budget.
            The counter is shared across calls and is never reset.

    Raises:
        SystemExit: with code 1 when the install fails and the retry budget
            is exhausted.
    """
    # Local imports: only this function needs them.
    import subprocess
    import sys

    global requirements_installed
    global retries

    if requirements_installed:
        print("Requirements already installed.")
        return

    command = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]

    # Loop instead of recursing: one stack frame however many retries happen.
    while True:
        print("Installing requirements...")
        install_status = subprocess.run(command).returncode
        if install_status == 0:
            print("Requirements installed successfully.")
            requirements_installed = True
            return

        print("Failed to install requirements.")
        if retries >= max_retries:
            # Equivalent to the interactive exit(1) helper in a plain
            # interpreter, without depending on that helper being available.
            raise SystemExit(1)
        print("Retrying...")
        retries += 1

In [None]:
# Run the installer defined in the previous cell.
install_requirements()

In [10]:
from dotenv import load_dotenv
import os


def setup_env():
    """Load ``.env`` and verify that the required API keys are available.

    Every required variable is checked and reported before exiting, so a
    single run lists all the keys that still need to be configured. A
    variable that is set to an empty string counts as missing.

    Raises:
        SystemExit: with code 1 when at least one required variable is
            missing.
    """
    load_dotenv()

    variables_to_check = ["OPENAI_API_KEY", "GEMINI_API_KEY", "SERPER_API_KEY"]

    missing = []
    for env_var in variables_to_check:
        if os.getenv(env_var):
            print(f"{env_var} is set.")
        else:
            print(f"Please set the {env_var} environment variable.")
            missing.append(env_var)

    if missing:
        # Equivalent to exit(1) in a plain interpreter, without relying on
        # the interactive helper being available.
        raise SystemExit(1)

In [None]:
# Load .env and stop early if a required API key is missing.
setup_env()

In [12]:
from langchain_community.utilities import GoogleSerperAPIWrapper
from openai import OpenAI
import json


def simple_serper_openai_search(
    query="What is the current latest news in New York?", verbose=False
):
    """Search with Google Serper and return an OpenAI-written summary.

    The raw search results are serialized to indented JSON, embedded in a
    prompt, and sent to the ``gpt-4o`` chat model. With ``verbose=True`` both
    the raw JSON and the resulting summary are printed.

    Returns the summary text produced by the model.
    """
    serper = GoogleSerperAPIWrapper()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    search_results_json = json.dumps(serper.results(query), indent=4)
    if verbose:
        print("Search Results Found:\n", search_results_json)

    # The prompt text (including its internal indentation) is sent verbatim.
    user_prompt = f"""Given a JSON of search results, neatly format the data into a summary. 
             The search results are as follows: {search_results_json}. 
             Don't miss out any details, make sure everything in the JSON is included in a human readable format.
             Don't hallucinate or make up factual information."""
    conversation = [
        {"role": "system", "content": "You are a summarizer of search results."},
        {"role": "user", "content": user_prompt},
    ]

    completion = client.chat.completions.create(
        messages=conversation,
        model="gpt-4o",
    )
    summary = completion.choices[0].message.content
    if verbose:
        print("Summary of Search Results:\n", summary)
    return summary

In [None]:
from IPython.display import Markdown, display, clear_output

# Demo: search, summarize, then render the summary as Markdown.
query = "What is the latest news in California?"
# verbose=True prints the raw results and the summary while the call runs.
response = simple_serper_openai_search(query=query, verbose=True)
markdown = Markdown(f"### {query}\n{response}")
# Clear the output printed so far, then show the rendered Markdown.
clear_output()
display(markdown)

In [15]:
import requests
import json


def jina_search(query, timeout=30):
    """Search the web for ``query`` through the Jina AI search endpoint.

    Args:
        query: Free-text search query. It is percent-encoded before being
            placed in the URL path, so characters such as ``?``, ``/`` and
            spaces are sent as part of the query instead of being
            interpreted as URL syntax.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The response body as text (markdown is requested through the
        ``X-Return-Format`` header).

    Raises:
        RuntimeError: if the ``JINA_API_KEY`` environment variable is unset.
        requests.HTTPError: if the service answers with an error status.
    """
    from urllib.parse import quote

    # NOTE: the API key used to be hard-coded here. Any key that was
    # committed should be treated as compromised and rotated; the key is
    # now read from the environment only.
    api_key = os.getenv("JINA_API_KEY")
    if not api_key:
        raise RuntimeError("Please set the JINA_API_KEY environment variable.")

    encoded_query = quote(query, safe="")
    url = f"https://s.jina.ai/{encoded_query}"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "X-Retain-Images": "none",
        "X-Return-Format": "markdown",
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly rather than feeding an error page into the research notes.
    response.raise_for_status()
    return response.text


def regular_search(query):
    """Return the raw Google Serper API results for ``query``."""
    return GoogleSerperAPIWrapper().results(query)


def dedupe_links(links):
    """Return ``links`` without duplicates, keeping first-seen order.

    ``dict.fromkeys`` preserves insertion order, so the result is
    deterministic from run to run (a ``set`` round-trip is not).
    """
    return list(dict.fromkeys(links))


def collect_links(response: dict):
    """Collect the result URLs from a search response.

    Args:
        response: Search response mapping. Its ``"organic"`` entry is read
            as a list of result dicts; each result contributes its
            ``"link"`` and, when present, the ``"link"`` of every item in
            its ``"sitelinks"`` list.

    Returns:
        The collected URLs after ``dedupe_links``. Empty when the response
        has no ``"organic"`` results.
    """
    links = []
    # `or []` covers both a missing key and an explicit None/empty value.
    for item in response.get("organic") or []:
        # Sitelinks are added before the result's own link.
        for sitelink in item.get("sitelinks") or []:
            links.append(sitelink["link"])
        links.append(item["link"])
    return dedupe_links(links)

In [24]:
## Link caching to save API credits
# In-memory cache mapping a search query to its list of links. It is written
# to ``links_cache_file`` every time an entry is added.
linksCache = {}
links_cache_file = "outputs/links_cache.json"


def load_links_cache():
    """Merge the on-disk cache into ``linksCache`` and return it.

    The shared dict is updated in place (not rebound), so entries loaded
    from disk are still present when the cache is saved again.

    Raises:
        OSError: if the cache file cannot be opened (e.g. it does not exist).
        ValueError: if the file is not valid JSON or not a JSON object.
    """
    with open(links_cache_file, "r", encoding="utf-8") as f:
        stored = json.load(f)
    if not isinstance(stored, dict):
        raise ValueError(f"{links_cache_file} does not contain a JSON object.")
    linksCache.update(stored)
    return linksCache


def save_links_cache():
    """Write ``linksCache`` to ``links_cache_file`` as JSON.

    The parent folder is created when it does not exist yet.
    """
    cache_dir = os.path.dirname(links_cache_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(links_cache_file, "w", encoding="utf-8") as f:
        json.dump(linksCache, f)


def add_links_to_cache(query, links):
    """Store ``links`` under ``query`` and persist the whole cache."""
    linksCache[query] = links
    save_links_cache()


def get_links_from_regular_search(query):
    """Return the links for ``query``, using the cache when possible.

    On a cache miss the query is sent to ``regular_search``, the links are
    extracted with ``collect_links``, and the result is cached and saved.
    """
    try:
        # Pull previously saved entries into the shared cache first; without
        # this the next save would overwrite the file with only the entries
        # added in the current session.
        load_links_cache()
    except (OSError, ValueError):
        # No cache file yet, or it is unreadable: continue with what is
        # already in memory (best effort, as before).
        pass

    cached_links = linksCache.get(query)
    if cached_links:
        return cached_links

    links = collect_links(regular_search(query))
    add_links_to_cache(query, links)
    return links

In [None]:
import google.generativeai as genai
import os

# Gemini API key, read from the environment once when this cell runs.
gemini_api_key = os.getenv("GEMINI_API_KEY")
# Model name used by the Gemini helpers unless the caller passes another one.
DEFAULT_GEMINI_MODEL = "gemini-1.5-flash"


def get_gemini_client(model=DEFAULT_GEMINI_MODEL):
    """Configure the Gemini SDK with the module-level API key.

    Returns a ``genai.GenerativeModel`` for the model name given in ``model``.
    """
    genai.configure(api_key=gemini_api_key)
    return genai.GenerativeModel(model_name=model)


def gemini(query, model=DEFAULT_GEMINI_MODEL):
    """Send ``query`` to the given Gemini model and return the reply text."""
    client = get_gemini_client(model)
    return client.generate_content(query).text

In [18]:
def run_basic_reserach(company):
    """Collect raw search notes about ``company``.

    A fixed set of questions is sent to ``jina_search`` one at a time; each
    answer is placed under a ``### Search results for ...`` heading and all
    sections are concatenated into one markdown string, which is returned.
    """
    questions = [
        f"What is {company}?",
        f"What are the top goals of {company}?",
        f"Who are the founders of {company}?",
        f"Who are the competitors of {company}?",
        f"What is the latest news about {company}?",
    ]

    sections = []
    for question in questions:
        print(f"Searching for: {question}")
        answer = jina_search(question)
        heading = f"### Search results for {question}"
        sections.append(heading + "\n" + answer + "\n")
        print("Updated results.")

    return "".join(sections)

In [19]:
from IPython.display import Markdown, clear_output
from datetime import datetime
import math

# Prompt template for the final report. ``str.format`` fills in the
# ``{company}`` and ``{basic_research}`` placeholders (see
# create_reserach_report).
RESEARCH_REPORT_CREATION_PROMPT = """
    I'm giving you the raw research notes of {company}.
    Your job is to generate a compact, 5 page research report in markdown. 
    Label the page numbers, mark the sections well. 
    Make sure it is professional, factual and well structured.
    Don't hallucinate or make up information.
    Make sure each page has atleast 4 sections with 2 paragraphs each or atleast 10 bullet points.
    Filter only the information relevant to {company} and discard the rest.
    This is a "Company Research Report" for Responsible AI stakeholders and Global Hacker Groups that are Legal & Ethical.

    Raw Research Notes: '{basic_research}'
    """


def create_reserach_report(company: str, basic_research: str, output_file=None):
    """Turn raw research notes into a report using Gemini.

    Args:
        company: Company name inserted into the prompt.
        basic_research: Raw research notes inserted into the prompt.
        output_file: Optional path to write the report to. When omitted the
            report is only returned. Missing parent folders are created.

    Returns:
        The generated report text.
    """
    prompt = RESEARCH_REPORT_CREATION_PROMPT.format(
        company=company, basic_research=basic_research
    )
    print("Generating research report...")
    research_report = gemini(prompt)
    print("Research report generated.")
    clear_output()

    # The default output_file=None used to crash in open(); writing is now
    # skipped when no path is given.
    if output_file:
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Explicit UTF-8 so non-ASCII text in the report does not depend on
        # the platform's default encoding.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(research_report)
    return research_report


def generate_research_report(company: str, output_file=None):
    """Research ``company`` and return the report as a ``Markdown`` object.

    Args:
        company: Company name to research.
        output_file: Path the report is written to. When omitted, a name of
            the form ``<company>_research_report_<unix timestamp>.md`` is
            used.
    """
    if not output_file:
        unix_timestamp = math.floor(datetime.now().timestamp())
        # Plain {…} placeholder: the previous "${…}" left a literal "$" in
        # the file name.
        output_file = f"{company}_research_report_{unix_timestamp}.md"
    print(f"Running basic research on {company}...")
    basic_research = run_basic_reserach(company=company)
    print(f"Basic research completed. {len(basic_research)} characters found.")
    return Markdown(create_reserach_report(company, basic_research, output_file))

In [20]:
from datetime import datetime
import traceback


def generate_one_company_report(company: str):
    """Generate the research report for a single company.

    Returns:
        The report returned by ``generate_research_report``, or ``None``
        when generation fails (the traceback is printed instead of being
        raised).
    """
    company_start_time = datetime.now()
    try:
        print(f"Generating research report for {company}...")
        report = generate_research_report(
            company,
            # Plain {company}: the previous "${company}" put a literal "$"
            # in the file name.
            output_file=f"data/company_research/{company}_research_report_090825.md",
        )
    except Exception:
        print(f"Failed to generate research report for {company}.")
        traceback.print_exc()
        return None

    # total_seconds() covers the whole duration; .seconds drops whole days.
    elapsed = datetime.now() - company_start_time
    company_total_time_seconds = int(elapsed.total_seconds())
    print(f"Research report for {company} generated.")
    print(f"Time taken: {company_total_time_seconds} seconds")
    return report


def generate_batch_company_reports(companies: list):
    """Generate research reports for several companies, one after another.

    A failure for one company is printed and does not stop the batch. The
    time spent on successfully generated reports is summed and printed at
    the end.
    """
    print(f"Generating research reports for {len(companies)} companies.")
    total_time = 0
    for company in companies:
        company_start_time = datetime.now()
        try:
            print(f"Generating research report for {company}...")
            generate_research_report(
                company,
                # Plain {company}: the previous "${company}" put a literal
                # "$" in the file name.
                output_file=f"data/company_research/{company}_research_report_090825.md",
            )
        except Exception as e:
            print(f"Failed to generate research report for {company}.")
            print(f"Error: {e}")
            continue

        # total_seconds() covers the whole duration; .seconds drops whole days.
        elapsed = datetime.now() - company_start_time
        company_total_time_seconds = int(elapsed.total_seconds())
        total_time += company_total_time_seconds
        print(f"Research report for {company} generated.")
        print(f"Time taken: {company_total_time_seconds} seconds")
    print(f"Total time taken: {total_time} seconds.")

In [None]:
## Example: One Company Report

from IPython.display import Markdown, display, clear_output

company = "Vercel"

# Returns the report as a Markdown object, or None if generation failed.
report = generate_one_company_report(company)

# clear_output()
display(report)

In [17]:
## Example: Run reports on a batch of companies (commented to save API credits)

# batch_output_report = "outputs/batch_company_research_report_01.md"

# companies = [
#     "Google",
#     "Deloitte India",
#     "Zepto",
#     "Dyte",
# ]

# generate_batch_company_reports(companies)

In [22]:
import requests
import traceback
from markdownify import markdownify as md
from uuid import uuid4

DEFAULT_REQUEST_TIMEOUT = 30


def fetch_page_markdown(url, timeout=DEFAULT_REQUEST_TIMEOUT):
    """Download ``url`` and convert its HTML to markdown.

    Any failure (request error, error status, conversion error) is reported
    on stdout with a traceback and an empty string is returned instead.
    """
    try:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        return md(page.text)
    except Exception:
        print(f"Failed to fetch page text for {url}.")
        traceback.print_exc()
        return ""


def check_file_exists(file):
    """Tell whether the path ``file`` exists on disk."""
    path_is_present = os.path.exists(file)
    return path_is_present


def update_temp_file(temp_file, markdown):
    """Append ``markdown`` to ``temp_file``, creating the file on first use.

    Args:
        temp_file: Path of the file to append to. Missing parent folders
            are created.
        markdown: Text to append.
    """
    if not os.path.exists(temp_file):
        print(f"Temp file {temp_file} does not exist.")
        print(f"Creating temp file: {temp_file}")
        parent_dir = os.path.dirname(temp_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Append mode creates a missing file, so one write covers both cases.
    # The previous separate "w" write followed by this append stored the
    # first chunk twice.
    with open(temp_file, "a", encoding="utf-8") as f:
        f.write(markdown)


def crawl_links_and_get_markdown(links):
    """Fetch every link and return all pages as one markdown string.

    Each page is placed under a ``### PAGE: <link>`` heading. Every section
    is also appended to a uniquely named temp file under ``outputs/`` as
    soon as it is fetched.
    """
    temp_file = f"outputs/temp_{uuid4()}.md"
    print(f"Rolling write to temp file: {temp_file}")

    sections = []
    for url in links:
        print(f"Crawling link: {url}")
        page_body = fetch_page_markdown(url) + "\n"
        section = f"### PAGE: {url}\n" + page_body
        update_temp_file(temp_file, section)
        print(f"Temp file {temp_file} updated.")
        sections.append(section)

    return "".join(sections)


def company_deep_research(company: str) -> Markdown:
    """Research ``company`` by crawling search-result pages.

    A fixed set of questions is run through ``get_links_from_regular_search``;
    the resulting links are crawled with ``crawl_links_and_get_markdown`` and
    the combined markdown is turned into a report by
    ``create_reserach_report``, which also writes it under
    ``data/company_research/``.

    Returns:
        The generated report wrapped in ``Markdown``.
    """
    link_exploration_queries = [
        f"What is {company}?",
        f"Who are the founders of {company}?",
        f"Who are the competitors of {company}?",
        f"What is the latest news about {company}?",
        f"What are the top goals of {company}?",
        f"What is {company}'s business model?",
        f"What is {company} known for",
        f"Who are some of {company}'s customers?",
        f"What are some employee reviews of {company}?",
        f"Where is the headquarters of {company}?",
        f"What industry does {company} operate in?",
        f"What are some of the products of {company}?",
    ]

    all_links = []

    for query in link_exploration_queries:
        print(f"Exploring links for: {query}")
        all_links.extend(get_links_from_regular_search(query))

    # Different questions often return the same pages; crawl each URL once,
    # keeping first-seen order.
    unique_links = list(dict.fromkeys(all_links))

    raw_markdown = crawl_links_and_get_markdown(unique_links)
    unix_timestamp = math.floor(datetime.now().timestamp())
    research_report = create_reserach_report(
        company,
        raw_markdown,
        # Plain {unix_timestamp}: the previous "${…}" left a literal "$" in
        # the file name.
        output_file=f"data/company_research/{company}_research_report_{unix_timestamp}_naive.md",
    )
    # The report used to be built and then dropped; return it as annotated.
    return Markdown(research_report)

In [None]:
# Deep Research: 🚀

from IPython.display import display_markdown

company = "Databricks"

# Run the link-crawling research for `company` and pass its return value to
# display_markdown.
display_markdown(company_deep_research(company))