In [156]:

import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
# The input file carries a serialized row index as an "Unnamed: 0" column (it
# shows up in df.head() further down). Drop it on load so it is not carried
# into the annotated output; errors="ignore" keeps this working for files that
# do not have that column.
df = pd.read_csv("cleaned_data.csv").drop(columns=["Unnamed: 0"], errors="ignore")

In [157]:
# Load variables from the .env file; override=True lets values from .env
# replace ones already present in the process environment.
load_dotenv(override=True)
api_key = os.getenv("OPENAI_API_KEY")
# os.getenv returns None when the variable is unset, so test for presence
# before inspecting the prefix (slicing None raised a TypeError here).
if api_key and api_key.startswith("sk-proj-"):
    print("Valid API key")
else:
    # Report the problem now instead of failing later on the first API call.
    print("OPENAI_API_KEY is missing or does not start with 'sk-proj-'")

Valid API key


In [158]:
# Shared API client used by both call_llm_for_tags and call_llm_for_annotation.
# NOTE: `openai` here is a client *instance*, not the openai package itself.
# No api_key argument is passed; presumably the client picks up OPENAI_API_KEY
# from the environment loaded above — confirm against the OpenAI client docs.
openai=OpenAI()

In [159]:
# System prompt for the label-discovery step. call_llm_for_tags sends it as the
# "system" message; it asks the model for exactly 4 comma-separated extraction
# labels with no extra text, which is the format call_llm_for_tags parses.
tag_generation_prompt = """You are a data extraction specialist creating annotation schemas for machine learning datasets.

Analyze the provided text content and generate 4 specific extraction labels that capture concrete, factual information present in the text.

REQUIREMENTS:
- Labels must extract specific entities, names, terms, or measurable data
- Avoid generic business categories or analytical interpretations
- Focus on information that would be valuable for training ML models
- Each label should target different types of factual content

GOOD LABEL EXAMPLES:
- "Company Names Mentioned" (extracts: "Tesla, SpaceX, Neuralink")
- "Specific Technologies" (extracts: "machine learning, blockchain, quantum computing") 
- "Funding Amounts" (extracts: "$10M Series A, $50M funding round")
- "Geographic Locations" (extracts: "Silicon Valley, New York, London")
- "Job Titles Referenced" (extracts: "CEO, Data Scientist, Product Manager")
- "Product Features" (extracts: "real-time analytics, automated reporting")

BAD LABEL EXAMPLES:
- "Market Trends" (too analytical)
- "Industry Analysis" (too broad)
- "Future Outlook" (subjective interpretation)
- "General Insights" (vague)

DOMAIN-SPECIFIC GUIDANCE:
- For startup/business content: Focus on company names, funding, locations, technologies
- For technical content: Emphasize specific tools, methodologies, metrics, standards
- For industry reports: Target companies, market sizes, growth rates, key players
- For job-related content: Extract skills, roles, companies, requirements

OUTPUT FORMAT:
Return exactly 4 labels as comma-separated strings with no additional text, explanations, or formatting.

Example output: "Startup Names, Funding Rounds, Technology Stack, Geographic Markets"
"""


In [160]:
def _as_prompt_text(text):
    """Return `text` as one plain string suitable for embedding in a prompt."""
    if isinstance(text, str):
        return text
    try:
        documents = list(text)
    except TypeError:
        # Not iterable (e.g. a number or None): fall back to its string form.
        return str(text)
    # Join documents explicitly. Interpolating a pandas Series or list directly
    # into an f-string would embed its repr (index labels, truncated values)
    # instead of the documents themselves.
    return "\n\n".join(str(document) for document in documents)


def call_llm_for_tags(text):
    """Ask the chat model for annotation labels that fit `text`.

    Args:
        text: A single string, or an iterable of documents (for example a list
            or a pandas Series of strings). Iterables are joined with blank
            lines so the model sees the full documents.

    Returns:
        list[str]: The labels parsed from the model's comma-separated reply,
        with surrounding quotes/brackets/whitespace removed and empty entries
        dropped. Empty when the reply has no text content.
    """
    body = _as_prompt_text(text)
    # The system prompt demands exactly 4 labels; ask for the same number here
    # so the two messages do not contradict each other.
    prompt = f"Identify 4 relevant annotation labels for the domain from this text:\n\n{body}"
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"system","content":tag_generation_prompt},
            {"role":"user","content":prompt}]
    )
    # message.content can be None; treat that as an empty reply rather than
    # crashing on .strip().
    raw_response = (response.choices[0].message.content or "").strip()
    # The model sometimes wraps the whole list in quotes or brackets.
    raw_response = raw_response.strip('[]"\'')
    tags = [tag.strip(' "\'') for tag in raw_response.split(',')]
    # Drop empty labels (e.g. from a trailing comma) so they never become
    # empty column names downstream.
    return [tag for tag in tags if tag]

In [161]:
# System prompt for the extraction step. call_llm_for_annotation sends it as
# the "system" message together with one label and one document.
# Two fixes relative to the earlier wording: the extraction principles are now
# numbered 1-5 without a duplicate "3.", and rule 5 pins down a single
# sentinel for "nothing found" so results do not mix blanks and ad-hoc phrases.
content_annotation_prompt = """You are a precision data extractor for machine learning training datasets.

Given a text and a single extraction label, identify and extract the exact factual information requested by that label from the content. Return concrete, specific values found in the text.

EXTRACTION PRINCIPLES:
1. ONLY extract information explicitly mentioned in the text
2. Use exact names, terms, and phrases from the source material
3. If multiple items exist for the label, separate them with commas
4. Prioritize proper nouns, specific numbers, and concrete terms over generic descriptions
5. If the text contains nothing for the label, return exactly: No data found

FORMATTING RULES:
- Return only the extracted value(s) with no label prefix
- Keep extracted values concise but specific
- Maintain original capitalization from source text
- No explanations, headers, or additional formatting

EXTRACTION EXAMPLES:

Text: "Tesla raised $2B in Series C funding to expand manufacturing in Texas and California"
Label: "Company Names"
Output: "Tesla"

Label: "Funding Amounts"
Output: "$2B Series C"

Label: "Geographic Locations"
Output: "Texas, California"

Text: "The startup uses React, Node.js, and MongoDB to build real-time analytics dashboards"
Label: "Technology Stack"
Output: "React, Node.js, MongoDB"

Label: "Product Features"
Output: "real-time analytics dashboards"


QUALITY CHECKS:
- Verify each extracted value appears in the original text
- Avoid paraphrasing - use exact wording when possible
- Don't infer information not explicitly stated
- Don't add context or explanations
- Don't use generic business terms unless they appear in the text

Remember: Extract facts, not interpretations. Return only the specific information requested by the single label provided."""

In [162]:
from io import StringIO


def call_llm_for_annotation(text, tags):
    """Extract the values for one label from `text` using the chat model.

    Args:
        text: The document to extract from.
        tags: The extraction label to apply. The plural name is kept for
            caller compatibility, but the system prompt is written for a
            single label per call.

    Returns:
        str: The model's reply with surrounding whitespace removed, or "" when
        the reply carries no text content.
    """
    # The system prompt and its worked examples name this field "Label", so
    # the user message uses the same word to avoid ambiguity for the model.
    user_prompt = f"Label: {tags}\n\nText:\n{text}"
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role":"system","content":content_annotation_prompt},
            {"role":"user","content":user_prompt}
        ]
    )
    # message.content can be None; return an empty string instead of
    # crashing on .strip().
    content = response.choices[0].message.content
    return content.strip() if content else ""
    


In [166]:
# Build the label-discovery sample from the first 10 documents. They are
# joined into one string explicitly: passing the Series itself would embed its
# printed repr (row numbers plus truncated snippets) in the prompt. Non-string
# cells such as NaN are skipped.
tags = call_llm_for_tags(
    "\n\n".join(doc for doc in df.iloc[0:10]['cleaned_text'] if isinstance(doc, str))
)

In [167]:
# Show the generated labels; each one is used as a new column name in the
# annotation loop that follows.
tags

['Company Names Mentioned',
 'Market Size Estimates',
 'Industry Reports',
 'Economic Trends']

In [168]:
# Add one column per generated label: every document in 'cleaned_text' is sent
# to the model together with that label, and the reply is stored in a column
# named after the label. This issues one API call per (row, label) pair.
source_texts = df['cleaned_text']
for label in tags:
    df[label] = source_texts.apply(
        lambda document: call_llm_for_annotation(document, label)
    )

In [169]:
df.head()

Unnamed: 0,url,title,cleaned_text,Company Names Mentioned,Market Size Estimates,Industry Reports,Economic Trends
0,https://enterpriseleague.com/blog/engineering-...,26 engineering startups that can change the wo...,"During the past twenty years, significant chan...","Virtual Facility, nTopology, Arch Engineers, C...",,"Virtual Facility, nTopology, Arch Engineers, C...","Virtual Facility, nTopology, Arch Engineers, C..."
1,https://www.pwc.com/us/en/industries/industria...,Engineering and construction industry trends: ...,The engineering and construction (E&C) sector ...,"E&C, Donald J. Trump, Inflation Reduction Act ...","$10.2 trillion, $15.2 trillion","$10.2 trillion, $15.2 trillion, 2020, 2030, In...","US$10.2 trillion, $15.2 trillion, 2020, 2030"
2,https://www.mordorintelligence.com/industry-re...,"US Engineering Services Industry - Size, Trend...",The United States Engineering Services Market ...,"United States Engineering Services Market, Inf...","USD 175.21 billion in 2025, USD 271.01 billion...","USD 175.21 billion, USD 271.01 billion, 9.12%,...","USD 175.21 billion, USD 271.01 billion, 9.12%,..."
3,https://online-engineering.case.edu/blog/innov...,Innovative Engineering Practices Driving Indus...,Engineers spearhead new developments that are ...,Case Western Reserve University,,"Mechanical Engineering, Industry 4.0, Internet...","Industry 4.0, Internet of Things (IoT), cloud ..."
4,https://www.forbes.com/sites/haniyarae/2025/03...,Meet America’s Best Startup Employers 2025,Inside the lab at Lunar Energy’s headquarters ...,"Lunar Energy, Cribl, Wiz, Hightouch, Anthropic",No data found,"Lunar Energy, 2020, California, London, No. 4,...","Lunar Energy, 2020, California, London, No. 4,..."


In [174]:
# Distinct values in 'Company Names Mentioned' (as counted by nunique()) as a
# percentage of the row count. Note this measures uniqueness of the extracted
# values, not their correctness.
dataquality = df['Company Names Mentioned'].nunique() / df.shape[0] * 100
dataquality

80.0

In [175]:
# Persist the annotated frame (original columns plus one column per label).
# index=False keeps the DataFrame's row index out of the written file.
df.to_csv("annotated_data.csv", index=False)