### USAJOBS API Script

This script contains the full API call to **USAJOBS** and the logic for generating labeled data used by the NLP classification model.

#### ⚠️ Setup Instructions Before Running

- **Update file paths**:  
  Use `Ctrl + F` to find and replace all instances of  
  `C://Users//...//` with your local directory path.

- **Insert your API credentials**:  
  In the `headers` dictionary, replace the placeholder value of `Authorization-Key` (`'authorization key'`) with your own key from [developer.usajobs.gov](https://developer.usajobs.gov).  
  Set `User-Agent` to the **email address** registered with that key (the placeholder is `email@email.com`).

This script prepares raw job data and labeled examples for training the data buyer classification model.


In [None]:
import requests
import json
import pandas as pd
from datetime import datetime

In [None]:
import requests
import json
import pandas as pd
from datetime import datetime

# Query the USAJOBS search API once per keyword, page through the results,
# de-duplicate postings by their MatchedObjectId and write them to a CSV.

# API setup. Replace the placeholder header values before running.
url = 'https://data.usajobs.gov/api/search'
headers = {
    'Host': 'data.usajobs.gov',
    'User-Agent': 'email@email.com',
    'Authorization-Key': 'authorization key'
}

# Without a timeout a stalled connection would hang the loop indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Column order of the output table; also guarantees the columns exist when
# no posting was retrieved at all.
JOB_COLUMNS = [
    'JobID', 'JobTitle', 'JobDescription', 'KeyDuties',
    'Department', 'Agency', 'SearchKeywords',
]

# List of keywords to search for
# ('acquisition' was previously misspelled as 'aquisition', so that search
# only found postings containing the same typo.)
keywords = ['data', 'contract', 'analyst', 'machine learning', 'marketing', 'acquisition', 'finance', 'security', 'tech', 'purchasing', 'statistics', 'math', 'data scientist', 'research', 'economist']

# Dictionary to collect all unique jobs, keyed by MatchedObjectId
all_jobs = {}

for keyword in keywords:
    print(f"\nSearching for keyword: {keyword}")
    params = {
        'Keyword': keyword,
        'ResultsPerPage': 500,
        'Page': 1
    }

    # Keep requesting pages until one comes back empty or a request fails.
    while True:
        try:
            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException as exc:
            # Treat a network failure like a non-200 reply: report it and
            # move on to the next keyword instead of aborting the whole run.
            print(f"Request failed for keyword '{keyword}' on page {params['Page']}: {exc}")
            break

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        data = response.json()
        jobs = data.get('SearchResult', {}).get('SearchResultItems', [])
        if not jobs:
            break

        for job in jobs:
            job_id = job.get('MatchedObjectId')
            descriptor = job.get('MatchedObjectDescriptor', {})
            details = descriptor.get('UserArea', {}).get('Details', {})

            if job_id not in all_jobs:
                # NOTE(review): 'Department' stores OrganizationName and
                # 'Agency' stores DepartmentName. The agency-size lists later
                # in this file compare 'Agency' against department-level
                # names, so the mapping is left as is - confirm it is intended.
                all_jobs[job_id] = {
                    'JobID': job_id,
                    'JobTitle': descriptor.get('PositionTitle'),
                    'JobDescription': details.get('JobSummary'),
                    'KeyDuties': details.get('MajorDuties', 'N/A'),
                    'Department': descriptor.get('OrganizationName'),
                    'Agency': descriptor.get('DepartmentName'),
                    'SearchKeywords': [keyword]  # First time seeing this job
                }
            elif keyword not in all_jobs[job_id]['SearchKeywords']:
                # Seen under an earlier keyword: just record the extra hit.
                all_jobs[job_id]['SearchKeywords'].append(keyword)

        print(f"Retrieved page {params['Page']} with {len(jobs)} jobs")
        params['Page'] += 1

# Convert to DataFrame (explicit columns keep this valid for zero results)
jobs_df = pd.DataFrame(list(all_jobs.values()), columns=JOB_COLUMNS)

# Convert list of keywords to comma-separated string
jobs_df['SearchKeywords'] = jobs_df['SearchKeywords'].apply(', '.join)

# Save to CSV
csv_file_path = 'C://Users//...//all_keyword_job_listings.csv'
jobs_df.to_csv(csv_file_path, index=False)

print(f"\nSaved {len(jobs_df)} unique job listings to {csv_file_path}")


In [None]:
# Reload the job listings from disk so the labeling cells below can run
# without repeating the API download.

# Path to the CSV file (same file name the download cell writes)
csv_file_path = 'C://Users//...//all_keyword_job_listings.csv'

# Read CSV into a DataFrame; `df` is the table every later cell annotates
df = pd.read_csv(csv_file_path)
# Print column names, non-null counts and dtypes as a quick sanity check
df.info()

In [None]:
import pandas as pd
import re

# Merge the summary and the duties into one lower-cased text field so a
# single search covers both; missing values become empty strings first.
summary_text = df['JobDescription'].fillna('')
duties_text = df['KeyDuties'].fillna('')
df['CombinedText'] = (summary_text + ' ' + duties_text).str.lower()

# Phrases that signal buying or sourcing data from outside parties
related_phrases = [
    "data acquisition",
    "data procurement",
    "procure data",
    "purchase data",
    "buy data",
    "acquiring data",
    "data sourcing",
    "data licensing",
    "external data acquisition",
    "third-party data",
    "data vendor",
    "data provider",
    "data contracts",
    "contracting data",
    "data subscriptions",
    "vendor management",
    "external data",
    "commercial data",
]

# One alternation regex; every phrase is escaped so it is matched literally
pattern = '|'.join(map(re.escape, related_phrases))

# Binary label: 1 when any phrase occurs in the combined text, otherwise 0
has_related_phrase = df['CombinedText'].str.contains(pattern, case=False, na=False)
df['IsDataBuyer'] = has_related_phrase.astype(int)

# Report how many postings fall on each side of the label
print("Label distribution (IsDataBuyer):")
print(df['IsDataBuyer'].value_counts())

# Keep only the postings flagged by the phrase search
matches = df.loc[df['IsDataBuyer'].eq(1)]
print(f"\nFound {len(matches)} job(s) mentioning data acquisition or purchasing concepts.\n")

# Show a compact view, silently skipping any column that is not present
columns_to_show = ['JobID', 'JobTitle', 'Department', 'Agency', 'SearchKeywords']
visible_columns = [col for col in columns_to_show if col in matches.columns]
display(matches[visible_columns])


In [None]:
# Recompute the regex-based label: 1 when `pattern` is found in CombinedText,
# 0 otherwise (missing text counts as "not found").
# NOTE(review): `pattern` is a notebook-global; re-running this cell after any
# cell that rebinds it would silently compute the label from a different
# regex - confirm the intended run order.
df['IsDataBuyer'] = df['CombinedText'].str.contains(pattern, case=False, na=False).astype(int)


In [None]:
from rapidfuzz import fuzz, process

# Phrases used for fuzzy matching against the combined job text.
# A bare "data" entry is deliberately NOT part of this list: with partial
# (substring-style) fuzzy matching it scores a perfect match on any text that
# merely contains the word, which would flag almost every posting.
signal_phrases = [
    # Existing
    "data acquisition", "data procurement", "procure data", "purchase data",
    "buy data", "acquiring data", "data sourcing", "data licensing",
    "external data", "third-party data", "data vendor",
    "data provider", "data contracts", "contracting data", "data subscriptions",
    "vendor management", "commercial data",

    # New additions
    "data assets", "data commercialization", "procurement of data",
    "external data sources", "data aggregators", "data monetization",
    "sourcing external data", "partner data", "data purchasing agreements",
    "data ingestion", "subscription data", "data acquisition strategy",
    "data buying", "external datasets", "external partnerships", "data sharing agreements",
    "data acquisition channels", "third-party data sources", "sourcing data providers",
    "managing data vendors", "data reseller", "external data vendors", "contracted data"
]


# Function to check if any phrase matches fuzzily above a threshold
def fuzzy_match_phrases(text, phrases, threshold=95):
    """Return the first phrase that fuzzily appears in ``text``, else ``None``.

    Each phrase is scored against the text with ``fuzz.partial_ratio`` after
    lower-casing both sides. Phrases are tried in the order given, so the
    result is the first phrase reaching ``threshold``, not necessarily the
    best-scoring one.

    Args:
        text: Text to search. Non-string values (for example NaN from a
            missing cell) and blank strings are treated as "no match".
        phrases: Iterable of candidate phrases.
        threshold: Minimum partial-ratio score that counts as a match.

    Returns:
        The matching phrase exactly as it appears in ``phrases``, or ``None``
        when nothing reaches the threshold.
    """
    # partial_ratio aligns the shorter string inside the longer one, so
    # whitespace-only text can score as a perfect match against any phrase
    # that contains a space; rule that out (and non-strings) up front.
    if not isinstance(text, str) or not text.strip():
        return None

    haystack = text.lower()  # lower-case once instead of once per phrase
    for phrase in phrases:
        if fuzz.partial_ratio(phrase.lower(), haystack) >= threshold:
            return phrase  # Return the matching phrase
    return None

# Run the fuzzy phrase search over every posting with a looser threshold of
# 80; the extra positional and keyword arguments are forwarded by apply().
df['FuzzyMatchedPhrase'] = df['CombinedText'].apply(
    fuzzy_match_phrases,
    args=(signal_phrases,),
    threshold=80,
)

# 1 when some phrase matched, 0 when the search returned nothing
df['IsFuzzyMatch'] = df['FuzzyMatchedPhrase'].notna().astype(int)

# Show the postings that matched together with the phrase that triggered it
fuzzy_matches = df.loc[df['IsFuzzyMatch'].eq(1)]
fuzzy_columns = ['JobID', 'JobTitle', 'FuzzyMatchedPhrase', 'Department', 'Agency']
display(fuzzy_matches[fuzzy_columns])


In [None]:
# A posting is a likely data buyer when either the exact-phrase label or the
# fuzzy label is set.
indicator_columns = ['IsDataBuyer', 'IsFuzzyMatch']
df['IsLikelyDataBuyer'] = df[indicator_columns].eq(1).any(axis=1).astype(int)

# Inspect every posting that carries the combined label
combined_matches = df.loc[df['IsLikelyDataBuyer'].eq(1)]
combined_columns = [
    'JobID', 'JobTitle', 'FuzzyMatchedPhrase',
    'IsDataBuyer', 'IsFuzzyMatch', 'Department', 'Agency',
]
display(combined_matches[combined_columns])


In [None]:
# Hand-curated size buckets, compared against the 'Agency' column.
large_agencies = [
    "Department of Defense", "Department of Veterans Affairs", "Department of the Treasury",
    "Department of Homeland Security", "Department of Health and Human Services",
    "Department of Justice", "Department of the Army"
]

medium_agencies = [
    "Department of Transportation", "Department of Commerce", "Department of Agriculture",
    "Department of Energy", "Department of the Interior", "National Aeronautics and Space Administration"
]

# Any agency not listed above will be considered "Small"
def classify_agency_size(agency):
    """Bucket an agency name into 'Large', 'Medium', 'Small' or 'Unknown'.

    Args:
        agency: Agency name. Surrounding whitespace is ignored.

    Returns:
        'Large' or 'Medium' when the name is in the matching list, 'Small'
        for any other name, and 'Unknown' when no name is available (a
        non-string value such as NaN/None, or a blank string).
    """
    # A missing agency is not evidence of a small one; report it separately
    # instead of letting it fall through to 'Small'.
    if not isinstance(agency, str) or not agency.strip():
        return 'Unknown'

    name = agency.strip()
    if name in large_agencies:
        return 'Large'
    if name in medium_agencies:
        return 'Medium'
    return 'Small'

# Label every posting with the size bucket of its agency
df['AgencySize'] = df['Agency'].map(classify_agency_size)


In [None]:
# Ordered (label, substrings) rules. The first rule with a hit wins, so the
# order below is the tie-break when a posting mentions several industries.
_INDUSTRY_RULES = (
    ('Finance', ('finance', 'financial', 'account', 'budget')),
    ('Marketing', ('marketing', 'communications', 'advertising')),
    ('Medical', ('medical', 'pharmacy', 'nurse', 'health', 'clinical')),
    ('Security/Tech', ('cyber', 'security', 'information technology', 'data scientist', 'software', 'tech')),
    ('Policy', ('policy', 'regulation', 'legislative', 'analyst', 'compliance')),
)

# "it" must be matched as a whole word: as a plain substring it also hits
# "writer", "auditor", "unit", ... and sends nearly every posting that
# reaches that rule to Security/Tech.
_IT_WORD = re.compile(r'\bit\b')


# Define function to classify each job
def classify_industry(row):
    """Assign a coarse industry label to one job posting.

    Args:
        row: Mapping-like row providing 'JobTitle', 'Department' and
            'SearchKeywords'.

    Returns:
        One of 'Finance', 'Marketing', 'Medical', 'Security/Tech', 'Policy'
        or 'Other', decided by the first rule whose keyword appears in the
        lower-cased concatenation of the three fields.
    """
    text = f"{row['JobTitle']} {row['Department']} {row['SearchKeywords']}".lower()

    for label, terms in _INDUSTRY_RULES:
        if any(term in text for term in terms):
            return label
        # The standalone word "it" belongs to the tech rule only.
        if label == 'Security/Tech' and _IT_WORD.search(text):
            return label
    return 'Other'

# Apply classification
# axis=1 passes each row to classify_industry, which reads several columns
# of the same posting to decide the label.
df['Industry'] = df.apply(classify_industry, axis=1)


In [None]:
# Flag postings whose title contains a seniority marker.
# The markers are joined into one regex alternation and searched as plain
# substrings of the lower-cased title; a missing title yields False.
senior_keywords = ['senior', 'lead', 'chief', 'principal', 'director', 'head']
senior_pattern = '|'.join(senior_keywords)

lowercase_titles = df['JobTitle'].str.lower()
df['IsSeniorRole'] = lowercase_titles.str.contains(senior_pattern, na=False)


In [None]:
# 1. Define what counts as a data-related job
data_keywords = ['data', 'analyst', 'scientist', 'analytics', 'it', 'information', 'statistician', 'intelligence']

# Keywords of one or two letters (here "it") are acronyms and must match as
# whole words; as a bare substring "it" also hits "Security", "Auditor",
# "Unit", ... and flags unrelated titles. Longer keywords stay substring
# matches so that e.g. "analyst" still matches "analysts".
data_title_pattern = '|'.join(
    rf'\b{kw}\b' if len(kw) <= 2 else kw
    for kw in data_keywords
)

# 2. Create flag for explicitly data-related job titles
#    (1 when the lower-cased title matches, 0 otherwise or when it is missing)
df['IsExplicitDataJob'] = df['JobTitle'].str.lower().str.contains(data_title_pattern, na=False).astype(int)


In [None]:
# Step 1: Define use case keywords
use_case_keywords = {
    'Fraud': ['fraud', 'eligibility', 'verification', 'audit', 'compliance'],
    'Sentiment': ['sentiment', 'public opinion', 'media monitoring', 'engagement', 'communication'],
    'PatientMatching': ['patient match', 'interoperability', 'record linkage', 'ehr', 'health record'],
    'AdTargeting': ['audience segmentation', 'targeting', 'ad performance', 'campaign data']
}

# Step 2: Create use case flags
# One UseCase_<name> column per entry: 1 when any of its keywords occurs in
# the combined text, 0 otherwise.
# The loop variables get their own names on purpose: `keywords` (the search
# keyword list) and `pattern` (the data-buyer regex) are notebook-globals
# used by other cells and must not be overwritten here.
for use_case, use_case_terms in use_case_keywords.items():
    use_case_pattern = '|'.join(use_case_terms)
    df[f'UseCase_{use_case}'] = df['CombinedText'].str.lower().str.contains(use_case_pattern, na=False).astype(int)


In [None]:
# Replace missing category labels with an explicit fallback per column
category_defaults = {'Industry': 'Other', 'AgencySize': 'Unknown'}
for column_name, fallback in category_defaults.items():
    df[column_name] = df[column_name].fillna(fallback)

In [None]:
# Your reference generalist titles
generalist_titles = [
    'Contract Specialist',
    'Grants Officer',
    'Grants Specialist',
    'Budget Officer',
    'Administrative Officer',
    'Operations Coordinator',
    'Program Coordinator',
    'Project Coordinator',
    'Procurement Specialist',
    'Procurement Analyst',
    'Communications Specialist',
    'Public Affairs Officer',
    'Public Information Officer',
    'Community Outreach Coordinator',
    'Health IT Coordinator',
    'Program Specialist',
    'Program Manager',
    'Business Operations Specialist'
]

# Lower-cased once so the comparison does not depend on how a posting
# capitalises its title (e.g. "CONTRACT SPECIALIST").
_generalist_titles_lower = [reference.lower() for reference in generalist_titles]


# Function to determine if a job title is a generalist (fuzzy matched)
def is_generalist(title, threshold=65):
    """Return True when ``title`` fuzzily matches a reference generalist title.

    The title is compared case-insensitively against every entry of
    ``generalist_titles`` with ``fuzz.partial_ratio``; the best score decides.

    Args:
        title: Job title to test. Non-string values (e.g. NaN/None) and
            blank strings are never considered generalist.
        threshold: Minimum best score required to count as a match.

    Returns:
        bool: whether the best score is at least ``threshold``.
    """
    if not isinstance(title, str) or not title.strip():
        return False

    # extractOne yields (choice, score, index); only the score is needed.
    _, score, _ = process.extractOne(
        title.lower(), _generalist_titles_lower, scorer=fuzz.partial_ratio
    )
    return score >= threshold

# Apply the fuzzy matching to classify generalist roles.
# A missing title is reported as False directly: converting it with str()
# would produce the literal text 'nan', which would then be fuzzy-matched
# like a real title and can clear the loose default threshold.
df['IsGeneralistRole'] = df['JobTitle'].apply(
    lambda title: False if pd.isna(title) else is_generalist(str(title))
)


In [None]:
# Write the fully annotated table (every posting plus all label columns)
# to disk without the index column, then confirm on the console.
labeled_csv_path = 'C://Users//...//all_jobs_with_data_buyer_labels.csv'
df.to_csv(labeled_csv_path, index=False)
print("Saved full job listings with data buyer indicators to: all_jobs_with_data_buyer_labels.csv")
