In [13]:
import requests
import json
import pandas as pd
from datetime import datetime

# API setup
url = 'https://data.usajobs.gov/api/search'
headers = {
    'Host': 'data.usajobs.gov',
    'User-Agent': 'email',
    'Authorization-Key': '...'
}

# List of keywords to search for
keywords = ['data', 'contract', 'analyst', 'machine learning', 'marketing', 'aquisition', 'finance', 'security','tech', 'purchasing', 'statistics', 'math', 'data scientist', 'research', 'economist']

# Dictionary to collect all unique jobs
all_jobs = {}

for keyword in keywords:
    print(f"\nSearching for keyword: {keyword}")
    params = {
        'Keyword': keyword,
        'ResultsPerPage': 500,
        'Page': 1
    }

    while True:
        response = requests.get(url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        data = response.json()
        jobs = data.get('SearchResult', {}).get('SearchResultItems', [])
        if not jobs:
            break

        for job in jobs:
            job_id = job.get('MatchedObjectId')
            descriptor = job.get('MatchedObjectDescriptor', {})
            details = descriptor.get('UserArea', {}).get('Details', {})

            if job_id not in all_jobs:
                all_jobs[job_id] = {
                    'JobID': job_id,
                    'JobTitle': descriptor.get('PositionTitle'),
                    'JobDescription': details.get('JobSummary'),
                    'KeyDuties': details.get('MajorDuties', 'N/A'),
                    'Department': descriptor.get('OrganizationName'),
                    'Agency': descriptor.get('DepartmentName'),
                    'SearchKeywords': [keyword]  # First time seeing this job
                }
            else:
                if keyword not in all_jobs[job_id]['SearchKeywords']:
                    all_jobs[job_id]['SearchKeywords'].append(keyword)

        print(f"Retrieved page {params['Page']} with {len(jobs)} jobs")
        params['Page'] += 1

# Convert to DataFrame
jobs_df = pd.DataFrame(list(all_jobs.values()))

# Convert list of keywords to comma-separated string
jobs_df['SearchKeywords'] = jobs_df['SearchKeywords'].apply(lambda x: ', '.join(x))

# Save to CSV
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_file_path = f'C://Users//...all_keyword_job_listings.csv'
jobs_df.to_csv(csv_file_path, index=False)

print(f"\nSaved {len(jobs_df)} unique job listings to {csv_file_path}")



Searching for keyword: data
Retrieved page 1 with 500 jobs
Retrieved page 2 with 500 jobs
Retrieved page 3 with 500 jobs
Retrieved page 4 with 500 jobs
Retrieved page 5 with 500 jobs
Retrieved page 6 with 489 jobs

Searching for keyword: contract
Retrieved page 1 with 500 jobs
Retrieved page 2 with 500 jobs
Retrieved page 3 with 77 jobs

Searching for keyword: analyst
Retrieved page 1 with 140 jobs

Searching for keyword: machine learning
Retrieved page 1 with 8 jobs

Searching for keyword: marketing
Retrieved page 1 with 85 jobs

Searching for keyword: aquisition
Retrieved page 1 with 3 jobs

Searching for keyword: finance
Retrieved page 1 with 97 jobs

Searching for keyword: security
Retrieved page 1 with 500 jobs
Retrieved page 2 with 500 jobs
Retrieved page 3 with 500 jobs
Retrieved page 4 with 218 jobs

Searching for keyword: tech
Retrieved page 1 with 500 jobs
Retrieved page 2 with 475 jobs

Searching for keyword: purchasing
Retrieved page 1 with 83 jobs

Searching for keyword: 

In [22]:
# Path to the CSV file
csv_file_path = 'C://Users//...all_keyword_job_listings.csv'

# Read CSV into a DataFrame
df = pd.read_csv(csv_file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5829 entries, 0 to 5828
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   JobID           5829 non-null   int64 
 1   JobTitle        5829 non-null   object
 2   JobDescription  5827 non-null   object
 3   KeyDuties       5829 non-null   object
 4   Department      5829 non-null   object
 5   Agency          5829 non-null   object
 6   SearchKeywords  5829 non-null   object
dtypes: int64(1), object(6)
memory usage: 318.9+ KB


In [23]:
import pandas as pd
import re

# Combine JobDescription and KeyDuties into one text field for analysis
df['CombinedText'] = (df['JobDescription'].fillna('') + ' ' + df['KeyDuties'].fillna('')).str.lower()

# Define phrases related to data purchasing/acquisition
related_phrases = [
    "data acquisition", "data procurement", "procure data", "purchase data",
    "buy data", "acquiring data", "data sourcing", "data licensing", 
    "external data acquisition", "third-party data", "data vendor", 
    "data provider", "data contracts", "contracting data", "data subscriptions",
    "vendor management", "external data", "commercial data"
]

# Build regex pattern
pattern = '|'.join([re.escape(phrase) for phrase in related_phrases])

# Create binary column: 1 if any phrase found, 0 otherwise
df['IsDataBuyer'] = df['CombinedText'].str.contains(pattern, case=False, na=False).astype(int)

# Show label distribution
print("Label distribution (IsDataBuyer):")
print(df['IsDataBuyer'].value_counts())

# Display jobs that mention relevant phrases
matches = df[df['IsDataBuyer'] == 1]
print(f"\nFound {len(matches)} job(s) mentioning data acquisition or purchasing concepts.\n")

# Display only selected columns (check for existence)
columns_to_show = ['JobID', 'JobTitle', 'Department', 'Agency', 'SearchKeywords']
display(matches[[col for col in columns_to_show if col in matches.columns]])


Label distribution (IsDataBuyer):
IsDataBuyer
0    5817
1      12
Name: count, dtype: int64

Found 12 job(s) mentioning data acquisition or purchasing concepts.



Unnamed: 0,JobID,JobTitle,Department,Agency,SearchKeywords
188,820113300,ELECTRICAL ENGINEER,U.S. Army Corps of Engineers,Department of the Army,"data, security"
518,834502500,School Support Assistant,Department of Defense Education Activity,Department of Defense,"data, security, tech"
541,818600200,Operations Research Analyst (PUBLIC NOTICE),Department of the Air Force - Agency Wide,Department of the Air Force,"data, analyst, statistics, research"
864,834408100,Diagnostic Radiologic Technologist (MRI),Veterans Health Administration,Department of Veterans Affairs,"data, tech"
1416,832159000,Electronics Engineer - DIRECT HIRE,U.S. Army Corps of Engineers,Department of the Army,data
1646,834390000,Medical Instrument Technician (Polysomnography),Veterans Health Administration,Department of Veterans Affairs,"data, tech"
1647,834390100,Medical Instrument Technician (Polysomnography),Veterans Health Administration,Department of Veterans Affairs,"data, tech"
1937,742011400,OCA-ELECTRONIC MEASUREMENT EQUIPMENT MECHANIC,Air National Guard Units,Department of the Air Force,data
2751,833950900,Medical Instrument Technologist (Diagnostic Ul...,Veterans Health Administration,Department of Veterans Affairs,"data, tech"
2938,833950800,Medical Instrument Technologist (Diagnostic Ul...,Veterans Health Administration,Department of Veterans Affairs,"data, tech"


In [24]:
df['IsDataBuyer'] = df['CombinedText'].str.contains(pattern, case=False, na=False).astype(int)
print(df['IsDataBuyer'].value_counts())


IsDataBuyer
0    5817
1      12
Name: count, dtype: int64


In [33]:
from rapidfuzz import fuzz, process

# Define your list of signal phrases again
signal_phrases = [
    # Existing
    "data acquisition", "data procurement", "procure data", "purchase data",
    "buy data", "acquiring data", "data sourcing", "data licensing", 
    "external data", "third-party data", "data vendor", 
    "data provider", "data contracts", "contracting data", "data subscriptions",
    "vendor management", "commercial data",

    # New additions
    "data assets", "data commercialization", "procurement of data", "licensed data",
    "external data sources", "data aggregators", "data monetization",
    "sourcing external data", "partner data", "data purchasing agreements",
    "data ingestion", "subscription data", "data acquisition strategy",
    "data buying", "external datasets", "external partnerships", "data sharing agreements",
    "data acquisition channels", "third-party data sources", "sourcing data providers",
    "managing data vendors", "data reseller", "external data vendors", "contracted data"
]


# Function to check if any phrase matches fuzzily above a threshold
def fuzzy_match_phrases(text, phrases, threshold=85):
    for phrase in phrases:
        score = fuzz.partial_ratio(phrase.lower(), text.lower())
        if score >= threshold:
            return phrase  # Return the matching phrase
    return None

# Apply fuzzy matching
df['FuzzyMatchedPhrase'] = df['CombinedText'].apply(lambda x: fuzzy_match_phrases(x, signal_phrases, threshold=85))
df['IsFuzzyMatch'] = df['FuzzyMatchedPhrase'].notnull().astype(int)

# Show counts
print("Fuzzy match summary:")
print(df['IsFuzzyMatch'].value_counts())

# Filter and display fuzzy matches
fuzzy_matches = df[df['IsFuzzyMatch'] == 1]
display(fuzzy_matches[['JobID', 'JobTitle', 'FuzzyMatchedPhrase', 'Department', 'Agency']])


Fuzzy match summary:
IsFuzzyMatch
0    4891
1     938
Name: count, dtype: int64


Unnamed: 0,JobID,JobTitle,FuzzyMatchedPhrase,Department,Agency
17,834079500,Supervisory Medical Records Technician (Coder),contracted data,Veterans Health Administration,Department of Veterans Affairs
18,834079600,Supervisory Medical Records Technician (Coder),contracted data,Veterans Health Administration,Department of Veterans Affairs
19,834497700,Biological Science Technician,data buying,U.S. Fish and Wildlife Service,Department of the Interior
21,834475200,Lead Medical Records Technician (Coder),data provider,Veterans Health Administration,Department of Veterans Affairs
24,834132200,Supervisory Medical Records Technician Coder- ...,vendor management,Veterans Health Administration,Department of Veterans Affairs
...,...,...,...,...,...
5811,824413600,Physician Assistant,data provider,Veterans Health Administration,Department of Veterans Affairs
5812,824434800,Physician Assistant,data provider,Veterans Health Administration,Department of Veterans Affairs
5813,824435600,Physician Assistant,data provider,Veterans Health Administration,Department of Veterans Affairs
5817,833667200,Air Conditioning Equipment Mechanic,vendor management,Veterans Health Administration,Department of Veterans Affairs


In [34]:
# Combine both binary indicators into a single label
df['IsLikelyDataBuyer'] = ((df['IsDataBuyer'] == 1) | (df['IsFuzzyMatch'] == 1)).astype(int)

# View label distribution
print("Combined label distribution (IsLikelyDataBuyer):")
print(df['IsLikelyDataBuyer'].value_counts())

# View matching jobs
combined_matches = df[df['IsLikelyDataBuyer'] == 1]
display(combined_matches[['JobID', 'JobTitle', 'FuzzyMatchedPhrase', 'IsDataBuyer', 'IsFuzzyMatch', 'Department', 'Agency']])


Combined label distribution (IsLikelyDataBuyer):
IsLikelyDataBuyer
0    4891
1     938
Name: count, dtype: int64


Unnamed: 0,JobID,JobTitle,FuzzyMatchedPhrase,IsDataBuyer,IsFuzzyMatch,Department,Agency
17,834079500,Supervisory Medical Records Technician (Coder),contracted data,0,1,Veterans Health Administration,Department of Veterans Affairs
18,834079600,Supervisory Medical Records Technician (Coder),contracted data,0,1,Veterans Health Administration,Department of Veterans Affairs
19,834497700,Biological Science Technician,data buying,0,1,U.S. Fish and Wildlife Service,Department of the Interior
21,834475200,Lead Medical Records Technician (Coder),data provider,0,1,Veterans Health Administration,Department of Veterans Affairs
24,834132200,Supervisory Medical Records Technician Coder- ...,vendor management,0,1,Veterans Health Administration,Department of Veterans Affairs
...,...,...,...,...,...,...,...
5811,824413600,Physician Assistant,data provider,0,1,Veterans Health Administration,Department of Veterans Affairs
5812,824434800,Physician Assistant,data provider,0,1,Veterans Health Administration,Department of Veterans Affairs
5813,824435600,Physician Assistant,data provider,0,1,Veterans Health Administration,Department of Veterans Affairs
5817,833667200,Air Conditioning Equipment Mechanic,vendor management,0,1,Veterans Health Administration,Department of Veterans Affairs


In [35]:
# Save combined matches to CSV
combined_matches.to_csv('C://Users//...likely_data_buyer_jobs.csv', index=False)
print("Saved combined likely data buyer jobs to: likely_data_buyer_jobs.csv")


Saved combined likely data buyer jobs to: likely_data_buyer_jobs.csv


In [37]:
large_agencies = [
    "Department of Defense", "Department of Veterans Affairs", "Department of the Treasury",
    "Department of Homeland Security", "Department of Health and Human Services",
    "Department of Justice", "Department of the Army"
]

medium_agencies = [
    "Department of Transportation", "Department of Commerce", "Department of Agriculture",
    "Department of Energy", "Department of the Interior", "National Aeronautics and Space Administration"
]

# Any agency not listed above will be considered "Small"
def classify_agency_size(agency):
    if agency in large_agencies:
        return 'Large'
    elif agency in medium_agencies:
        return 'Medium'
    else:
        return 'Small'

# Apply classification
df['AgencySize'] = df['Agency'].apply(classify_agency_size)


In [38]:
# Save full annotated DataFrame to CSV (includes all jobs + labels)
df.to_csv('C://Users//...all_jobs_with_data_buyer_labels.csv', index=False)
print("Saved full job listings with data buyer indicators to: all_jobs_with_data_buyer_labels.csv")


Saved full job listings with data buyer indicators to: all_jobs_with_data_buyer_labels.csv


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5829 entries, 0 to 5828
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   JobID               5829 non-null   int64  
 1   JobTitle            5829 non-null   object 
 2   JobDescription      5827 non-null   object 
 3   KeyDuties           5829 non-null   object 
 4   Department          5829 non-null   object 
 5   Agency              5829 non-null   object 
 6   SearchKeywords      5829 non-null   object 
 7   CombinedText        5829 non-null   object 
 8   IsDataBuyer         5829 non-null   int32  
 9   DataBuyerScore      5829 non-null   float64
 10  PredictedDataBuyer  5829 non-null   int32  
 11  FuzzyMatchedPhrase  938 non-null    object 
 12  IsFuzzyMatch        5829 non-null   int32  
 13  IsLikelyDataBuyer   5829 non-null   int32  
 14  AgencySize          5829 non-null   object 
dtypes: float64(1), int32(4), int64(1), object(9)
memory usa