In [1]:
import requests
import numpy as np
import pandas as pd

from local_config import userAgent, authKey
import feature_engine as fe
import enums
import pandas as pd

In [2]:
class USA_JOBS_SEARCH:
    """
    This class queries the USA Jobs Search API for specific parameters and parses 
    the results into a tidy pandas dataframe.

    This class uses the requests, pandas, and a local config file to pass API credentials

    Example usage:
    
    # create an USA Jobs Search Object
    internships = USA_JOBS_SEARCH({
        "Keyword": 'Internship',
        "PositionOfferingType": 'Student'
        })
    
    # Attributes
        result.df : pandas dataframe
        results.parameters_dict : dictionary of search parameters
        results.base_url: API base url

    """

    def __init__(self, parameters_dict, userAgent = userAgent, authKey = authKey):
        self.parameters_dict = parameters_dict
        self.headers = {
            "Host": "data.usajobs.gov",
            "User-Agent": userAgent,
            "Authorization-Key": authKey,
            }
        self.base_url = "https://data.usajobs.gov/api/search?ResultsPerPage=500"
        
        self.initial_response = self.get_response_json(self.parameters_dict)
        self.response_number_of_pages = self._get_number_of_pages()
        self.all_api_results = self._get_all_api_results()
        self.df = self._get_parsed_results_df()
        

    def get_response_json(self,parameters):
        response = requests.get(self.base_url, params=parameters, headers=self.headers)
        print(response.url)
        data = response.json()
    
        return data

    def _get_number_of_pages(self):
        return int(self.initial_response['SearchResult']['UserArea']['NumberOfPages']) # returns an intenger

    def _get_all_api_results(self):
        all_api_page_results = []
        for i in range(self.response_number_of_pages):
            parameters = self.parameters_dict
            parameters['Page'] = i + 1
            

            page_results = self.get_response_json(parameters)
            page_results = page_results['SearchResult']['SearchResultItems']
            all_api_page_results.append(page_results)

        all_api_results = []
        for lst in all_api_page_results:
            all_api_results.extend(lst)

        
        all_api_results = [all_api_results[i]['MatchedObjectDescriptor'] for i in range(len(all_api_results))]

        return all_api_results


    def _parse_api_results(self, oppty):
        internship_data = dict(
            usajobs_id = oppty['PositionID'],
            position_title = oppty['PositionTitle'],
            position_uri = oppty['PositionURI'],
            position_apply_uri = oppty['ApplyURI'],
            position_location = oppty['PositionLocationDisplay'],
            organization = oppty['OrganizationName'],
            department = oppty['DepartmentName'],
            qualifications = oppty['QualificationSummary'],
            min_pay = oppty['PositionRemuneration'][0]['MinimumRange'],
            max_pay = oppty['PositionRemuneration'][0]['MaximumRange'],
            pay_type = oppty['PositionRemuneration'][0]['RateIntervalCode'],
            position_offering_type = oppty['PositionOfferingType'][0]['Name'],
            job_category = oppty['JobCategory'][0]['Name'],
            job_summary = oppty['UserArea']['Details']['JobSummary'],
            agency_marketing_statement = oppty['UserArea']['Details']['AgencyMarketingStatement'],
            major_duties = oppty['UserArea']['Details']['MajorDuties'][0],
            education = oppty['UserArea']['Details']['Education'],
            requirements = oppty['UserArea']['Details']['Requirements'],
            evaluation = oppty['UserArea']['Details']['Evaluations'],
            key_requirements = oppty['UserArea']['Details']['KeyRequirements'],
        )
        return internship_data
    
    def _get_parsed_results_df(self):

        parsed_results_dictionary =  [self._parse_api_results(result) for result in self.all_api_results]
        parsed_results_dataframe = pd.DataFrame(parsed_results_dictionary)

        return parsed_results_dataframe


In [3]:
internships = USA_JOBS_SEARCH({
    "Keyword": 'student',
    "HiringPath": 'student',
})

https://data.usajobs.gov/api/search?ResultsPerPage=500&Keyword=student&HiringPath=student
https://data.usajobs.gov/api/search?ResultsPerPage=500&Keyword=student&HiringPath=student&Page=1


In [4]:
# How many records are there?
df = internships.df


In [5]:
position_title_filters =[
    "student",
    "volunteer",
    "trainee",
    "internship",
    "intern",
    "environmental",
    "environment"
]

position_offering_types_filters = [
    "internships",
    "internship",

]

job_category_filters = [
    "student",
    "trainee",
]


drop_from_position_offering_type = [
    'permanent',
    'years',
    'yrs',
    'indefinite',
    '2yrs',
    '3yrS',
    'full-time',
    'full time',
    'more than 1-year',
    '4 year term',
    '3 year',
    'multiple appointment types',
    'regular category position',
]

department_filters = [
    'Legislative Branch',
    'Department of the Air Force',
    'Department of Veterans Affairs',
    'Department of the Army',
    'Department of Health And Human Services',
    'Department of Defense',
    'Department of the Navy',
    'Department of Education',
    'Department of Labor',
    'Department of Justice',
    'Department of Transportation',
    'Judicial Branch',
    'Department of the Treasury',
    'Department of Homeland Security'
]

department_filters = [d.lower() for d in department_filters]

organization_filters = [
 'Office of the Inspector General, USPS',
 'Securities and Exchange Commission',
 'National Labor Relations Board',
 'Federal Deposit Insurance Corporation',
 'Consumer Financial Protection Bureau',
 'Development Finance Corporation (formerly Overseas Private Investment Corporation)',
 'Office of Personnel Management'
]

organization_filters = [o.lower() for o in organization_filters]

In [6]:
# position_titles_filter
test_df = df[(df.position_title.str.lower().str.contains('|'.join(position_title_filters))) |
    (df.position_offering_type.str.lower().str.contains('|'.join(position_offering_types_filters))) |
    (df.job_category.str.lower().str.contains('|'.join(job_category_filters)))] 
    

# filtered for agencies
test_df = test_df[~test_df.department.str.lower().str.contains('|'.join(department_filters))]

# filtered for agencies
test_df = test_df[~test_df.organization.str.lower().str.contains('|'.join(organization_filters))]

# filtered for position_offering_types
test_df = test_df[~test_df.position_offering_type.str.lower().str.contains('|'.join(drop_from_position_offering_type))]


  test_df = test_df[~test_df.organization.str.lower().str.contains('|'.join(organization_filters))]


In [7]:
print('Min Pay: ', test_df.min_pay.astype(float).describe())

Min Pay:  count       51.000000
mean     10066.921373
std      16822.141237
min          0.000000
25%          0.000000
50%         15.470000
75%      26916.000000
max      54727.000000
Name: min_pay, dtype: float64


In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 2 to 157
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   usajobs_id                  51 non-null     object
 1   position_title              51 non-null     object
 2   position_uri                51 non-null     object
 3   position_apply_uri          51 non-null     object
 4   position_location           51 non-null     object
 5   organization                51 non-null     object
 6   department                  51 non-null     object
 7   qualifications              51 non-null     object
 8   min_pay                     51 non-null     object
 9   max_pay                     51 non-null     object
 10  pay_type                    51 non-null     object
 11  position_offering_type      51 non-null     object
 12  job_category                51 non-null     object
 13  job_summary                 51 non-null     object


In [9]:
list(test_df.department.unique())

['Department of Agriculture',
 'Other Agencies and Independent Organizations',
 'Department of the Interior',
 'Department of Commerce',
 'National Foundation on the Arts and the Humanities']

In [10]:
test_df.to_csv('usa_jobs_internships.csv', index=False)

In [11]:
list(test_df[test_df.department == 'Other Agencies and Independent Organizations'].organization.unique())

['Federal Maritime Commission',
 'Environmental Protection Agency',
 'Federal Trade Commission',
 'U.S. Agency for International Development',
 'Development Finance Corporation (formerly Overseas Private Investment Corporation)']