In [15]:
# Importing necessary dependencies
import os
import spacy
import pandas as pd
from transformers import pipeline

In [16]:
# Checking if the GPU is available
device = 0 if torch.cuda.is_available() else -1
print(f"Using {'GPU' if device == 0 else 'CPU'} for inference.")

Using GPU for inference.


In order to get the company names which is basically the companies/institutes where an applicant has been. We will be using a modified BERT model for this purpose. The specific modification used here is **BERT Named Entity Recognition**.

In [17]:
# Loading the necessary models
# Loading spaCy's pre-trained NER model
nlp = spacy.load('en_core_web_sm')

# Loading a BERT-based NER pipeline from Hugging Face's transformers
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", grouped_entities=True)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [18]:
# Reading the previously parsed resumes
input_csv_file = 'parsed_resumes.csv'
df = pd.read_csv(input_csv_file)

In [20]:
# Function to extract company names including educational institutions
def extract_company_names(resume_text):
    """Extract company names from resume text using a combination of BERT and spaCy NER."""
    companies = []

    # Joining the list of resume lines into a single string
    resume_full_text = "\n".join(resume_text)

    # Using spaCy's NER model to extract named entities
    doc = nlp(resume_full_text)

    # Filtering out organizations recognized by spaCy's model
    for ent in doc.ents:
        if ent.label_ == 'ORG':  # 'ORG' refers to Organizations (potentially companies)
            companies.append(ent.text)

    # Now using the BERT NER model to extract entities
    ner_results = ner_pipeline(resume_full_text)

    # Filtering the BERT NER results for organizations (labeled as 'ORG')
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            companies.append(entity['word'])

    # Remove duplicates
    companies = list(set(companies))

    return companies

In [None]:
# Iterate over the resumes and extract company names for each applicant
company_data = []

for index, row in df.iterrows():
    applicant_id = row['Applicant_ID']
    resume_text = eval(row['Parsed_Resume'])  # Convert the string list back to an actual list of lines
    company_names = extract_company_names(resume_text)

    company_data.append([applicant_id, company_names])

# Create a DataFrame for the extracted company names
company_df = pd.DataFrame(company_data, columns=['Applicant_ID', 'Company_Names'])

In [22]:
# Output CSV with applicant IDs and their respective company names
output_csv_file = 'applicant_companies.csv'
company_df.to_csv(output_csv_file, index=False)

print(f"Company data has been successfully written to {output_csv_file}")


Company data has been successfully written to applicant_companies.csv
