In [None]:
"""
Automated Resume Keyword Extractor
Ms. Nishigandha Wankhade
"""

In [3]:
import nltk     # Python library use for processing human language data
import re       # Python's Regular Expressions (regex) module for text pattern matching
            # used to clean text, remove unwanted characters, or exact specific patterns like email addresses
import string
from nltk.tokenize import word_tokenize, sent_tokenize   # splits text into individual words (token) and splits text into sentences respectively
from nltk.corpus import stopwords   # imports stopwords (like "the", "is", "and", etc) from nltk corpus
from nltk import pos_tag, ne_chunk  # imports parts-of-speech (POS) tagging and Named Entity Recognition (NER) functions
                        # pos-tag(tokens): assigns POS togs (e.g., noun, verb, adjective) to words
                        # ne_chunk(tagged_words): performed NER to detect names, organizations, locations, etc.
from nltk.tree import Tree    #  Tree class represents hierarchical structures like parse trees in NLTK

In [4]:
# To Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wankh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\wankh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\wankh\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\wankh\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wankh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Predefined list of skills
SKILLS_LIST = {"Python", "Machine Learning", "Data Science", "SQL", "Excel", "Tableau", "Power BI", 
               "Deep Learning", "TensorFlow", "NLP", "Snowflake", "AWS", "PyTorch", "Pandas"}

In [10]:
def preprocess_text(text):
    """
    Function to Tokenizes and cleans text by removing punctuation and stopwords.
    Input: a resume / text
    Returns: tokens
    """
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]  # Remove punctuation
    tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]  # Remove stopwords
    return tokens


"""
    clean_tokens = []
    
    for word in tokens:         # to iterate through tokens to remove punctuation
        if word.isalnum():
            clean_tokens.append(word)  # to append only alphanumeric words to clean_tokens
            
    for word in clean_tokens:
        if word.lower() not in stopwords.words('english'):   # iterating over clean_tokens(), checking each word against stopwords.
            clean_tokens.append(word)
            
    return clean_tokens
"""

"\n    clean_tokens = []\n    \n    for word in tokens:         # to iterate through tokens to remove punctuation\n        if word.isalnum():\n            clean_tokens.append(word)  # to append only alphanumeric words to clean_tokens\n            \n    for word in clean_tokens:\n        if word.lower() not in stopwords.words('english'):   # iterating over clean_tokens(), checking each word against stopwords.\n            clean_tokens.append(word)\n            \n    return clean_tokens\n"

In [21]:
def extract_named_entities(text):
    """
    Function to extracts named entities such as Job Titles and Company Names.
    Input : Text/ resume
    Output: entities
    """
    sentences = sent_tokenize(text)   # split the input text into sentences
    print(f"\n TOKENIZED SENTENCES : \n {sentences}\n")
    
    entities = set()        # uses set to store extracted entities, it ensures unique values (no duplicates)
    
    for sentence in sentences:   # iterate through each sentence in the tokenized list
        words = word_tokenize(sentence)  # splits the sentence into words (tokens)
        print("\n===============================================================================")
        print(f"TOKENIZED WORDS: \n {words} \n")
        
        tagged_words = pos_tag(words)    # assigns a part-of-speech (POS) tag (e.g., noun, verb, adjective) to each word
        print(f"TAGGED WORDS:\n {tagged_words}\n")
        
        chunked_tree = ne_chunk(tagged_words) # identifies named entities from the POS-tagged words and build a tree structure to categorize entities (e.g. PERSON, ORGANIZATION).
        print(f"CHUNKED TREE : \n {chunked_tree} \n")
        
        for subtree in chunked_tree:
            if isinstance(subtree, Tree):     # iterate through the chunks in the tree
                                            # checks if a chunk is a subtree (Tree), means it's a named entity
                entity_name = " ".join([token for token, pos in subtree.leaves()])  # extract actual word(tokens) from the subtree
                                    # joins multi-word entities (e.g. "New York" instead of "New" and "York" separately)
                    
                entities.add(entity_name)
                
    return entities

In [23]:
def extract_skills(tokens):
    """
    A function to match extracted words with predefined skills list.
    Input: tokens / words
    Output : Matching skills
    
    """
    found_skills = set()
    
    for token in tokens:
        if token in SKILLS_LIST:
            found_skills.add(token)
    return found_skills

In [16]:
#==============================================================================================
# Load resume text (replace with actual text input)
resume_text = """ John Doe is a Data Scientist with expertise in Machine Learning, Python, and SQL.
He has worked at Google and Amazon, focusing on NLP and Deep Learning applications.
He is proficient in Power BI and Snowflake for data analysis.
"""

# Preprocess and extract information
tokens = preprocess_text(resume_text)



In [17]:
print(tokens)

['John', 'Doe', 'Data', 'Scientist', 'expertise', 'Machine', 'Learning', 'Python', 'SQL', 'worked', 'Google', 'Amazon', 'focusing', 'NLP', 'Deep', 'Learning', 'applications', 'proficient', 'Power', 'BI', 'Snowflake', 'data', 'analysis']


In [22]:
named_entities = extract_named_entities(resume_text)
print(f"NAMED ENTITIES : \n {named_entities} \n")


 TOKENIZED SENTENCES : 
 [' John Doe is a Data Scientist with expertise in Machine Learning, Python, and SQL.', 'He has worked at Google and Amazon, focusing on NLP and Deep Learning applications.', 'He is proficient in Power BI and Snowflake for data analysis.']


TOKENIZED WORDS: 
 ['John', 'Doe', 'is', 'a', 'Data', 'Scientist', 'with', 'expertise', 'in', 'Machine', 'Learning', ',', 'Python', ',', 'and', 'SQL', '.'] 

TAGGED WORDS:
 [('John', 'NNP'), ('Doe', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('Data', 'NNP'), ('Scientist', 'NN'), ('with', 'IN'), ('expertise', 'NN'), ('in', 'IN'), ('Machine', 'NNP'), ('Learning', 'NNP'), (',', ','), ('Python', 'NNP'), (',', ','), ('and', 'CC'), ('SQL', 'NNP'), ('.', '.')]

CHUNKED TREE : 
 (S
  (PERSON John/NNP)
  (ORGANIZATION Doe/NNP)
  is/VBZ
  a/DT
  Data/NNP
  Scientist/NN
  with/IN
  expertise/NN
  in/IN
  (GPE Machine/NNP)
  Learning/NNP
  ,/,
  (PERSON Python/NNP)
  ,/,
  and/CC
  (ORGANIZATION SQL/NNP)
  ./.) 


TOKENIZED WORDS: 
 ['He', 'h

In [24]:
skills = extract_skills(tokens)
print(f"SKILLS MATCHED: \n {skills}")

SKILLS MATCHED: 
 {'NLP', 'SQL', 'Python', 'Snowflake'}


In [25]:
# Display All Extracted Information
print(f"\n Extracted Named Entities (Job Titles, Companies, etc.): {named_entities}")
print(f"\n Extracted Skills : {skills}")


 Extracted Named Entities (Job Titles, Companies, etc.): {'Google', 'Machine', 'Power', 'John', 'Deep Learning', 'Snowflake', 'Amazon', 'NLP', 'SQL', 'Python', 'Doe'}

 Extracted Skills : {'NLP', 'SQL', 'Python', 'Snowflake'}


In [None]:
"""
Scope for future development:

1. Can convert the script into a web-based tool (Flask, Streamlit).
2. Can improve different text format resume, e.g. PDF, Word documents by using pdfminer or python-docx.
3. For more advance skills extraction, can use TF-IDF or spaCy


"""