# spaCy Pipeline (Rule-Based + Matcher)

### 1. Install spaCy and load the en_core_web_sm model using Python.

In [1]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/12.8 MB 8.2 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 7.6 MB/s eta 0:00:02
     --------------- ------------------------ 5.0/12.8 MB 7.8 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 7.7 MB/s eta 0:00:01
     -------------------------- ------------- 8.4/12.8 MB 7.8 MB/s eta 0:00:01
     ---------------------------------- ----- 11.0/12.8 MB 8.5 MB/s eta 0:00:01
     ---------------------------------------  12.6/12.8 MB 8.6 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 8.0 MB/s  0:00:01
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
import spacy
from spacy.matcher import PhraseMatcher
import json
nlp = spacy.load("en_core_web_sm")
def normalize_skills(skill_list):
    return sorted(set(skill.lower() for skill in skill_list))

### 2. Using PhraseMatcher, extract the skills Python and SQL from the text: 'I have experience in Python and SQL.'

In [9]:
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")  # case-insensitive
skills_basic = ["Python", "SQL"]
patterns = [nlp.make_doc(skill) for skill in skills_basic]
matcher.add("BASIC_SKILLS", patterns)

text_1 = "I have experience in Python and SQL."
doc_1 = nlp(text_1)

extracted_skills_1 = [doc_1[start:end].text for _, start, end in matcher(doc_1)]

In [10]:
print(extracted_skills_1)

['Python', 'SQL']


### 3. Create a Python list of five technical skills and convert them into PhraseMatcher patterns.

In [21]:
tech_skills = ['Python','SQL','Machine Learning','Java','NLP']
tech_patterns = [nlp.make_doc(skill) for skill in tech_skills]

### 4. Write code to convert all extracted skills into lowercase.

In [22]:
lower_cased = []
def lower_case(extracted_skills_1):
    for i in extracted_skills_1:
        lower_cased.append(i.lower())
lower_case(extracted_skills_1)
print(lower_cased)
extracted_skills = lower_cased

['python', 'sql']


### 5. Remove duplicate skills from a list of extracted skills.

In [23]:
extracted_skills = set(extracted_skills)
print(extracted_skills)
extracted_skills = list(extracted_skills)
print(extracted_skills)

{'sql', 'python'}
['sql', 'python']


### 6. Extract technical skills from the sentence: 'Experience in Python, NLP, and Machine Learning with SQL.'

In [24]:
matcher_tech = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher_tech.add("TECH_SKILLS", tech_patterns)

text_2 = "Experience in Python, NLP, and Machine Learning with SQL."
doc_2 = nlp(text_2)

tech_extracted_2 = normalize_skills(
    [doc_2[start:end].text for _, start, end in matcher_tech(doc_2)]
)
tech_extracted_2

['machine learning', 'nlp', 'python', 'sql']

### 7. Identify soft skills from a resume text using token comparison and a predefined soft skill list.

In [26]:
soft_skill_list = [
    "communication",
    "teamwork",
    "leadership",
    "problem solving",
    "time management"
]

def extract_soft_skills(text):
    doc = nlp(text.lower())
    found = []
    for skill in soft_skill_list:
        if skill in doc.text:
            found.append(skill)
    return normalize_skills(found)

resume_text_soft = "Strong communication and teamwork skills with leadership qualities."
soft_skills_extracted = extract_soft_skills(resume_text_soft)
soft_skills_extracted

['communication', 'leadership', 'teamwork']

### 8. Configure PhraseMatcher to perform case-insensitive matching.

In [27]:
matcher2 = PhraseMatcher(nlp.vocab, attr = 'LOWER')

### 9. Store extracted technical and soft skills in the JSON structure: { technical_skills: [], soft_skills: [] }.

In [28]:
json_struct = {
    'technical_skills':tech_extracted_2,
    'soft_skills':soft_skills_extracted
}
json_struct

{'technical_skills': ['machine learning', 'nlp', 'python', 'sql'],
 'soft_skills': ['communication', 'leadership', 'teamwork']}

### 10. Extract skills from a paragraph that contains repeated skill mentions and ensure duplicates are removed.

In [29]:
text_3 = "Python, Python, SQL and SQL are required."
doc_3 = nlp(text_3)

tech_extracted_3 = normalize_skills(
    [doc_3[start:end].text for _, start, end in matcher_tech(doc_3)]
)
tech_extracted_3

['python', 'sql']

### 11. Ensure skills are correctly extracted from text containing punctuation such as commas and semicolons.

In [30]:
text_4 = "Required skills: Python; SQL, NLP, and Machine Learning."
doc_4 = nlp(text_4)

tech_extracted_4 = normalize_skills(
    [doc_4[start:end].text for _, start, end in matcher_tech(doc_4)]
)
tech_extracted_4

['machine learning', 'nlp', 'python', 'sql']

### 12. Combine PhraseMatcher for technical skills and rule-based token matching for soft skills in a single spaCy pipeline.

In [31]:
def extract_all_skills(text):
    doc = nlp(text)

    tech = normalize_skills(
        [doc[start:end].text for _, start, end in matcher_tech(doc)]
    )
    soft = extract_soft_skills(text)

    return {
        "technical_skills": tech,
        "soft_skills": soft
    }

### 13. Extract skills from multiple sentences and merge them into one normalized output dictionary.

In [32]:
multi_sentence_text = """
I have experience in Python and SQL.
I also worked on NLP projects.
Strong communication and teamwork skills.
"""

skills_multi = extract_all_skills(multi_sentence_text)
skills_multi

{'technical_skills': ['nlp', 'python', 'sql'],
 'soft_skills': ['communication', 'teamwork']}

### 14. Modify skill extraction logic to match SQL but not NoSQL.

In [33]:
sql_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
sql_matcher.add("SQL_ONLY", [nlp.make_doc("SQL")])

text_5 = "Experience with SQL and NoSQL databases."
doc_5 = nlp(text_5)

sql_only = []
for _, start, end in sql_matcher(doc_5):
    if doc_5[start:end].text.lower() == "sql":
        sql_only.append("sql")

sql_only = normalize_skills(sql_only)
sql_only

['sql']

### 15. Given a resume and a job description, extract skills separately and output them as two different JSON objects

In [34]:
resume_text = "Python, SQL, NLP, and strong communication skills."
job_description = "Looking for Machine Learning, SQL, teamwork, and leadership."

resume_skills = extract_all_skills(resume_text)
job_skills = extract_all_skills(job_description)

print("Resume skills:",resume_skills)
print("job_skills:",job_skills)

Resume skills: {'technical_skills': ['nlp', 'python', 'sql'], 'soft_skills': ['communication']}
job_skills: {'technical_skills': ['machine learning', 'sql'], 'soft_skills': ['leadership', 'teamwork']}
