In [None]:
# 📦 Install dependencies
!pip install transformers
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# 📚 Imports
import pandas as pd
import spacy
from transformers import pipeline


In [None]:
# 📁 Load Excel
# Read the job-description workbook (path is relative to the working directory)
# and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was saved into the workbook — confirm whether it is needed.
df = pd.read_excel(io='Job Descriptions.xlsx')
df.head()


Unnamed: 0,Job Descriptions
0,Greetings from HDFC Sales (Subsidiary of HDFC ...
1,"Dear Aspirants,\n\nOrcapod is looking for Fema..."
2,â€¢ Involved with full life cycle of the recru...
3,**Job description**\n\n \n\n\n * Data Entry ...
4,"**You're ideal for this role, if**\n\n * You'..."


In [None]:
# 🤖 Load Models
# spaCy pipeline whose named entities are consumed by extract_spacy.
nlp_spacy = spacy.load("en_core_web_sm")

# Question-answering pipeline backed by the deepset/minilm-uncased-squad2 checkpoint.
minilm_qa = pipeline("question-answering", model="deepset/minilm-uncased-squad2")
# NOTE(review): the load log for this cell reports that T5ForQuestionAnswering's
# qa_outputs weights are "newly initialized" for t5-small, i.e. the QA head is
# untrained. Answers from t5_qa should not be trusted until a checkpoint that
# was fine-tuned for question answering is used here.
t5_qa = pipeline("question-answering", model="t5-small")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at t5-small and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# 🧠 Define Extraction Functions

# 1. Keyword Heuristic
def extract_keywords(text):
    """Return the first all-digit token that is directly followed by a year marker.

    The text is lower-cased and split on whitespace. For every token made up
    only of digits, the 20 characters starting at that token are inspected; if
    they contain "year", "yrs" or "yr", the token is returned.

    Args:
        text: The job description as a string.

    Returns:
        The matching digit token as a string, or "Not found" when no digit
        token is followed by a year marker.
    """
    text = text.lower()
    # Offset just past the previously visited token. Searching from here maps
    # every token to its own position; a plain text.find(word) would return
    # the first substring match, which may sit inside an earlier token
    # (e.g. "5" inside "a5b:") and make the window below look at the wrong text.
    cursor = 0
    for word in text.split():
        idx = text.find(word, cursor)
        cursor = idx + len(word)
        if not word.isdigit():
            continue
        snippet = text[idx:idx + 20]
        if any(k in snippet for k in ("year", "yrs", "yr")):
            return word
    return "Not found"

# 2. spaCy NER
def extract_spacy(text):
    """Return the first CARDINAL entity whose trailing context mentions years.

    The text is run through the module-level ``nlp_spacy`` pipeline. For each
    entity labelled "CARDINAL", the entity text plus the 15 characters after it
    are lower-cased and checked for "year" or "yrs".

    Args:
        text: The job description as a string.

    Returns:
        The text of the first matching entity, or "Not found".
    """
    cardinals = (entity for entity in nlp_spacy(text).ents if entity.label_ == "CARDINAL")
    for entity in cardinals:
        # Entity plus a short look-ahead, so "3" in "3 years" is accepted.
        window = text[entity.start_char:entity.end_char + 15].lower()
        if any(marker in window for marker in ("year", "yrs")):
            return entity.text
    return "Not found"

# 3. MiniLM QA
def extract_minilm(text):
    """Ask the module-level ``minilm_qa`` pipeline for the required years of experience.

    Args:
        text: The job description, passed to the pipeline as the context.

    Returns:
        The value stored under 'answer' in the pipeline result, "Not found"
        when the result has no 'answer' key, or "Error" when the pipeline
        raises an exception.
    """
    try:
        result = minilm_qa(question="How many years of experience is required?", context=text)
        return result.get('answer', 'Not found')
    except Exception:
        # Best-effort: one failing row must not abort the whole column.
        # Catching Exception (not a bare except) still lets KeyboardInterrupt
        # and SystemExit through, so a long run can be interrupted.
        return "Error"

# 4. T5 QA
def extract_t5(text):
    """Ask the module-level ``t5_qa`` pipeline for the required years of experience.

    Args:
        text: The job description, passed to the pipeline as the context.

    Returns:
        The value stored under 'answer' in the pipeline result, "Not found"
        when the result has no 'answer' key, or "Error" when the pipeline
        raises an exception.
    """
    try:
        result = t5_qa(question="How many years of experience is required?", context=text)
        return result.get('answer', 'Not found')
    except Exception:
        # Best-effort: one failing row must not abort the whole column.
        # Catching Exception (not a bare except) still lets KeyboardInterrupt
        # and SystemExit through, so a long run can be interrupted.
        return "Error"

# 5. Custom Rule Heuristic
def extract_custom_rule(text):
    """Return the first all-digit token found shortly after a requirement phrase.

    The lower-cased text is searched for each trigger phrase in priority order
    ("minimum", "at least", "experience of", "requires", "need"). For every
    occurrence of a phrase, the 50 characters starting at that occurrence are
    split on whitespace and the first all-digit token is returned.

    Args:
        text: The job description as a string.

    Returns:
        The digit token as a string, or "Not found" when no occurrence of any
        trigger phrase is followed by a digit token within the window.
    """
    lower_text = text.lower()
    phrases = ["minimum", "at least", "experience of", "requires", "need"]
    for phrase in phrases:
        start = lower_text.find(phrase)
        # Visit every occurrence of the phrase, not just the first: an early
        # mention without a number ("we need people ...") must not hide a later
        # one that carries it ("... need 3 years").
        while start != -1:
            snippet = lower_text[start:start + 50]
            for word in snippet.split():
                if word.isdigit():
                    return word
            start = lower_text.find(phrase, start + 1)
    return "Not found"


In [None]:
# 📝 Apply All 5 Techniques
# Technique 1: keyword heuristic. Each cell value is converted with str() first.
df['Keyword_Heuristic'] = df['Job Descriptions'].map(lambda description: extract_keywords(str(description)))

In [None]:
# Technique 2: spaCy named-entity recognition.
df['Spacy_Experience'] = df['Job Descriptions'].map(lambda description: extract_spacy(str(description)))

In [None]:
# Technique 3: MiniLM question answering (one pipeline call per row).
df['MiniLM_Experience'] = df['Job Descriptions'].map(lambda description: extract_minilm(str(description)))

In [None]:
# Technique 4: T5 question answering (one pipeline call per row).
df['T5_Experience'] = df['Job Descriptions'].map(lambda description: extract_t5(str(description)))

In [None]:
# Technique 5: custom trigger-phrase rule.
df['Custom_Rule_Experience'] = df['Job Descriptions'].map(lambda description: extract_custom_rule(str(description)))


In [None]:
# Preview the first rows, now including the five extraction columns.
df.head()



Unnamed: 0,Job Descriptions,Keyword_Heuristic,Spacy_Experience,MiniLM_Experience,T5_Experience,Custom_Rule_Experience
0,Greetings from HDFC Sales (Subsidiary of HDFC ...,8,Not found,8 -12 years,bulk bookings etc.\n\n \n\n\n**_Desired,30
1,"Dear Aspirants,\n\nOrcapod is looking for Fema...",1,Not found,have4,Saturday;,Not found
2,â€¢ Involved with full life cycle of the recru...,Not found,Not found,full life cycle,the Business Process of US recruitment. \nâ€¢...,Not found
3,**Job description**\n\n \n\n\n * Data Entry ...,Not found,Not found,resume,send resume on job@gstsuvidhakendra.org,Not found
4,"**You're ideal for this role, if**\n\n * You'...",Not found,Not found,Sales Experience with the EdTech domain,weekly revenue and enrollment targets.\n\n \n...,Not found


In [None]:
# 💾 Save the Output
# Write all columns of df to an Excel workbook (row index omitted) and report
# where it was written.
output_file = "/content/experience_extraction_fast_5_miniLM.xlsx"
df.to_excel(excel_writer=output_file, index=False)
print("✅ Output saved to:", output_file)


✅ Output saved to: /content/experience_extraction_fast_5_miniLM.xlsx


In [None]:
# 📥 Download the Result
# NOTE(review): google.colab is supplied by the Colab runtime; presumably this
# cell fails with ImportError in any other Jupyter environment — confirm.
from google.colab import files
# Send the workbook written to output_file to the browser as a download.
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>