In [None]:
# Mount Google Drive at /content/drive; the resume folder read further down
# lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ============================
# 1. Install & Import Libraries
# ============================
!pip install python-docx PyPDF2 nltk scikit-learn docx2txt

import os
import docx2txt
import PyPDF2
import nltk
import re
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
from nltk.corpus import stopwords

# ============================
# 2. Text Extraction Functions
# ============================
def extract_text_from_pdf(path):
    """Return the concatenated text of every page in the PDF at ``path``.

    Pages for which the reader yields no text (``None`` or an empty string)
    are skipped. Page texts are joined with a newline so that the last word
    of one page is never fused with the first word of the next, which would
    otherwise create bogus tokens for the downstream TF-IDF vectorizer.

    Args:
        path: Filesystem path of the PDF file.

    Returns:
        The extracted text, or ``""`` when no page produced any text.
    """
    with open(path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

def extract_text_from_docx(path):
    """Return the text that docx2txt extracts from the .docx file at ``path``."""
    document_text = docx2txt.process(path)
    return document_text

def extract_resume_text(path):
    """Extract the text of a resume file, dispatching on its extension.

    The extension check is case-insensitive, so ``CV.PDF`` and ``cv.Docx``
    are handled the same way as their lower-case spellings.

    Args:
        path: Filesystem path of the resume.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` files, and ``""`` for
        any other extension.
    """
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(path)
    if lowered.endswith(".docx"):
        return extract_text_from_docx(path)
    return ""

# ============================
# 3. Preprocessing Function
# ============================
def preprocess_text(text):
    """Normalise raw text for TF-IDF vectorisation.

    Steps, in order: lower-case, remove every run of digits, remove ASCII
    punctuation, split on whitespace, and drop English stopwords.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # stopwords.words() returns a list; calling it inside the filter rebuilt
    # that list for every token and then scanned it linearly. Build a set
    # once per call instead.
    stop_words = set(stopwords.words('english'))

    text = text.lower()
    text = re.sub(r"\d+", "", text)  # remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    # split() with no argument already ignores leading/trailing whitespace.
    return " ".join(word for word in text.split() if word not in stop_words)

# ============================
# 4. Experience Extraction Function (Improved)
# ============================
# Separator between the two years of a range: hyphen, en dash, em dash or
# the word "to". (A character class such as [-to]+ would instead accept any
# mix of the characters '-', 't' and 'o'.)
_RANGE_SEPARATOR = r'(?:[-\u2013\u2014]|to)'

# "X years", "X+ years", "X yrs", decimals too. Phrases such as
# "over X years", "more than X years" and "experience of X years" all
# contain this shape, so they need no patterns of their own.
_YEARS_RE = re.compile(r'(\d+(?:\.\d+)?)\s*\+?\s*(?:years?|yrs?)')

# "X months", "X+ months", "X mnths".
_MONTHS_RE = re.compile(r'(\d+(?:\.\d+)?)\s*\+?\s*(?:months?|mnths?)')

# "2018 - 2022", "2017 to 2021". The word boundaries stop the pattern from
# matching four-digit slices of longer digit runs (phone numbers, IDs).
_YEAR_RANGE_RE = re.compile(
    r'\b(\d{4})\s*' + _RANGE_SEPARATOR + r'\s*(\d{4})\b'
)

# "since 2015", "from 2016". The negative lookahead skips "from 2017 to 2021",
# which is a closed range and must not be measured up to the current year.
_OPEN_ENDED_RE = re.compile(
    r'\b(?:since|from)\s+(\d{4})\b(?!\s*' + _RANGE_SEPARATOR + r'\s*\d{4})'
)


def extract_years_of_experience(text):
    """Estimate a candidate's years of experience from free-form resume text.

    Every recognised mention is converted to a number of years and the
    largest one is returned:

    * explicit durations in years ("5 years", "3.5+ yrs");
    * explicit durations in months, divided by 12 ("18 months");
    * closed year ranges ("2018 - 2022", "2017 to 2021");
    * open-ended starts ("since 2015"), measured up to the current year.

    Year differences are only accepted when strictly between 0 and 50.

    Args:
        text: Raw resume text; matching is case-insensitive.

    Returns:
        The largest value found, rounded to two decimals, or ``0`` when
        nothing matched.
    """
    from datetime import date  # local import: only this function needs it

    text = text.lower()
    current_year = date.today().year

    # Explicit durations in years.
    years = [float(value) for value in _YEARS_RE.findall(text)]

    # Closed ranges such as "2018 - 2022".
    for start, end in _YEAR_RANGE_RE.findall(text):
        diff = int(end) - int(start)
        if 0 < diff < 50:  # discard reversed or implausible ranges
            years.append(diff)

    # Open-ended starts such as "since 2015".
    for start in _OPEN_ENDED_RE.findall(text):
        diff = current_year - int(start)
        if 0 < diff < 50:  # discard future or implausible start years
            years.append(diff)

    # Durations in months, converted to years.
    years += [
        float(value) / 12
        for value in _MONTHS_RE.findall(text)
        if float(value) > 0
    ]

    # Return the highest valid value (or 0 if none found).
    return round(max(years), 2) if years else 0

from sklearn.preprocessing import MinMaxScaler

def combine_scores(similarity_scores, experience_list, alpha=0.7, beta=0.3):
    """Blend similarity and experience into one score per candidate.

    Each input is min-max scaled on its own and the two scaled vectors are
    combined as ``alpha * similarity + beta * experience``.

    Args:
        similarity_scores: One similarity value per candidate.
        experience_list: One experience value per candidate, same order.
        alpha: Weight applied to the scaled similarity.
        beta: Weight applied to the scaled experience.

    Returns:
        A 1-D NumPy array of combined scores.
    """
    scaler = MinMaxScaler()

    def _rescale(values):
        # The scaler expects a 2-D column; flatten back to 1-D afterwards.
        column = np.array(values).reshape(-1, 1)
        return scaler.fit_transform(column).flatten()

    scaled_similarity = _rescale(similarity_scores)
    scaled_experience = _rescale(experience_list)
    return alpha * scaled_similarity + beta * scaled_experience


# ============================
# 5. Define Your Agent (After Functions)
# ============================
class ResumeScreeningAgent:
    """Rank resumes against a job description with a perceive/think/act cycle.

    ``perceive`` vectorises the resumes and the job description with TF-IDF,
    ``think`` computes the cosine similarity of each resume to the job
    description, and ``act`` blends similarity with years of experience via
    ``combine_scores`` and prints the ranking.

    Args:
        job_description: Text of the job description.
        candidate_experience: One experience value per resume, in the same
            order as the resumes later passed to ``perceive``.
        candidate_names: Optional labels, one per resume, in the same order.
            When omitted, ``act`` falls back to the module-level
            ``candidate_names`` list, which is what earlier versions of this
            class always read.
    """

    def __init__(self, job_description, candidate_experience, candidate_names=None):
        self.job_description = job_description
        self.candidate_experience = candidate_experience
        self.candidate_names = candidate_names
        self.vectorizer = TfidfVectorizer()
        # Populated by perceive(), think() and act() respectively; declared
        # here so every attribute exists from construction onwards.
        self.job_vector = None
        self.resume_vectors = None
        self.similarity_scores = None
        self.ranked_results = None

    def perceive(self, resumes):
        """Fit TF-IDF on ``resumes`` and project the job description into it."""
        self.resume_vectors = self.vectorizer.fit_transform(resumes)
        self.job_vector = self.vectorizer.transform([self.job_description])

    def think(self):
        """Compute the cosine similarity of every resume to the job description.

        Raises:
            RuntimeError: If ``perceive`` has not been called yet.
        """
        if self.resume_vectors is None or self.job_vector is None:
            raise RuntimeError("perceive() must be called before think()")
        self.similarity_scores = cosine_similarity(self.resume_vectors, self.job_vector).flatten()

    def act(self, top_n=10):
        """Rank candidates by combined score and print the best ``top_n``.

        The full ranking is also kept on ``self.ranked_results``. Nothing is
        returned, so a notebook cell ending in ``agent.act()`` shows only the
        printed table.

        Args:
            top_n: Number of rows to print.

        Raises:
            RuntimeError: If ``think`` has not been called yet.
        """
        if self.similarity_scores is None:
            raise RuntimeError("think() must be called before act()")

        # Use precomputed experience
        experience_list = self.candidate_experience

        if self.candidate_names is not None:
            names = self.candidate_names
        else:
            # Backward-compatible fallback to the module-level list.
            names = candidate_names

        # Combine scores
        combined_score = combine_scores(self.similarity_scores, experience_list, alpha=0.7, beta=0.3)

        # Rank the resumes based on combined score
        self.ranked_results = pd.DataFrame({
            'Candidate': names,
            'Similarity Score': self.similarity_scores,
            'Experience (yrs)': experience_list,
            'Combined Score': combined_score
        }).sort_values(by='Combined Score', ascending=False).reset_index(drop=True)

        print("Top Matching Candidates (Combined Ranking):")
        print(self.ranked_results.head(top_n))

# ============================
# 6. Loading Resumes
# ============================
resume_folder = "/content/drive/My Drive/Resumes"  # Your shared folder path

# Parallel lists: index i of each list describes the same resume file.
candidate_names = []
resumes = []
candidate_experience = []

for filename in os.listdir(resume_folder):
    # Only PDF and Word resumes are supported; skip everything else.
    if not filename.endswith((".pdf", ".docx")):
        continue

    filepath = os.path.join(resume_folder, filename)
    raw_text = extract_resume_text(filepath)

    # The cleaned text feeds TF-IDF; the experience parser needs the raw
    # text because cleaning strips the digits it looks for.
    cleaned_text = preprocess_text(raw_text)
    resumes.append(cleaned_text)
    candidate_names.append(filename)

    years_of_exp = extract_years_of_experience(raw_text)
    candidate_experience.append(years_of_exp)

# ============================
# 7. Run the Agent
# ============================
# Job description the resumes are ranked against. It is passed to the agent
# as-is, without going through preprocess_text.
job_description_text = """
We are hiring a Data Analyst with strong knowledge of Python, SQL, and data visualization tools. Experience with machine learning and business analysis is a plus.
"""

# The steps must run in this order: perceive() builds the TF-IDF vectors,
# think() computes similarities from them, and act() prints the ranking.
agent = ResumeScreeningAgent(job_description_text, candidate_experience)
agent.perceive(resumes)
agent.think()
agent.act()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top Matching Candidates (Combined Ranking):
                                     Candidate  Similarity Score  \
0                   rs_AYUSH - AYUSH AYUSH.pdf          0.353692   
1  Tashi_Resume2 (2) - TASHI CHODEN BHUTIA.pdf          0.285162   
2       Mansi Gambhir _ CV - Mansi Gambhir.pdf          0.234336   
3        MokshSharma_Resume - Moksh Sharma.pdf          0.120972   
4                   KomalGoel - Komal Goel.pdf          0.175822   
5                              Profile (6).pdf          0.009343   
6         Eesha_Singh_Resume - Eesha Singh.pdf          0.066711   
7                rishika-1 - RISHIKA SINGH.pdf          0.147108   
8                              Profile (3).pdf          0.042657   
9                              Profile (2).pdf          0.104298   

   Experience (yrs)  Combined Score  
0              0.00        0.700000  
1              0.00        0.564370  
2              0.00        0.463779  
3              3.50        0.389418  
4              0.