In [2]:
import nltk
nltk.download('stopwords')

import os
import re
import time
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pdfplumber
from docx import Document
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
import joblib
from scipy import sparse

# Pick the GPU when PyTorch can see one; otherwise everything runs on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# Pretrained uncased BERT-base tokenizer, plus the encoder moved onto DEVICE.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(DEVICE)

# Preprocessing Function (regex tokenizer)
def preprocess_text(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    The text is lower-cased, every character that is not a letter, digit or
    whitespace is deleted, the remainder is split into word tokens, English
    stop words are dropped and each surviving token is Porter-stemmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing survives).
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = re.findall(r'\b\w+\b', text)

    # Build the stop-word set once per call. The previous version called
    # stopwords.words("english") for every token and tested membership
    # against the returned list, which dominated runtime on long documents.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at `pdf_path`.

    Each page that yields text contributes that text followed by a newline;
    pages for which `extract_text()` returns nothing are skipped. Leading and
    trailing whitespace is stripped from the final result.
    """
    page_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            page_chunks.append(page_text + "\n")
    return "".join(page_chunks).strip()

# Extract text from DOCX
def extract_text_from_docx(docx_path):
    """Return the text of all paragraphs in a DOCX file, joined by single spaces."""
    paragraph_texts = []
    for paragraph in Document(docx_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return " ".join(paragraph_texts)

# Load dataset
# The file is parsed as CSV; the code below relies on "Text" and "Category" columns.
raw_df = pd.read_csv("/content/drive/MyDrive/Dataset.txt")
raw_df["clean_text"] = raw_df["Text"].astype(str).apply(preprocess_text)


def _balance_by_category(frame, random_state=42):
    """Resample every category of `frame` to the size of its largest category.

    Categories smaller than the largest one are oversampled with replacement;
    a category that is already the largest is drawn without replacement
    (i.e. only shuffled).
    """
    max_count = frame["Category"].value_counts().max()
    resampled_parts = []
    for category in frame["Category"].unique():
        category_data = frame[frame["Category"] == category]
        resampled_parts.append(
            resample(
                category_data,
                replace=len(category_data) < max_count,
                n_samples=max_count,
                random_state=random_state,
            )
        )
    return pd.concat(resampled_parts)


# Balance dataset
# `df` is the balanced corpus that the TF-IDF matrix, the job embeddings and
# cleaned_dataset.csv are built from (unchanged from the previous behaviour).
df = _balance_by_category(raw_df)

# Label encoding and TF-IDF
label_encoder = LabelEncoder()
df["category_label"] = label_encoder.fit_transform(df["Category"])
joblib.dump(label_encoder, 'label_encoder.joblib')

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["clean_text"])
sparse.save_npz("tfidf_matrix.npz", tfidf_matrix)
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")
df.to_csv("cleaned_dataset.csv", index=False)

# Train-test split
# Split the *raw* rows first and oversample only the training partition.
# Splitting after oversampling put copies of the same row on both sides of
# the split, so the reported test accuracy was inflated by leaked duplicates.
train_df, test_df = train_test_split(raw_df, test_size=0.2, random_state=42)
train_df = _balance_by_category(train_df)
train_df = train_df.assign(category_label=label_encoder.transform(train_df["Category"]))
test_df = test_df.assign(category_label=label_encoder.transform(test_df["Category"]))

X_train, y_train = train_df["clean_text"], train_df["category_label"]
X_test, y_test = test_df["clean_text"], test_df["category_label"]

# BERT batch embedding
def get_batch_embeddings(texts, batch_size=16):
    """Embed a list of strings with BERT using attention-masked mean pooling.

    Texts are tokenized in chunks of `batch_size` (padded to the longest item
    in the chunk, truncated to 512 tokens) and run through `bert_model`
    without gradient tracking.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent through the model at once.

    Returns:
        A NumPy array with one pooled vector per input text, in input order.
    """
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Average only over real tokens. A plain .mean(dim=1) also averages
        # the padding positions, which makes a text's vector depend on how
        # long its batch neighbours are and disagree with the vector the same
        # text gets when it is embedded alone (as happens at inference time).
        hidden_states = outputs.last_hidden_state
        token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * token_mask).sum(dim=1)
        token_counts = token_mask.sum(dim=1).clamp(min=1)
        batch_embeddings = (summed / token_counts).cpu().numpy()

        embeddings.extend(batch_embeddings)
        torch.cuda.empty_cache()
    return np.array(embeddings)

# Generate embeddings
# Three passes through BERT: training texts, test texts, and the whole corpus
# (the last one backs the similarity search at inference time).
X_train_embeddings = get_batch_embeddings(X_train.tolist())
X_test_embeddings = get_batch_embeddings(X_test.tolist())
job_embeddings = get_batch_embeddings(list(df["clean_text"]))

# Persist every array next to the notebook, in the same order as before.
for npy_name, array in (
    ("X_train_embeddings.npy", X_train_embeddings),
    ("X_test_embeddings.npy", X_test_embeddings),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
    ("job_embeddings.npy", job_embeddings),
):
    np.save(npy_name, array)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_embeddings, y_train)
joblib.dump(clf, "classifier.joblib")

# Evaluate
accuracy = clf.score(X_test_embeddings, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Inference
# Reload every artifact from disk so the code below runs against exactly what
# was saved rather than against the in-memory training objects.
clf = joblib.load("classifier.joblib")
label_encoder = joblib.load("label_encoder.joblib")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.joblib")
tfidf_matrix = sparse.load_npz("tfidf_matrix.npz")
job_embeddings = np.load("job_embeddings.npy")
df = pd.read_csv("cleaned_dataset.csv")

def process_resume(file_path):
    """Match a resume file against the job corpus and return the best category.

    The resume text is extracted, cleaned with `preprocess_text`, and compared
    with every corpus row using an equal-weight blend of TF-IDF cosine
    similarity and BERT-embedding cosine similarity.

    Args:
        file_path: Path to a ".pdf" or ".docx" resume (extension is matched
            case-insensitively).

    Returns:
        A `(job_category, match_score)` tuple: the "Category" value of the
        corpus row with the highest blended similarity, and that similarity
        multiplied by 10.

    Raises:
        ValueError: If the file extension is neither .pdf nor .docx.
    """
    # Compare on a lower-cased copy so "CV.PDF" / "cv.Docx" are accepted;
    # the original path is still what gets opened.
    lowered_path = str(file_path).lower()
    if lowered_path.endswith(".pdf"):
        resume_text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        resume_text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")

    cleaned_resume = preprocess_text(resume_text)
    resume_embedding = get_batch_embeddings([cleaned_resume])[0].reshape(1, -1)

    resume_tfidf = tfidf_vectorizer.transform([cleaned_resume])
    tfidf_similarities = cosine_similarity(resume_tfidf, tfidf_matrix).flatten()
    bert_similarities = cosine_similarity(resume_embedding, job_embeddings).flatten()

    # Equal-weight blend of the lexical and the semantic similarity.
    final_similarity = 0.5 * tfidf_similarities + 0.5 * bert_similarities
    top_category_index = np.argmax(final_similarity)
    job_category = df.iloc[top_category_index]["Category"]
    match_score = final_similarity[top_category_index] * 10

    return job_category, match_score




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [4]:
import os
import re
import torch
import numpy as np
import pandas as pd
import pdfplumber
from docx import Document
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import joblib
from scipy import sparse

# Load models and data
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(DEVICE)
# The pipeline takes a device index: 0 selects the first GPU, -1 selects the CPU.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)

# Every persisted training artifact lives in this one Drive folder; point
# MODEL_DIR somewhere else to load a different set of artifacts.
MODEL_DIR = "/content/drive/MyDrive/RESUME(ATS) SAVED MODEL"

# NOTE(review): joblib.load unpickles arbitrary Python objects. Only load
# .joblib files that were produced by a trusted run of the training cell.
label_encoder = joblib.load(os.path.join(MODEL_DIR, "label_encoder.joblib"))
tfidf_vectorizer = joblib.load(os.path.join(MODEL_DIR, "tfidf_vectorizer.joblib"))
tfidf_matrix = sparse.load_npz(os.path.join(MODEL_DIR, "tfidf_matrix.npz"))
classifier = joblib.load(os.path.join(MODEL_DIR, "classifier.joblib"))
job_embeddings = np.load(os.path.join(MODEL_DIR, "job_embeddings.npy"))
df = pd.read_csv(os.path.join(MODEL_DIR, "cleaned_dataset.csv"))

# Preprocessing
def preprocess_text(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    The text is lower-cased, every character that is not a letter, digit or
    whitespace is deleted, the remainder is split into word tokens, English
    stop words are dropped and each surviving token is Porter-stemmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces ("" when nothing survives).
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = re.findall(r'\b\w+\b', text)

    # Build the stop-word set once per call. The previous version called
    # stopwords.words("english") for every token and tested membership
    # against the returned list, which dominated runtime on long documents.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Extract text
def extract_text(file_path):
    """Return the plain text of a PDF or DOCX file.

    For PDFs, each page that yields text contributes that text plus a newline
    and the result is stripped. For DOCX files, paragraph texts are joined by
    single spaces.

    Args:
        file_path: Path to a ".pdf" or ".docx" file (extension is matched
            case-insensitively).

    Raises:
        ValueError: If the file extension is neither .pdf nor .docx.
    """
    # Compare on a lower-cased copy so "CV.PDF" / "cv.Docx" are accepted;
    # the original path is still what gets opened.
    lowered_path = str(file_path).lower()
    if lowered_path.endswith(".pdf"):
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                extracted = page.extract_text()
                if extracted:
                    text += extracted + "\n"
        return text.strip()
    elif lowered_path.endswith(".docx"):
        doc = Document(file_path)
        return " ".join([para.text for para in doc.paragraphs])
    else:
        raise ValueError("Unsupported file format")

# BERT embedding
def get_embedding(text):
    """Encode `text` with BERT and mean-pool the last hidden layer over tokens.

    The input is tokenized with padding enabled and truncation at 512 tokens,
    run through `bert_model` without gradient tracking, and the pooled result
    is returned as a NumPy array on the CPU.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    encoded = encoded.to(DEVICE)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy()

# Main resume analysis
def analyze_resume(file_path):
    """Summarise a resume and score it against the job corpus.

    Args:
        file_path: Path to the resume; passed to `extract_text`.

    Returns:
        A dict with:
          - "summary": abstractive summary of the raw resume text,
          - "predicted_category_by_similarity": "Category" of the corpus row
            with the highest blended TF-IDF/BERT cosine similarity,
          - "predicted_category_by_classifier": category predicted by the
            loaded classifier from the BERT embedding,
          - "match_score": best blended similarity times 10, rounded to 2 dp,
          - "ats_score": match_score times 10, rounded to 2 dp.
    """
    # Step 1: Extract and clean text
    raw_text = extract_text(file_path)
    cleaned_text = preprocess_text(raw_text)

    # Step 2: Summarize
    # truncation=True clips the input to the summarizer's maximum length;
    # without it a long resume can exceed the model's input limit and fail.
    summary = summarizer(
        raw_text, max_length=150, min_length=50, do_sample=False, truncation=True
    )[0]["summary_text"]

    # Step 3: Embeddings and similarity
    resume_embedding = get_embedding(cleaned_text).reshape(1, -1)
    resume_tfidf = tfidf_vectorizer.transform([cleaned_text])

    tfidf_scores = cosine_similarity(resume_tfidf, tfidf_matrix).flatten()
    bert_scores = cosine_similarity(resume_embedding, job_embeddings).flatten()
    # Equal-weight blend of the lexical and the semantic similarity.
    final_similarity = 0.5 * tfidf_scores + 0.5 * bert_scores

    best_index = np.argmax(final_similarity)
    predicted_category = df.iloc[best_index]["Category"]
    match_score = final_similarity[best_index] * 10
    ats_score = match_score * 10  # Optional scaling to 100

    # Step 4: Classification (optional)
    predicted_label = classifier.predict(resume_embedding)[0]
    classified_category = label_encoder.inverse_transform([predicted_label])[0]

    return {
        "summary": summary,
        "predicted_category_by_similarity": predicted_category,
        "predicted_category_by_classifier": classified_category,
        "match_score": round(match_score, 2),
        "ats_score": round(ats_score, 2)
    }


Device set to use cuda:0


In [10]:
# Run the full analysis on one uploaded resume and report the headline numbers.
result = analyze_resume("/content/Ankit Acharjee_203.pdf")

# Heading and summary go out in one call; sep="\n" keeps them on separate lines.
print("🔹 Resume Summary:", result["summary"], sep="\n")
print("\n Predicted Category (Cosine Similarity):", result["predicted_category_by_similarity"])
print(f" Match Score: {result['match_score']}/10")
print(f" ATS Score: {result['ats_score']}/100")




🔹 Resume Summary:
Ankit AcharJEE is a graduate of Tripura Institute of Technology, Narsingarh, Tripura, India. He is currently working as a React Developer Intern at GitHub. He has developed a video conferencing web application using Next.js 14.

 Predicted Category (Cosine Similarity): React Developer
 Match Score: 6.11/10
 ATS Score: 61.07/100
