# In [None]:
# resume_inference_superclean.py

import os
import re
import joblib
import docx
import pdfplumber
import pandas as pd
from scipy.sparse import hstack


# ------------------------------------------------------------------
# PATHS
# ------------------------------------------------------------------
# All locations live under /content/drive/MyDrive (presumably a Google Drive
# mount in Colab — adjust when running elsewhere).

# joblib bundle holding the fitted vectorizer, classifier, label list and action verbs.
ARTIFACTS_PATH = "/content/drive/MyDrive/Resume_Score/resume_pipeline_output/resume_pipeline_artifacts.joblib"
# Folder scanned for input resumes (.pdf, .docx and .txt files are processed).
RESUME_FOLDER = "/content/drive/MyDrive/Resume_Score/resumes/"
# Output CSV: one row per extracted sentence with its predicted category.
OUTPUT_SENTENCE_CSV = "/content/drive/MyDrive/Resume_Score/superclean_sentence_predictions.csv"
# Output CSV: one row per resume with its aggregate scores.
OUTPUT_RESUME_SCORES = "/content/drive/MyDrive/Resume_Score/superclean_resume_scores.csv"


# ------------------------------------------------------------------
# LOAD MODEL
# ------------------------------------------------------------------
# Everything the inference code needs comes out of a single joblib bundle.
# NOTE: joblib.load unpickles arbitrary objects — only point ARTIFACTS_PATH at
# files you produced yourself or otherwise trust.
artifacts = joblib.load(ARTIFACTS_PATH)
tfidf, clf = artifacts["tfidf"], artifacts["logistic_clf"]
categories = [*artifacts["categories"]]  # materialized as a plain list
action_verbs = artifacts["action_verbs"]


# ------------------------------------------------------------------
# TEXT EXTRACTION
# ------------------------------------------------------------------
def extract_text(path):
    """Return the plain text of a resume file, or "" for unsupported types.

    The file type is chosen from the extension, compared case-insensitively
    so that names such as ``CV.PDF`` or ``Resume.Docx`` are handled the same
    way as their lower-case forms.

    * ``.txt``  - file content decoded as UTF-8; undecodable bytes are dropped.
    * ``.docx`` - the text of every paragraph in ``doc.paragraphs``, joined
      with newlines.
    * ``.pdf``  - the extracted text of each page followed by a newline;
      pages for which no text is extracted are skipped.

    Args:
        path: Path of the file to read.

    Returns:
        The extracted text, or an empty string when the extension is not one
        of the three supported types.
    """
    lowered = path.lower()

    if lowered.endswith(".txt"):
        # Context manager guarantees the handle is closed; explicit encoding
        # keeps the result independent of the platform's locale.
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    if lowered.endswith(".docx"):
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs)

    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            # Joined while the PDF is still open: pages are read lazily.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "".join(t + "\n" for t in page_texts if t)

    return ""


# ------------------------------------------------------------------
# SENTENCE SPLIT
# ------------------------------------------------------------------
def extract_sentences(text):
    """Split raw resume text into trimmed sentence-like fragments.

    Newlines are treated as sentence ends, the text is then cut at every
    ``.``, ``!`` and ``?``, and only fragments longer than 8 characters
    (after trimming whitespace) are kept.

    Note that every period splits, so a decimal such as ``3.5`` ends up in
    two different fragments.

    Args:
        text: Raw text of one resume.

    Returns:
        List of kept fragments, in document order.
    """
    flattened = text.replace("\n", ". ")
    sentences = []
    for fragment in re.split(r"[.!?]", flattened):
        cleaned = fragment.strip()
        if len(cleaned) > 8:
            sentences.append(cleaned)
    return sentences


# ------------------------------------------------------------------
# PERSONAL DETAILS & NON-TECH DETECTION
# ------------------------------------------------------------------
PERSONAL_KEYWORDS = [
    "name", "phone", "email", "gmail", "linkedin", "dob", "date of birth",
    "address", "age", "contact", "father", "mother", "nationality", "pincode"
]

EDUCATION_KEYWORDS = [
    "bachelor", "master", "degree", "university", "college",
    "percentage", "cgpa", "puc", "sslc", "class x", "class 12",
    "puc ii", "semester", "graduation",
    # Listed explicitly because matching is whole-word: "class x" no longer
    # catches these by prefix.
    "class xi", "class xii",
]

# Whole-word (optionally plural) match of any keyword above. Plain substring
# tests made "age" fire on "image", "language" or "managed" and "name" on
# "rename", which forced ordinary technical sentences into OTHER.
# Compiled once at import time: later edits to the two lists are not picked up.
_PERSONAL_OR_EDUCATION_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(k) for k in PERSONAL_KEYWORDS + EDUCATION_KEYWORDS)
    + r")s?\b"
)


def is_personal_or_education(sentence):
    """Tell whether a sentence mentions a personal-detail or education keyword.

    Matching is case-insensitive and on whole words or phrases; a trailing
    plural ``s`` is tolerated (``masters``, ``degrees``).

    Args:
        sentence: Sentence text to inspect.

    Returns:
        True if any entry of PERSONAL_KEYWORDS or EDUCATION_KEYWORDS occurs
        in the sentence as a whole word or phrase, else False.
    """
    return _PERSONAL_OR_EDUCATION_RE.search(sentence.lower()) is not None


# ------------------------------------------------------------------
# CATEGORY KEYWORD FILTERS (hard rules)
# ------------------------------------------------------------------

# Each list holds lower-case keywords; a prediction for the matching category
# is only kept when the sentence contains at least one of them.

# Required evidence for the "ML / Modeling" category.
ML_KEYWORDS = [
    # general terms
    "machine learning",
    "ml",
    "model",
    "training",
    "regression",
    "classification",
    "prediction",
    "algorithm",
    # model families
    "deep learning",
    "neural network",
    "xgboost",
    "random forest",
    "svm",
    # workflow and metrics
    "pipeline",
    "accuracy",
    "f1",
    "precision",
    "recall",
]

# Required evidence for the "NLP" category.
NLP_KEYWORDS = [
    "nlp",
    "text",
    "bert",
    "gpt",
    "token",
    "embedding",
    "sentiment",
    "ner",
    "transformer",
    "language model",
    "translation",
    "summarization",
]

# Required evidence for the "Computer Vision" category.
CV_KEYWORDS = [
    "image",
    "vision",
    "opencv",
    "cnn",
    "camera",
    "pixels",
    "yolo",
    "face detection",
    "object detection",
    "segmentation",
    "frames",
]


def contains_any(sentence, keywords):
    """Report whether the lower-cased sentence contains any of the keywords.

    This is a plain substring test, not a word match: a short keyword such
    as "ml" is also found inside "html". Keywords are compared as given, so
    they should already be lower-case.

    Args:
        sentence: Sentence text to inspect.
        keywords: Iterable of keyword strings.

    Returns:
        True as soon as one keyword is found, False if none is (including
        when ``keywords`` is empty).
    """
    lowered = sentence.lower()
    for keyword in keywords:
        if keyword in lowered:
            return True
    return False


# ------------------------------------------------------------------
# HYBRID PREDICT WITH ALL FILTERS (THE FIX)
# ------------------------------------------------------------------
def hybrid_predict(sentence):
    """Classify one sentence, overriding the model with hard rules.

    The label is "OTHER" when any of the following holds, otherwise it is
    the classifier's prediction:

    * the sentence looks like personal or education details;
    * the classifier's highest class probability is below 0.1;
    * the predicted label is "ML / Modeling", "NLP" or "Computer Vision"
      but the sentence contains none of that category's keywords.

    Args:
        sentence: Sentence text to classify.

    Returns:
        "OTHER" or the label returned by ``clf.predict``.
    """
    # Personal / education -> always OTHER; the classifier is not consulted.
    if is_personal_or_education(sentence):
        return "OTHER"

    # Feature row = TF-IDF vector plus one binary "has an action verb" column.
    # action_verbs must offer .intersection (presumably a set — verify
    # against the training pipeline that produced the artifacts).
    tfidf_row = tfidf.transform([sentence])
    tokens = sentence.lower().split()
    has_action_verb = 1 if action_verbs.intersection(tokens) else 0
    features = hstack([tfidf_row, [[has_action_verb]]])

    class_probs = clf.predict_proba(features)[0]
    label = clf.predict(features)[0]

    # Rule 1: reject low-confidence predictions.
    # NOTE(review): the top probability of an N-class model is at least 1/N,
    # so this only triggers with more than 10 classes — confirm the threshold.
    if max(class_probs) < 0.1:
        return "OTHER"

    # Rules 2-4: a technical label must be backed by a category keyword.
    keyword_guards = (
        ("ML / Modeling", ML_KEYWORDS),
        ("NLP", NLP_KEYWORDS),
        ("Computer Vision", CV_KEYWORDS),
    )
    for guarded_label, required_keywords in keyword_guards:
        if label == guarded_label and not contains_any(sentence, required_keywords):
            return "OTHER"

    return label


# ------------------------------------------------------------------
# PYTHON SKILL SCORE
# ------------------------------------------------------------------
# Lower-case names whose presence marks a sentence as Python-related.
PYTHON_KEYS = [
    "python",
    "numpy",
    "pandas",
    "sklearn",
    "tensorflow",
    "pytorch",
    "keras",
    "flask",
    "fastapi",
    "django",
    "matplotlib",
    "jupyter",
]


def python_score(sentences):
    """Return the percentage of sentences that mention a Python keyword.

    A sentence counts when its lower-cased text contains any entry of
    PYTHON_KEYS as a substring.

    Args:
        sentences: List of sentence strings.

    Returns:
        A float between 0 and 100, or 0 when ``sentences`` is empty.
    """
    matching = 0
    for sentence in sentences:
        lowered = sentence.lower()
        if any(key in lowered for key in PYTHON_KEYS):
            matching += 1

    if not sentences:
        return 0
    return (matching / len(sentences)) * 100


# ------------------------------------------------------------------
# PROCESS RESUMES
# ------------------------------------------------------------------
# Finds experience durations in lower-cased text and captures the whole-number
# part: "5 years", "3+ years", "2yrs", "1 year", "3.5 years" (-> 3).
#   (?<![\d.])   start at the beginning of a number, so "3.5 years" is not
#                read as "5 years"
#   \+?          allow the common "3+ years" phrasing
#   \b           do not match inside longer words such as "yearly"
_EXPERIENCE_RE = re.compile(r"(?<![\d.])(\d+)(?:\.\d+)?\s*\+?\s*(?:years?|yrs?)\b")

sentence_rows = []  # one [file, sentence, category] row per sentence
resume_rows = []    # one score row per resume

# sorted() keeps the output row order reproducible; os.listdir order is arbitrary.
for file in sorted(os.listdir(RESUME_FOLDER)):
    if not file.lower().endswith((".pdf", ".docx", ".txt")):
        continue

    print("Processing:", file)
    path = os.path.join(RESUME_FOLDER, file)
    text = extract_text(path)
    sentences = extract_sentences(text)

    preds = [hybrid_predict(s) for s in sentences]

    # SAVE PER-SENTENCE OUTPUT
    sentence_rows.extend([file, s, p] for s, p in zip(sentences, preds))

    # SCORE RESUME: share of sentences per category, as a percentage.
    # A resume with no usable sentences scores 0 instead of dividing by zero.
    total = len(preds)
    ml = preds.count("ML / Modeling") / total * 100 if total else 0
    nlp = preds.count("NLP") / total * 100 if total else 0
    py = python_score(sentences)

    # Experience: the largest duration mentioned anywhere in the resume,
    # scaled linearly so that 5 or more years gives the full 100.
    yrs = max((int(n) for n in _EXPERIENCE_RE.findall(text.lower())), default=0)
    exp = 100 if yrs >= 5 else (yrs / 5 * 100)

    # Overall score is the unweighted mean of the four component scores.
    overall = (ml + nlp + py + exp) / 4

    resume_rows.append([
        file, round(ml, 2), round(nlp, 2),
        round(py, 2), yrs, round(exp, 2),
        round(overall, 2)
    ])


# ------------------------------------------------------------------
# SAVE OUTPUT
# ------------------------------------------------------------------
# Per-sentence predictions: one row per (resume, sentence).
sentence_columns = ["Resume", "Sentence", "Category"]
sentence_df = pd.DataFrame(sentence_rows, columns=sentence_columns)
sentence_df.to_csv(OUTPUT_SENTENCE_CSV, index=False)

# Per-resume scores: one row per processed file.
score_columns = [
    "Resume",
    "AI/ML Match (%)",
    "NLP Match (%)",
    "Python Match (%)",
    "Years Experience",
    "Experience Score (%)",
    "Overall Score (%)",
]
score_df = pd.DataFrame(resume_rows, columns=score_columns)
score_df.to_csv(OUTPUT_RESUME_SCORES, index=False)

print("\nSUPER CLEAN INFERENCE DONE.")
for saved_path in (OUTPUT_SENTENCE_CSV, OUTPUT_RESUME_SCORES):
    print("Saved:", saved_path)
