# In [None]:
# resume_pipeline.py
import os
import random
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import matplotlib.pyplot as plt

# Seed both random number generators so repeated runs are reproducible.
np.random.seed(42)
random.seed(42)

# -------- CONFIG --------
# Input data and output folder (edit INPUT_CSV if your file has a different
# name or path). The output folder is created as soon as the module loads.
INPUT_CSV = "big_resume_training_data.csv"
OUTPUT_DIR = "resume_pipeline_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# TF-IDF vectorizer settings: unigrams + bigrams, capped vocabulary size.
TFIDF_NGRAM_RANGE = (1, 2)
TFIDF_MAX_FEATURES = 10000

# Hybrid weights (tunable): one weight per signal in the combined score.
W_CLF = 0.45     # classifier probability
W_COS = 0.30     # cosine similarity to the category description
W_ACTION = 0.15  # action-verb flag
W_KW = 0.10      # keyword evidence

# Action verbs (simple list - expand for better recall)
ACTION_VERBS = {
    "built",
    "created",
    "deployed",
    "designed",
    "developed",
    "engineered",
    "implemented",
    "improved",
    "led",
    "optimized",
    "trained",
    "used",
}

# -------- Helper functions --------
def action_verb_flag(sentence: str) -> int:
    """Return 1 if *sentence* contains at least one action verb, else 0.

    The sentence is lower-cased and every non-alphanumeric character is
    treated as a word separator, so verbs followed or wrapped by punctuation
    ("Developed:", "(built)", "led!") are still recognised. Matching is on
    whole words against the module-level ``ACTION_VERBS`` set.

    Args:
        sentence: Text to inspect. Non-string values (e.g. NaN from a CSV
            with empty cells) are treated as containing no action verb.

    Returns:
        1 when any word of the sentence is in ``ACTION_VERBS``, otherwise 0.
    """
    if not isinstance(sentence, str):
        return 0
    # Replace every punctuation/symbol character with a space before
    # splitting, rather than special-casing only ';', '.' and ','.
    normalized = "".join(ch if ch.isalnum() else " " for ch in sentence.lower())
    return int(not ACTION_VERBS.isdisjoint(normalized.split()))

def keyword_predict_simple(sentence: str, categories: list) -> "tuple[str, dict]":
    """Very simple baseline: score each category by its name's words.

    Each category name is lower-cased and split on whitespace; the score of a
    category is the total number of (substring) occurrences of those words in
    the lower-cased sentence.

    Args:
        sentence: Text to classify.
        categories: Category names to choose from.

    Returns:
        A ``(best, counts)`` pair: ``counts`` maps every category to its
        score and ``best`` is the highest-scoring category. On ties the
        category that appears first in ``categories`` wins.

    Raises:
        ValueError: If ``categories`` is empty.
    """
    if not categories:
        raise ValueError("categories must contain at least one category name")

    text = sentence.lower()
    counts = {
        cat: sum(text.count(tok) for tok in cat.lower().split())
        for cat in categories
    }
    # max() returns the first maximal key, which gives the documented
    # first-in-list tie-break because dicts preserve insertion order.
    best = max(counts, key=counts.get)
    return best, counts

def keyword_evidence(sentence: str, categories: list):
    """Score each category by the share of its name's words found in the sentence.

    A category name is lower-cased and split on whitespace; a word counts as
    found when it occurs as a substring of the lower-cased sentence. The
    result maps each category to ``found / number_of_words`` (the divisor is
    at least 1, so a name with no words scores 0.0).
    """
    lowered = sentence.lower()

    def _fraction_present(category):
        words = category.lower().split()
        hits = len([word for word in words if word in lowered])
        return hits / max(1, len(words))

    return {category: _fraction_present(category) for category in categories}

# -------- Main pipeline --------
def _save_accuracy_chart(acc_table, chart_path):
    """Save a bar chart of per-model accuracy to *chart_path*.

    Args:
        acc_table: DataFrame with a "model" column (bar labels) and an
            "accuracy" column (bar heights).
        chart_path: Destination path of the image file.
    """
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.bar(acc_table["model"].tolist(), acc_table["accuracy"].tolist())
    ax.set_ylim(0, 1)
    ax.set_ylabel("Accuracy")
    ax.set_title("Model accuracy comparison")
    ax.tick_params(axis="x", labelrotation=20)
    fig.tight_layout()
    fig.savefig(chart_path)
    # Close the figure explicitly so it is released once written to disk.
    plt.close(fig)


def main():
    """Train, evaluate and persist the resume-sentence classification pipeline.

    Steps:
      1. Load INPUT_CSV (must have 'sentence' and 'label' columns) and drop
         rows where either value is missing.
      2. Make a stratified 80/20 train/test split.
      3. Predict the test set with four approaches: keyword baseline,
         logistic regression on TF-IDF + action-verb flag, cosine similarity
         to the category descriptions, and a weighted hybrid of the signals.
      4. Print an accuracy comparison and the hybrid classification report.
      5. Save the fitted artifacts (joblib), the test predictions (CSV) and
         an accuracy bar chart (PNG) into OUTPUT_DIR.

    Raises:
        ValueError: If the CSV lacks a 'sentence' or 'label' column.
    """
    print("Loading CSV:", INPUT_CSV)
    df = pd.read_csv(INPUT_CSV)
    if "sentence" not in df.columns or "label" not in df.columns:
        raise ValueError("CSV must contain 'sentence' and 'label' columns")

    # Rows without a sentence or label cannot be vectorized or scored (the
    # text helpers call .lower() on them), so drop them up front.
    n_rows = len(df)
    df = df.dropna(subset=["sentence", "label"]).reset_index(drop=True)
    if len(df) < n_rows:
        print("Dropped rows with missing sentence/label:", n_rows - len(df))

    categories = list(df["label"].unique())
    print("Detected categories:", categories)

    # Build simple category descriptions (here a trivial one; replace with richer descriptions if available)
    category_descriptions = {c: " ".join(c.lower().split()) for c in categories}

    # Train/test split (stratified)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
    test_df = test_df.copy().reset_index(drop=True)

    # Baseline predictions
    test_df["baseline_pred"] = [
        keyword_predict_simple(sentence, categories)[0] for sentence in test_df["sentence"]
    ]

    # TF-IDF (vocabulary is learned from the training sentences only)
    tfidf = TfidfVectorizer(ngram_range=TFIDF_NGRAM_RANGE, max_features=TFIDF_MAX_FEATURES)
    X_train_tfidf = tfidf.fit_transform(train_df["sentence"])
    X_test_tfidf = tfidf.transform(test_df["sentence"])

    # Vectorize category descriptions
    cat_desc_vecs = tfidf.transform([category_descriptions[c] for c in categories])

    # Action flag feature, shaped (n, 1) so it can be stacked as one column
    train_action = train_df["sentence"].apply(action_verb_flag).values.reshape(-1, 1)
    test_action = test_df["sentence"].apply(action_verb_flag).values.reshape(-1, 1)

    X_train = hstack([X_train_tfidf, train_action])
    X_test = hstack([X_test_tfidf, test_action])

    # Labels
    le = LabelEncoder()
    y_train = le.fit_transform(train_df["label"])

    # Train Logistic Regression. `multi_class` is intentionally not passed:
    # it is deprecated in recent scikit-learn, and lbfgs already uses the
    # multinomial formulation for problems with three or more classes.
    clf = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
    clf.fit(X_train, y_train)

    clf_probs = clf.predict_proba(X_test)
    test_df["clf_pred"] = le.inverse_transform(clf.predict(X_test))
    test_df["clf_confidence"] = clf_probs.max(axis=1)

    # Cosine similarity predictions
    cos_sims = cosine_similarity(X_test_tfidf, cat_desc_vecs)  # sentence vs category desc
    cos_df = pd.DataFrame(cos_sims, columns=categories, index=test_df.index)
    test_df["cos_pred"] = cos_df.idxmax(axis=1)
    test_df["cos_max"] = cos_df.max(axis=1)

    # Keyword evidence fields
    kw_list = [keyword_evidence(s, categories) for s in test_df["sentence"]]
    kw_df = pd.DataFrame(kw_list, index=test_df.index)
    test_df = pd.concat([test_df, kw_df.add_prefix("kw_")], axis=1)
    test_df["kw_top_cat"] = kw_df.idxmax(axis=1)
    test_df["kw_max"] = kw_df.max(axis=1)

    # Hybrid scoring, computed for all sentences and categories at once.
    # Probability columns follow le.classes_ order, so reorder them to match
    # `categories`, the column order of cos_sims and kw_df.
    class_position = {label: j for j, label in enumerate(le.classes_)}
    column_order = [class_position[cat] for cat in categories]
    probs_by_category = clf_probs[:, column_order]
    # NOTE: the action-verb term is the same for every category of a given
    # sentence, so it shifts that sentence's scores without changing which
    # category wins.
    hybrid_scores = (
        W_CLF * probs_by_category
        + W_COS * cos_sims
        + W_ACTION * test_action
        + W_KW * kw_df[categories].values
    )
    # argmax returns the first maximum, so ties go to the earliest category.
    test_df["hybrid_pred"] = [categories[j] for j in hybrid_scores.argmax(axis=1)]

    # Metrics
    baseline_acc = accuracy_score(test_df["label"], test_df["baseline_pred"])
    clf_acc = accuracy_score(test_df["label"], test_df["clf_pred"])
    cos_acc = accuracy_score(test_df["label"], test_df["cos_pred"])
    hybrid_acc = accuracy_score(test_df["label"], test_df["hybrid_pred"])

    acc_table = pd.DataFrame({
        "model": ["Keyword-only (baseline)", "Logistic Regression (clf only)", "Cosine-similarity (alone)", "Hybrid (combined)"],
        "accuracy": [baseline_acc, clf_acc, cos_acc, hybrid_acc]
    })

    print("\nAccuracy comparison:")
    print(acc_table.to_string(index=False))

    print("\nHybrid classification report:")
    print(classification_report(test_df["label"], test_df["hybrid_pred"], digits=4))

    # Save artifacts
    artifacts = {
        "tfidf": tfidf,
        "logistic_clf": clf,
        "label_encoder": le,
        "categories": categories,
        "action_verbs": list(ACTION_VERBS),
        "weights": {"clf": W_CLF, "cos": W_COS, "action": W_ACTION, "kw": W_KW}
    }
    artifacts_path = os.path.join(OUTPUT_DIR, "resume_pipeline_artifacts.joblib")
    joblib.dump(artifacts, artifacts_path)
    preds_path = os.path.join(OUTPUT_DIR, "resume_pipeline_test_predictions.csv")
    test_df.to_csv(preds_path, index=False)

    # Write the accuracy chart; the final print below reports its path.
    chart_path = os.path.join(OUTPUT_DIR, "accuracy_comparison.png")
    _save_accuracy_chart(acc_table, chart_path)

    print("\nSaved artifacts:", artifacts_path)
    print("Saved test predictions CSV:", preds_path)
    print("Saved accuracy chart:", chart_path)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
