In [29]:
# (run once in the notebook if packages are missing)
# %pip install scikit-learn pandas numpy sentence-transformers

# Imports
import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
from sklearn.preprocessing import StandardScaler
# Pin the global NumPy and stdlib random generators to the same fixed value
# so that every stochastic step below is repeatable on a fresh kernel.
np.random.seed(seed=42)
random.seed(a=42)


# Build synthetic dataset

In [30]:
# Create templates for questions and positive chunks (personal/resume-style)
questions = [
    "Machine learning experience",
    "Python libraries",
    "Education background",
    "Work experience",
    "Projects completed",
    "Research interests",
    "Data engineering skills",
    "Cloud experience",
    "DevOps experience",
    "Publication list"
]

# For each question, the chunks that count as a relevant (label 1) answer.
positive_templates = {
    "Machine learning experience": [
        "He has 5 years of ML experience, worked on classification and regression.",
        "Worked with training pipelines, hyperparameter tuning and model evaluation."
    ],
    "Python libraries": [
        "Uses numpy, pandas, scikit-learn, and matplotlib regularly.",
        "Familiar with PyTorch and TensorFlow for model building."
    ],
    "Education background": [
        "Completed BSc in Computer Science from XYZ University, graduated 2020.",
        "MSc in AI with focus on probabilistic models."
    ],
    "Work experience": [
        "Worked as ML Engineer at Acme Corp from 2021 to 2024.",
        "Interned at Data Labs building ETL pipelines and dashboards."
    ],
    "Projects completed": [
        "Built an end-to-end RAG chatbot for enterprise FAQs.",
        "Deployed a recommendation system using collaborative filtering."
    ],
    "Research interests": [
        "Interested in representation learning and retrieval methods.",
        "Exploring contrastive learning for domain adaptation."
    ],
    "Data engineering skills": [
        "Experience with Airflow, Spark, and data lake architectures.",
        "Built ETL pipelines using Python and SQL for high-throughput data."
    ],
    "Cloud experience": [
        "Deployed microservices on AWS and GCP, used serverless functions.",
        "Managed infra via Terraform and monitored with Prometheus."
    ],
    "DevOps experience": [
        "CI/CD pipelines using GitHub Actions and Docker containers.",
        "Automated testing and deployment for ML services."
    ],
    "Publication list": [
        "Co-authored a workshop paper on RAG at a regional conference.",
        "Published blog posts on ML interpretability and retrieval."
    ],
}

# Generic negative chunks (irrelevant)
negative_snippets = [
    "Company cafeteria menu and working hours details.",
    "Office location and parking instructions.",
    "Holiday policy and internal social events.",
    "Financial statements and investor relations notes.",
    "Advertising copy and marketing slogans."
]


def build_records(num_samples=100, seed=42):
    """Generate labelled (question, chunk) pairs from the templates above.

    Labels alternate 1, 0, 1, 0, ... so the overall split is balanced; the
    question for each row is drawn at random, so per-question counts are NOT
    balanced. Negatives are a generic snippet about 60% of the time and
    otherwise a "hard negative": a positive chunk belonging to a different
    question.

    A private ``random.Random(seed)`` is used instead of the module-level
    generator, so re-running this cell always yields the same rows no matter
    what other cells did to the global random state. On a fresh kernel this
    is the same draw sequence as calling ``random.seed(seed)`` immediately
    before the loop.

    Parameters
    ----------
    num_samples : int
        Number of rows to generate.
    seed : int
        Seed for the private generator.

    Returns
    -------
    list[dict]
        One dict per row with keys "question", "chunk" and "label".
    """
    rng = random.Random(seed)
    rows = []
    for i in range(num_samples):
        q = rng.choice(questions)
        # even rows are positives, odd rows are negatives
        label = 1 if i % 2 == 0 else 0
        if label == 1:
            chunk = rng.choice(positive_templates[q])
        elif rng.random() < 0.6:
            # easy negative: text unrelated to any question
            chunk = rng.choice(negative_snippets)
        else:
            # hard negative: positive text for a different question
            other_q = rng.choice([qq for qq in questions if qq != q])
            chunk = rng.choice(positive_templates[other_q])
        rows.append({"question": q, "chunk": chunk, "label": label})
    return rows


# Build dataset: 100 records, half positive and half negative overall
# (questions are sampled at random, so per-question counts vary)
num_samples = 100
records = build_records(num_samples, seed=42)

# Convert to DataFrame
df = pd.DataFrame(records)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle
df.head(8)


Unnamed: 0,question,chunk,label
0,Education background,Office location and parking instructions.,0
1,Education background,Advertising copy and marketing slogans.,0
2,Machine learning experience,"He has 5 years of ML experience, worked on cla...",1
3,DevOps experience,"He has 5 years of ML experience, worked on cla...",0
4,DevOps experience,Automated testing and deployment for ML services.,1
5,Work experience,MSc in AI with focus on probabilistic models.,0
6,Cloud experience,Managed infra via Terraform and monitored with...,1
7,Python libraries,"Uses numpy, pandas, scikit-learn, and matplotl...",1


# Inspect class balance & basic stats

In [31]:
# Overall size and label distribution of the synthetic dataset.
print(f"Total samples: {len(df)}")
print(df['label'].value_counts())

# Label counts broken down per question (rows: question, columns: label).
print("\nSample grouped counts by question:")
print(
    df.groupby('question')['label']
    .value_counts()
    .unstack(fill_value=0)
)


Total samples: 100
label
0    50
1    50
Name: count, dtype: int64

Sample grouped counts by question:
label                        0   1
question                          
Cloud experience             4   4
Data engineering skills      5   5
DevOps experience            5   7
Education background         6   6
Machine learning experience  6   5
Projects completed           5   3
Publication list             4   4
Python libraries             5  10
Research interests           2   4
Work experience              8   2


# Prepare features (concatenate question + chunk) and vectorize

In [32]:
# Join question and chunk into one text field so a single bag-of-words
# representation carries both sides of the pair.
df['text'] = df['question'].str.cat(df['chunk'], sep=" ")

# NOTE(review): the vectorizer is fit on every row of df, including rows that
# the later train_test_split call places in the test set.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(df['text'])  # sparse TF-IDF matrix, one row per sample
y = df['label'].to_numpy()

print("X shape:", X.shape)
print(
    "Sample feature names (first 20):",
    vectorizer.get_feature_names_out()[:20],
)


X shape: (100, 142)
Sample feature names (first 20): ['2020' '2021' '2024' 'acme' 'actions' 'adaptation' 'advertising' 'ai'
 'airflow' 'architectures' 'authored' 'automated' 'aws' 'background'
 'blog' 'bsc' 'building' 'built' 'cafeteria' 'cd']


# Train / test split

In [35]:
# Stratified 80/20 hold-out split. The DataFrame row indices are split
# alongside X and y so each train/test sample can be traced back to df.
(
    X_train, X_test,
    y_train, y_test,
    idx_train, idx_test,
) = train_test_split(
    X,
    y,
    df.index.values,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")


Train samples: 80
Test samples: 20


# Train classifier (Logistic Regression)

In [36]:
# Fit a class-weighted logistic regression on the TF-IDF training split.
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    max_iter=2000,
).fit(X_train, y_train)
print("Model trained.")


Model trained.


# Evaluate on test set and print metrics + confusion matrix

In [37]:
# Hard predictions and positive-class probabilities for the held-out rows.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Summary metrics; zero_division=0 makes an undefined score report 0.
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    metric(y_test, y_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

for metric_name, metric_value in (
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1", f1),
):
    print(f"{metric_name}:", round(metric_value, 3))
print("\nConfusion matrix:\n", cm)


Accuracy: 0.9
Precision: 0.9
Recall: 0.9
F1: 0.9

Confusion matrix:
 [[9 1]
 [1 9]]


# Use the trained model to rank candidate chunks for a new question

In [54]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Candidate pool: every chunk in df, in row order (repeated chunk texts are
# kept, so positions line up with df rows).
candidate_chunks = df['chunk'].tolist()
candidate_files = df['question'].tolist()  # using question as doc id

# Step 1: Encode chunks
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_vectors = embed_model.encode(candidate_chunks, convert_to_numpy=True)

# Step 2: Train a simple classifier on your labeled data
# Assume df has columns: ['question', 'chunk', 'label']
#
# NOTE: this rebinds X_train, y_train and clf from the TF-IDF cells above.
# From here on `clf` expects sentence-embedding inputs, and it is fit on ALL
# rows of df (there is no held-out split), so re-running the earlier
# evaluation cell after this one would mix incompatible objects.
#
# The texts are passed as a plain list rather than a pandas Series so that
# encode() never depends on the Series' index labels.
pair_texts = (df['question'] + " " + df['chunk']).tolist()
X_train = embed_model.encode(pair_texts, convert_to_numpy=True)
y_train = df['label'].values
# Same estimator settings as the TF-IDF classifier trained earlier.
clf = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Step 3: Rank candidates for a new question
def rank_candidates_for_question(question_text, top_k=5):
    """Rank the candidate chunks for a free-text question.

    Each candidate is scored two ways: the classifier's positive-class
    probability for the text "<question> <chunk>", and the cosine similarity
    between the question embedding and the chunk embedding. Candidates are
    ordered by probability (highest first); equal probabilities are ordered
    by similarity (highest first).

    Relies on the notebook-level objects ``embed_model``, ``clf``,
    ``candidate_chunks``, ``candidate_files`` and ``chunk_vectors``.

    Parameters
    ----------
    question_text : str
        The question to rank chunks for.
    top_k : int
        Maximum number of results to return.

    Returns
    -------
    list[dict]
        Up to ``top_k`` dicts with keys "chunk", "source_question", "prob"
        and "sim", best first. Each distinct chunk text appears at most once,
        so fewer than ``top_k`` items are returned when the pool has fewer
        distinct chunks. An empty list is returned for ``top_k <= 0`` or an
        empty candidate pool.
    """
    if top_k <= 0 or not candidate_chunks:
        return []

    # Encode question + candidate chunks for classifier
    pair_texts = [question_text + " " + chunk for chunk in candidate_chunks]
    query_vectors = embed_model.encode(pair_texts, convert_to_numpy=True)
    probs = clf.predict_proba(query_vectors)[:, 1]

    # Compute cosine similarity between question and chunks
    q_vec = embed_model.encode([question_text], convert_to_numpy=True)
    sims = cosine_similarity(q_vec, chunk_vectors).flatten()

    # Rank by classifier probability, tie-breaker similarity.
    # np.lexsort treats the LAST key as primary, so this sorts ascending by
    # probs then sims; reversing gives descending order on both.
    order = np.lexsort((sims, probs))[::-1]

    top = []
    seen_chunks = set()
    for i in order:
        chunk = candidate_chunks[i]
        # The pool can hold the same chunk text many times; identical text
        # gets identical scores, so keep only the first occurrence.
        if chunk in seen_chunks:
            continue
        seen_chunks.add(chunk)
        top.append({
            "chunk": chunk,
            "source_question": candidate_files[i],
            "prob": float(probs[i]),
            "sim": float(sims[i])
        })
        if len(top) == top_k:
            break
    return top

# Example usage
question = "Weather?"
top_results = rank_candidates_for_question(question, top_k=5)
for i, r in enumerate(top_results, 1):
    print(f"Rank {i}: prob={r['prob']:.3f}, sim={r['sim']:.3f}")
    print("Chunk:", r['chunk'])
    print("Source Question:", r['source_question'])
    print("----")


Rank 1: prob=0.406, sim=0.111
Chunk: Used numpy, pandas, sklearn...
Source Question: Python libraries
----
Rank 2: prob=0.379, sim=0.054
Chunk: He has 5 years of ML experience...
Source Question: Machine learning experience
----
Rank 3: prob=0.319, sim=0.068
Chunk: Company history overview...
Source Question: Machine learning experience
----
Rank 4: prob=0.292, sim=0.022
Chunk: Employee benefits details...
Source Question: Python libraries
----
Rank 5: prob=0.281, sim=0.047
Chunk: The office parking lot is empty.
Source Question: Python libraries
----
