In [None]:
# Colab-only: mount Google Drive so the /content/drive/... paths used by the
# cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# NOTE: `load_and_clean_data` is defined in the "Basic" cell further down;
# on a fresh kernel that cell must be run first or this call raises NameError.
df_lit = load_and_clean_data("/content/drive/MyDrive/Data_Mining/Files/Literature/Literature_CSV_raw/posts_cleanbody.csv")
# Number of tagged posts that survived cleaning (this used to reference the
# undefined name `dd`, which raised NameError).
print(len(df_lit))

# Basic

In [None]:
# Cell 1: Imports and basic utilities
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ─────────────────────────────────────────────────────
def load_and_clean_data(filepath, text_column='CleanBodyNoMath', tag_column='Tags'):
    """
    Read a posts CSV and keep only the rows that carry at least one tag.

    Each value of the tag column is expected to be a pipe-delimited string
    (for example ``"|a|b|"``) and is converted into a list of non-empty tag
    names. Values that already are lists pass through untouched; missing
    values become an empty list.

    Parameters
    ----------
    filepath : path of the CSV file, handed to ``pd.read_csv``.
    text_column : name of the column holding the post text.
    tag_column : name of the column holding the pipe-delimited tags.

    Returns
    -------
    DataFrame holding exactly ``text_column`` and ``tag_column``, re-indexed
    from 0, restricted to rows whose tag list is non-empty.
    """
    def _to_tag_list(raw):
        # Already parsed: nothing to do.
        if isinstance(raw, list):
            return raw
        # Missing cell -> no tags.
        if pd.isna(raw):
            return []
        # Trim the outer pipes, split, and discard empty fragments.
        pieces = raw.strip("|").split("|")
        return [piece for piece in pieces if piece]

    frame = pd.read_csv(filepath)
    frame[tag_column] = frame[tag_column].apply(_to_tag_list)

    tagged_mask = frame[tag_column].map(len) > 0
    tagged = frame[tagged_mask].reset_index(drop=True)
    return tagged[[text_column, tag_column]]

# ─────────────────────────────────────────────────────
def embed_texts(texts,
                model_name='all-MiniLM-L6-v2',
                batch_size=64,
                device='cpu'):
    """
    Turn a list of strings into sentence embeddings.

    A fresh ``SentenceTransformer`` named ``model_name`` is instantiated on
    ``device`` and asked to encode ``texts`` in batches of ``batch_size``,
    with a progress bar and NumPy output requested.

    Returns an ``(embeddings, model)`` tuple so the caller can keep using the
    encoder that produced the vectors.
    """
    encoder = SentenceTransformer(model_name, device=device)
    encode_options = {
        'batch_size': batch_size,
        'show_progress_bar': True,
        'convert_to_numpy': True,
    }
    vectors = encoder.encode(texts, **encode_options)
    return vectors, encoder


In [None]:
def run_pipeline_embeddings_only(csv_path=None,
                                 df=None,
                                 sample_size=None,
                                 seed=None,
                                 device='cpu'):
    """
    Prepare a posts DataFrame and embed its text column.

    The posts come from ``df`` when one is given; otherwise they are loaded
    from ``csv_path`` with ``load_and_clean_data``. When ``sample_size`` is
    set, that many rows are drawn with ``random_state=seed``. The index is
    always reset, so the texts are embedded in the frame's row order.

    Returns a dict with keys:
      - 'df': the (possibly sampled) DataFrame
      - 'X': embeddings of the 'CleanBodyNoMath' column (missing text is
             encoded as the empty string; the frame itself is not modified)
      - 'model': the encoder returned by ``embed_texts``

    Raises ValueError when neither ``csv_path`` nor ``df`` is supplied.
    """
    if df is None and csv_path is None:
        raise ValueError("Either 'csv_path' or 'df' must be provided.")

    posts = load_and_clean_data(csv_path) if df is None else df
    if sample_size is not None:
        posts = posts.sample(sample_size, random_state=seed)
    posts = posts.reset_index(drop=True)

    # Missing bodies become '' so the encoder always receives strings.
    bodies = posts['CleanBodyNoMath'].fillna('').tolist()
    vectors, encoder = embed_texts(bodies, device=device)

    return dict(df=posts, X=vectors, model=encoder)


def recommend_similar_posts(query_text,
                            df,
                            X_embeddings,
                            model,
                            top_n=5):
    """
    Return the stored posts most similar to a free-text query.

    The query is embedded with ``model`` and compared by cosine similarity
    against every row of ``X_embeddings``; hit ``i`` is looked up
    positionally as row ``i`` of ``df``.

    Parameters
    ----------
    query_text : the query string.
    df : DataFrame with 'CleanBodyNoMath' and 'Tags' columns.
    X_embeddings : 2-D array of post embeddings, one row per row of ``df``.
    model : object exposing ``encode(list_of_str, convert_to_numpy=True)``.
    top_n : maximum number of posts to return.

    Returns
    -------
    List of dicts with keys 'similarity', 'text' and 'tags', ordered from
    most to least similar. Empty when ``top_n`` is not positive.
    """
    # `sims.argsort()[-0:]` selects *every* row, so a non-positive top_n has
    # to be handled before slicing.
    if top_n <= 0:
        return []

    # embed the query (single-element batch -> unpack the one vector)
    query_emb, = model.encode([query_text], convert_to_numpy=True)
    # compute similarities
    sims = cosine_similarity(query_emb.reshape(1, -1), X_embeddings)[0]
    # pick top N indices, best first
    idxs = sims.argsort()[-top_n:][::-1]

    recommendations = []
    for i in idxs:
        row = df.iloc[i]  # one positional lookup per hit instead of two
        recommendations.append({
            'similarity': float(sims[i]),
            'text': row['CleanBodyNoMath'],
            'tags': row['Tags']
        })
    return recommendations


## Embed and Save

In [None]:
# One-off job, kept commented out: embed every Physics post on the GPU and
# persist the cleaned DataFrame and the embedding matrix. The two files written
# here are the ones the "Load embeddings" cell reads back.
# resources = run_pipeline_embeddings_only(
#     csv_path="/content/drive/MyDrive/Data_Mining/Files/Physics/Posts_with_sentiment.csv",
#     sample_size=None,
#     seed=42,
#     device = "cuda"
# )
# resources['df'].to_csv("/content/drive/MyDrive/Data_Mining/Files/Physics/physics_post_cleaned_all.csv", index=False)
# np.save("/content/drive/MyDrive/Data_Mining/Files/Physics/physics_posts_embeddings_.npy", resources['X'])
# print("DONE")

## Load embeddings

In [None]:
# Load DataFrame
# List the Drive folder first to confirm the saved artifacts are present.
!ls /content/drive/MyDrive/Data_Mining/Files/Physics/

# Cleaned posts and their precomputed embeddings.
# NOTE(review): nothing here checks len(df) == X.shape[0]; the recommender
# indexes both by position, so confirm the two files came from the same run.
# NOTE(review): 'Tags' held Python lists before the CSV round trip; after
# read_csv it is presumably a plain string such as "['a', 'b']" — verify before
# treating it as a list.
df = pd.read_csv("/content/drive/MyDrive/Data_Mining/Files/Physics/physics_post_cleaned_all.csv")
X = np.load("/content/drive/MyDrive/Data_Mining/Files/Physics/physics_posts_embeddings_.npy")


# Load model (can be CPU now)
# Re-import is redundant (the utilities cell already imports it) but harmless.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')  # or 'cuda'

# Now reuse in recommend
# Smoke test: retrieve the default top 5 posts for a sample question.
recs = recommend_similar_posts(
    "What is the uncertainty principle?",
    df,
    X,
    model
)


'DM_Dataset\posts_to_users_with_lastdate.csv'
'Explaination of posts_with_comments_and_users.gdoc'
 Merged_posts_to_users.csv
 Physics_Comments.csv
 physics_post_cleaned_all.csv
 physics_posts_embeddings_.npy
 Posts_with_sentiment.csv
 UserAggregatedWithChurn.csv
 user_churn.ipynb
 UsersWithSentimentAndCountry.csv
 Votes.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Folder holding the saved tag-recommender artifacts (encoder, classifiers, binarizers).
ART_DIR = '/content/drive/MyDrive/Data_Mining/Physics/tag_recommender_artifacts'
from sentence_transformers import SentenceTransformer
from joblib import load
import json

# Load the transformer model
# NOTE(review): this rebinds the global `model` that the previous cell set to
# "all-MiniLM-L6-v2". Later queries are embedded with this encoder while `X`
# was loaded from disk — confirm both come from the same model.
model = SentenceTransformer(f'{ART_DIR}/transformer')

# Load classifiers and binarizers
# NOTE(review): joblib.load unpickles arbitrary objects; only load artifacts
# produced by a trusted run.
classifiers = load(f'{ART_DIR}/classifiers.joblib')
mlb         = load(f'{ART_DIR}/mlb.joblib')
mlb_valid   = load(f'{ART_DIR}/mlb_valid.joblib')

# Load valid tag indices
with open(f'{ART_DIR}/valid_tag_indices.json') as f:
    valid_tag_indices = json.load(f)

print("✅ Model and components loaded successfully!")



✅ Model and components loaded successfully!


In [None]:
# Demo query: fetch the 5 stored posts closest to a sample question.
# Uses whatever `df`, `X` and `model` the earlier cells left in the kernel.
query = "Explain the many worlds interpretation?"

recs = recommend_similar_posts(
    query,
    df,
    X,
    model,
    top_n=5
)

for rec in recs:
    # Flatten newlines and cap at 300 characters so each hit stays compact.
    snippet = rec['text'][:300].replace('\n', ' ')
    print(f"🔎 Similarity: {rec['similarity']:.3f}")
    print(f"🏷️ Tags: {rec['tags']}")
    print(f"📝 Post snippet: {snippet}")
    print("-" * 80)


🔎 Similarity: 0.774
🏷️ Tags: ['quantum-mechanics', 'quantum-interpretations']
📝 Post snippet: Is the many-worlds interpretation just a different interpretation to quantum mechanics or does it contain some different predictions? In other words, is it possible theoretically to conduct an experiment that checks the many-worlds interpretation? 
--------------------------------------------------------------------------------
🔎 Similarity: 0.767
🏷️ Tags: ['quantum-mechanics', 'quantum-interpretations']
📝 Post snippet: I was reading this interpretation from this site, where these lines are noteworthy enough to talk for the fact that this interpretation doesn't actually talk about many-worlds:  These are the "many worlds" in question, although it should be clear that the label is somewhat misleading. People sometim
--------------------------------------------------------------------------------
🔎 Similarity: 0.741
🏷️ Tags: ['quantum-mechanics', 'quantum-interpretations']
📝 Post snippet: I woul

# Function to show similar and dissimilar posts

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Install dependencies
# NOTE(review): versions are not pinned, so a fresh runtime may pull different releases.
!pip install -q sentence-transformers joblib

# 1. Mount Drive if in Colab (otherwise skip)
# `drive_base` is the root from which every artifact/data path below is built.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    drive_base = '/content/drive/MyDrive/Data_Mining'
except ImportError:
    drive_base = '.'  # adjust if running locally

import pandas as pd
import numpy as np
import json
from joblib import load
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 2. Paths to saved artifacts and data
ART_DIR = f'{drive_base}/Physics/tag_recommender_artifacts'
DF_PATH = f'{drive_base}/Files/Physics/physics_post_cleaned_all.csv'
X_PATH  = f'{drive_base}/Files/Physics/physics_posts_embeddings_.npy'

# 3. Load data + embeddings
# The helpers below look up row i of `df` for embedding row i of `X`, so the
# two files must be row-aligned.
df = pd.read_csv(DF_PATH)
X  = np.load(X_PATH)

# 4. Load the encoder model
embed_model = SentenceTransformer(f'{ART_DIR}/transformer', device='cpu')

# 5. Load your tag-classifier pipeline
# `classifiers` is indexed by the entries of `valid_tag_indices`;
# `mlb_valid.classes_` supplies the tag names.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted artifacts.
classifiers      = load(f'{ART_DIR}/classifiers.joblib')
mlb_valid        = load(f'{ART_DIR}/mlb_valid.joblib')
with open(f'{ART_DIR}/valid_tag_indices.json') as fp:
    valid_tag_indices = json.load(fp)

# ──────────────────────────────────────────────────────────────────────────────
# Helper: get top-k by embedding similarity
def get_top_k_by_embedding(query, df, X, embed_model, k=50):
    """
    Retrieve the ``k`` stored posts closest to ``query`` by cosine similarity.

    ``query`` is embedded with ``embed_model`` and compared against every row
    of ``X``; hit ``i`` is looked up positionally as row ``i`` of ``df``.

    Returns a list of dicts with keys 'index' (int row position), 'text',
    'tags' and 'sim' (float), best match first. Empty when ``k`` is not
    positive.
    """
    # `argsort()[-0:]` would return every row, so guard non-positive k.
    if k <= 0:
        return []

    q_emb = embed_model.encode([query], convert_to_numpy=True)
    sims  = cosine_similarity(q_emb, X)[0]
    idxs  = sims.argsort()[-k:][::-1]

    hits = []
    for i in idxs:
        row = df.iloc[i]  # single positional lookup per hit
        hits.append({
            'index': int(i),
            'text': row['CleanBodyNoMath'],
            'tags': row['Tags'],
            'sim': float(sims[i]),
        })
    return hits

# Helper: predict top-k tags via your classification pipeline
def predict_tags(text, embed_model, classifiers, valid_tag_indices, mlb_valid, top_k=5):
    """
    Predict the ``top_k`` most probable tags for a single text.

    The text is embedded with ``embed_model``; for every entry ``idx`` of
    ``valid_tag_indices`` the positive-class probability of
    ``classifiers[idx]`` is collected (0.0 when that classifier is ``None``).
    The j-th probability is mapped to ``mlb_valid.classes_[j]``, i.e. the
    code assumes ``valid_tag_indices`` and ``mlb_valid.classes_`` are in the
    same order.

    Returns a list of tag names, most probable first. Empty when ``top_k`` is
    not positive.
    """
    # `argsort(...)[-0:]` would return every tag, so guard non-positive top_k.
    if top_k <= 0:
        return []

    emb = embed_model.encode([text], convert_to_numpy=True)
    probs = []
    for idx in valid_tag_indices:
        clf = classifiers[idx]
        # Column 1 of predict_proba is the positive class; [0] picks the single row.
        probs.append(clf.predict_proba(emb)[:, 1][0] if clf is not None else 0.0)

    class_names = list(mlb_valid.classes_)
    top_idxs = np.argsort(probs)[-top_k:][::-1]
    return [class_names[i] for i in top_idxs]

# Hybrid recommender: combine embedding sim + tag overlap
def recommend_hybrid(query, df, X, embed_model, classifiers, valid_tag_indices, mlb_valid,
                     top_k_candidates=50, top_n=5, alpha=0.6):
    """
    Rank posts by a blend of embedding similarity and predicted-tag overlap.

    Steps:
      1. fetch ``top_k_candidates`` posts with ``get_top_k_by_embedding``;
      2. predict the query's top-5 tags with ``predict_tags``;
      3. predict each candidate's top-5 tags from its text and score it as
         ``alpha * sim + (1 - alpha) * tag_score``, where ``tag_score`` is the
         fraction of query tags also predicted for the candidate (0.0 when
         the query has no tags);
      4. return the ``top_n`` highest-scoring candidates.

    Each returned dict is the candidate dict extended with 'tag_score',
    'final_score' and 'predicted_tags'.
    """
    def _tags_for(text):
        return set(predict_tags(text, embed_model, classifiers,
                                valid_tag_indices, mlb_valid, top_k=5))

    candidates = get_top_k_by_embedding(query, df, X, embed_model, k=top_k_candidates)
    query_tags = _tags_for(query)

    scored = []
    for cand in candidates:
        cand_tags = _tags_for(cand['text'])
        if query_tags:
            overlap = len(query_tags & cand_tags) / len(query_tags)
        else:
            overlap = 0.0
        entry = dict(cand)
        entry['tag_score'] = overlap
        entry['final_score'] = alpha * cand['sim'] + (1 - alpha) * overlap
        entry['predicted_tags'] = list(cand_tags)
        scored.append(entry)

    # Stable descending sort, then truncate.
    return sorted(scored, key=lambda e: e['final_score'], reverse=True)[:top_n]

# ──────────────────────────────────────────────────────────────────────────────


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sample N posts (N = 20 here) and find each one's best hybrid neighbor.
# NOTE(review): recommend_hybrid re-embeds and re-classifies every candidate
# text on each call, so this loop is slow; the recorded run of this cell was
# interrupted.
N = 20
sample_df = df.sample(n=N, random_state=42).reset_index(drop=True)

top_scores = []

for i, row in sample_df.iterrows():
    # Print progress every 10 posts
    if i % 10 == 0:
        print(f"✅ Processed {i} / {N}")

    # Get top neighbors using hybrid method
    neighbors = recommend_hybrid(
        row['CleanBodyNoMath'], df, X, embed_model,
        classifiers, valid_tag_indices, mlb_valid,
        top_k_candidates=50, top_n=6, alpha=0.6  # 6 because first is self
    )

    if len(neighbors) <= 1:
        continue

    # NOTE(review): the self-match is skipped by position, not by identity;
    # this assumes the query post itself always ranks first — confirm.
    top_neighbor = neighbors[1]  # Skip self-match

    # 'index' is the row position inside sample_df (index was reset), not in df.
    top_scores.append({
        'index': i,
        'top_score': top_neighbor['final_score'],
        'post': row,
        'neighbors': neighbors[1:6]
    })

# Sort and pick top 3 posts
top3 = sorted(top_scores, key=lambda x: x['top_score'], reverse=True)[:3]


# Report the three sampled posts with the strongest best-neighbor score.
for i, item in enumerate(top3, 1):
    post = item['post']
    print(f"\n🟢 Top Post #{i} (Index {item['index']})")
    print(f"🔸 Top Neighbor Score: {item['top_score']:.3f}")
    print(f"🏷️ Tags: {post['Tags']}")
    # chr(10) is '\n'; newlines are flattened so the preview stays on one line.
    print(f"📝 Post Text:\n{post['CleanBodyNoMath'][:500].replace(chr(10), ' ')}...\n")

    print("🔹 Top 5 Hybrid Neighbors:")
    for j, n in enumerate(item['neighbors'], 1):
        print(f"  {j}. final={n['final_score']:.3f} | sim={n['sim']:.3f} | tag_score={n['tag_score']:.3f}")
        print(f"     Tags: {n['tags']}")
        print(f"     PredTags: {n['predicted_tags']}")
        print(f"     📝 {n['text'][:400].replace(chr(10), ' ')}...\n")

✅ Processed 0 / 20


KeyboardInterrupt: 

# Optimized

In [None]:
# 4) Optimized helper: batch tag prediction
!pip install -q sentence-transformers joblib

def batch_predict_tags_all_optimized(X, classifiers, valid_tag_indices, mlb_valid, top_k=5):
    """
    Predict the ``top_k`` most probable tags for every row of ``X`` at once.

    For each entry ``idx`` of ``valid_tag_indices`` the positive-class
    probabilities of ``classifiers[idx]`` are computed for all rows (zeros
    when that classifier is ``None``). Column j of the resulting matrix is
    mapped to ``mlb_valid.classes_[j]``.

    Returns a list with one set of tag names per row of ``X``. Every set is
    empty when ``top_k`` is not positive or there are no valid tags.
    """
    n_rows = X.shape[0]
    # `[-0:]` would select every tag, and np.vstack([]) raises on an empty
    # list, so both degenerate cases are answered up front.
    if top_k <= 0 or len(valid_tag_indices) == 0:
        return [set() for _ in range(n_rows)]

    probs = []
    for idx in valid_tag_indices:
        clf = classifiers[idx]
        # Explicit None test: an object defining __len__/__bool__ can be falsy
        # even though it is a usable classifier.
        probs.append(clf.predict_proba(X)[:, 1] if clf is not None else np.zeros(n_rows))

    M = np.vstack(probs).T          # shape: (rows of X, number of valid tags)
    names = mlb_valid.classes_
    out = []
    for row in M:
        top_idxs = np.argsort(row)[-top_k:][::-1]
        out.append(set(names[i] for i in top_idxs))
    return out

# 5) Optimized hybrid recommender
def recommend_hybrid_optimized(query_idx, df, X, predicted_tags_all,
                               top_k_candidates=50, top_n=5, alpha=0.6):
    """
    Hybrid neighbors of an already-embedded post, using precomputed tag sets.

    Row ``query_idx`` of ``X`` is compared by cosine similarity with every
    row of ``X``. The query's own similarity is overwritten with -1.0 so it
    is not selected as its own neighbor. The most similar
    ``min(top_k_candidates, len(X) - 1)`` rows are then scored as
    ``alpha * sim + (1 - alpha) * tag_score``, where ``tag_score`` is the
    share of the query's predicted tags also present in the candidate's set
    (the denominator falls back to 1 for an empty query tag set).

    Returns up to ``top_n`` dicts with keys 'index', 'sim', 'tag_score',
    'final_score', 'tags', 'predicted_tags' and 'text', best score first.
    """
    similarity = cosine_similarity([X[query_idx]], X)[0]
    similarity[query_idx] = -1.0   # mask the query itself

    # use dynamic max_k
    n_candidates = min(top_k_candidates, X.shape[0] - 1)
    shortlist = np.argpartition(-similarity, n_candidates)[:n_candidates]
    order = np.argsort(similarity[shortlist])[::-1]
    shortlist = shortlist[order]

    query_tags = predicted_tags_all[query_idx]

    scored = []
    for cand in shortlist:
        cand_tags = predicted_tags_all[cand]
        overlap = len(query_tags & cand_tags) / (len(query_tags) or 1)
        blended = alpha * similarity[cand] + (1 - alpha) * overlap
        post = df.iloc[cand]
        scored.append({
            'index': cand,
            'sim': float(similarity[cand]),
            'tag_score': overlap,
            'final_score': blended,
            'tags': post['Tags'],
            'predicted_tags': list(cand_tags),
            'text': post['CleanBodyNoMath']
        })

    scored.sort(key=lambda entry: entry['final_score'], reverse=True)
    return scored[:top_n]

# Top 3 of 10 000 posts

In [None]:

N = 10000
batch_size = 100
top_scores = []

# Subset and precompute tags
sub_df = df.head(N).reset_index(drop=True)
sub_X  = X[sub_df.index]
pred_tags = batch_predict_tags_all_optimized(sub_X, classifiers, valid_tag_indices, mlb_valid, top_k=5)

# head(N) returns fewer rows when df is shorter than N, so iterate over what
# was actually kept instead of assuming N rows exist (avoids an IndexError).
n_posts = len(sub_df)

for start in range(0, n_posts, batch_size):
    end = min(start+batch_size, n_posts)
    print(f"🚀 Processing batch {start} to {end-1}...")
    for i in range(start, end):
        neigh = recommend_hybrid_optimized(
            query_idx=i, df=sub_df, X=sub_X, predicted_tags_all=pred_tags,
            top_k_candidates=50, top_n=1, alpha=0.6
        )
        if not neigh:
            continue
        # recommend_hybrid_optimized already masks the query post (it sets
        # sims[query_idx] = -1.0), so element 0 is the best real neighbor.
        # Taking element 1 ranked posts by their second-best neighbor.
        best = neigh[0]
        top_scores.append({'index':i,'top_score':best['final_score'],'post':sub_df.iloc[i],'neighbor':best})

# ──────────────────────────────────────────────────────────────────────────────
# 6) Display top 3 posts and their top 3 neighbors

# Walk the three posts with the highest stored 'top_score', best first.
for rank, item in enumerate(sorted(top_scores, key=lambda x: x['top_score'], reverse=True)[:3], 1):
    idx = item['index']
    post = item['post']

    # Print the original post
    print(f"\n🔥 Post #{rank} (Idx {idx}):")
    print(f"  🏷️ Tags: {post['Tags']}")
    print(f"  📝 Post Text:\n{post['CleanBodyNoMath']}\n")

    # Re-fetch its top 3 neighbors
    # (recommend_hybrid_optimized masks the query itself, so all three results
    # are other posts and none is skipped here.)
    top3 = recommend_hybrid_optimized(
        query_idx=idx,
        df=sub_df,
        X=sub_X,
        predicted_tags_all=pred_tags,
        top_k_candidates=50,
        top_n=3,
        alpha=0.6
    )

    # Print each neighbor
    for j, n in enumerate(top3, 1):
        print(f"🔹 Neighbor #{j} (final={n['final_score']:.3f}, sim={n['sim']:.3f}, tag_score={n['tag_score']:.3f})")
        print(f"     Tags: {n['tags']}")
        print(f"     PredTags: {n['predicted_tags']}")
        print(f"     📝 Neighbor Text:\n{n['text']}\n")


🚀 Processing batch 0 to 99...
🚀 Processing batch 100 to 199...
🚀 Processing batch 200 to 299...
🚀 Processing batch 300 to 399...
🚀 Processing batch 400 to 499...
🚀 Processing batch 500 to 599...
🚀 Processing batch 600 to 699...
🚀 Processing batch 700 to 799...
🚀 Processing batch 800 to 899...
🚀 Processing batch 900 to 999...
🚀 Processing batch 1000 to 1099...
🚀 Processing batch 1100 to 1199...
🚀 Processing batch 1200 to 1299...
🚀 Processing batch 1300 to 1399...
🚀 Processing batch 1400 to 1499...
🚀 Processing batch 1500 to 1599...
🚀 Processing batch 1600 to 1699...
🚀 Processing batch 1700 to 1799...
🚀 Processing batch 1800 to 1899...
🚀 Processing batch 1900 to 1999...
🚀 Processing batch 2000 to 2099...
🚀 Processing batch 2100 to 2199...
🚀 Processing batch 2200 to 2299...
🚀 Processing batch 2300 to 2399...
🚀 Processing batch 2400 to 2499...
🚀 Processing batch 2500 to 2599...
🚀 Processing batch 2600 to 2699...
🚀 Processing batch 2700 to 2799...
🚀 Processing batch 2800 to 2899...
🚀 Proces

# Evaluate percentiles, median, and average


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 1) Subset and precompute
N = 200000
sub_df = df.head(N).reset_index(drop=True)
sub_X  = X[sub_df.index]

pred_tags = batch_predict_tags_all_optimized(
    sub_X, classifiers, valid_tag_indices, mlb_valid, top_k=5
)

# ──────────────────────────────────────────────────────────────────────────────
# 2) Compute top‐neighbor scores
import numpy as np

# head(N) may return fewer than N rows; size the result and the loop by what
# was actually kept so no index runs past the end of sub_X.
n_posts = len(sub_df)
scores = np.full(n_posts, np.nan)
for i in range(n_posts):
    if i % 1000 == 0:
        print(f"Processing {i}/{n_posts}")
    neigh = recommend_hybrid_optimized(
        query_idx=i,
        df=sub_df,
        X=sub_X,
        predicted_tags_all=pred_tags,
        top_k_candidates=50,
        top_n=1,
        alpha=0.6
    )
    # recommend_hybrid_optimized already masks the query post (it sets
    # sims[query_idx] = -1.0), so element 0 is the best real neighbor.
    # Element 1 would be the second-best one.
    if neigh:
        scores[i] = neigh[0]['final_score']

# ──────────────────────────────────────────────────────────────────────────────
# 3) Compute statistics
# Posts without any neighbor keep NaN and are excluded from the statistics.
valid = ~np.isnan(scores)
vals  = scores[valid]
mean_score   = vals.mean()
median_score = np.median(vals)
pct25, pct75 = np.percentile(vals, [25, 75])

print(f"\nAverage top-neighbor score: {mean_score:.3f}")
print(f"Median  top-neighbor score: {median_score:.3f}")
print(f"25th percentile: {pct25:.3f}")
print(f"75th percentile: {pct75:.3f}")

# ──────────────────────────────────────────────────────────────────────────────
# 4) Helper to show an example
def show_example(label, target):
    """
    Print the post whose score is closest to ``target`` and its five best
    hybrid neighbors.

    Reads the notebook globals ``scores``, ``sub_df``, ``sub_X`` and
    ``pred_tags``. NaN entries of ``scores`` are ignored when locating the
    closest post (``np.nanargmin`` raises ValueError if every entry is NaN).

    Parameters
    ----------
    label : text shown in the printed header.
    target : score value to look for.
    """
    idx = np.nanargmin(np.abs(scores - target))
    post = sub_df.iloc[idx]
    print(f"\n=== {label} Example (Idx {idx}) | Score {scores[idx]:.3f} ===")
    print("🏷️ Tags:", post['Tags'])
    print("📝 Post Text:\n", post['CleanBodyNoMath'], "\n")
    neighs = recommend_hybrid_optimized(
        query_idx=idx,
        df=sub_df,
        X=sub_X,
        predicted_tags_all=pred_tags,
        top_k_candidates=50,
        top_n=5,
        # Same weight as the scoring loop in this cell (0.6); the previous 0.4
        # printed final scores that did not match the score used for selection.
        alpha=0.6
    )
    print("🔹 Top 5 Hybrid Neighbors:")
    # recommend_hybrid_optimized already masks the query post itself, so every
    # returned entry is a real neighbor and none must be skipped.
    for j, n in enumerate(neighs, 1):
        print(f"  {j}. final={n['final_score']:.3f} | sim={n['sim']:.3f} | tag_score={n['tag_score']:.3f}")
        print("     Tags:", n['tags'])
        print("     PredTags:", n['predicted_tags'])
        print("     📝", n['text'], "\n")

# ──────────────────────────────────────────────────────────────────────────────
# 5) Show four key examples
# One representative post (closest score) for each summary statistic computed above.
show_example("Average", mean_score)
show_example("Median", median_score)
show_example("25th percentile", pct25)
show_example("75th percentile", pct75)


KeyboardInterrupt: 

In [None]:
# 0. Parameters and Setup
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

N = 200000             # Number of posts to process (head(N) keeps fewer if df is shorter)
batch_size = 1000      # Batch size for vectorized scoring
top_k_candidates = 50 # Number of top similar posts to consider
# NOTE(review): compute_top_neighbor_scores_vectorized accepts top_n but never reads it.
top_n = 6             # Top N neighbors to keep (1 is self-match)
alpha = 0.6           # Weight of cosine similarity; (1 - alpha) weights tag overlap

# Subset data and embeddings
# sub_df gets a fresh 0..len-1 index, which is then used to pick the matching rows of X.
sub_df = df.head(N).reset_index(drop=True)
sub_X  = X[sub_df.index]

# ──────────────────────────────────────────────────────────────────────────────
# 1. Optimized tag prediction
def batch_predict_tags_all_optimized(X, classifiers, valid_tag_indices, mlb_valid, top_k=5):
    """
    Batch version of tag prediction: top-``top_k`` tag names for each row of ``X``.

    One probability column is built per entry of ``valid_tag_indices``: the
    positive-class output of ``classifiers[idx].predict_proba(X)``, or zeros
    when that classifier is ``None``. Column j is named after
    ``mlb_valid.classes_[j]``.

    Returns a list containing one set of tag names per row of ``X``.

    Note: this notebook defines a function with this name in an earlier cell
    as well; whichever cell ran last is the one in effect.
    """
    columns = [
        np.zeros(X.shape[0]) if classifiers[idx] is None
        else classifiers[idx].predict_proba(X)[:, 1]
        for idx in valid_tag_indices
    ]
    prob_matrix = np.vstack(columns).T   # rows: posts, columns: valid tags
    tag_names = list(mlb_valid.classes_)

    return [
        {tag_names[i] for i in np.argsort(row)[-top_k:][::-1]}
        for row in prob_matrix
    ]

# ──────────────────────────────────────────────────────────────────────────────
# 2. Vectorized hybrid recommendation
def compute_top_neighbor_scores_vectorized(sub_X, pred_tags, df, batch_size=1000, top_k_candidates=50, top_n=6, alpha=0.6):
    """
    For every post, compute the hybrid score of its best neighbor.

    Posts are processed in batches of ``batch_size`` rows. For each batch the
    cosine similarity against all of ``sub_X`` is computed in one call and the
    self-similarity entries are overwritten with -1.0. Per post, the
    ``top_k_candidates`` most similar other posts are scored as
    ``alpha * sim + (1 - alpha) * tag_score`` (``tag_score`` = share of the
    post's predicted tags also predicted for the candidate, 0.0 when the post
    has no tags) and the maximum is stored.

    Parameters
    ----------
    sub_X : 2-D embedding array, one row per post.
    pred_tags : sequence of tag sets aligned with the rows of ``sub_X``.
    df, top_n : accepted for call compatibility; not used by the computation.
    batch_size, top_k_candidates, alpha : as described above.

    Returns
    -------
    1-D float array of length ``len(sub_X)``; entries stay NaN when a post
    has no candidate (fewer than two posts in total).
    """
    N = sub_X.shape[0]
    scores = np.full(N, np.nan)

    # np.argpartition requires kth < N, and at most N - 1 non-self candidates
    # exist, so clamp the request (a small corpus used to raise ValueError).
    k = max(min(top_k_candidates, N - 1), 0)
    if k == 0:
        return scores

    for start in range(0, N, batch_size):
        end = min(start + batch_size, N)
        if start % 10000 == 0:
            print(f"🚀 Vectorized Processing {start} to {end-1}")

        sims = cosine_similarity(sub_X[start:end], sub_X)
        # Row i of the batch is global post start + i: mask its self-match.
        np.fill_diagonal(sims[:, start:end], -1.0)

        for i in range(end - start):
            sim_row = sims[i]
            q_tags = pred_tags[start + i]

            # k most similar posts; their relative order is irrelevant for a max.
            cand_idxs = np.argpartition(-sim_row, k)[:k]

            best = -np.inf
            for idx in cand_idxs:
                tag_score = len(q_tags & pred_tags[idx]) / len(q_tags) if q_tags else 0.0
                best = max(best, alpha * sim_row[idx] + (1 - alpha) * tag_score)

            # The self-match is already masked above, so the maximum is the top
            # neighbor's score; taking the runner-up would report the
            # second-best neighbor instead.
            scores[start + i] = best

    return scores



In [None]:
# Precompute the top-5 predicted tag set of every post in sub_X once, so the
# scoring and display steps can reuse it instead of re-running the classifiers.
pred_tags = batch_predict_tags_all_optimized(sub_X, classifiers, valid_tag_indices, mlb_valid, top_k=5)


In [None]:
# 4. Show examples
def show_example(label, target):
    """
    Print the post whose score is closest to ``target`` and its five best
    hybrid neighbors (truncated previews).

    Reads the notebook globals ``scores``, ``sub_df``, ``sub_X``,
    ``pred_tags`` and ``alpha``. NaN entries of ``scores`` are ignored when
    locating the closest post (``np.nanargmin`` raises ValueError if every
    entry is NaN).

    Parameters
    ----------
    label : text shown in the printed header.
    target : score value to look for.
    """
    idx = np.nanargmin(np.abs(scores - target))
    post = sub_df.iloc[idx]
    print(f"\n=== {label} Example (Idx {idx}) | Score {scores[idx]:.3f} ===")
    print("🏷️ Tags:", post['Tags'])
    print("📝 Post Text:\n", post['CleanBodyNoMath'][:500].replace('\n', ' '), "\n")
    neighs = recommend_hybrid_optimized(
        query_idx=idx,
        df=sub_df,
        X=sub_X,
        predicted_tags_all=pred_tags,
        top_k_candidates=50,
        top_n=5,
        alpha=alpha
    )
    print("🔹 Top 5 Hybrid Neighbors:")
    # recommend_hybrid_optimized already masks the query post itself, so every
    # returned entry is a real neighbor; skipping the first one hid the best match.
    for j, n in enumerate(neighs, 1):
        print(f"  {j}. final={n['final_score']:.3f} | sim={n['sim']:.3f} | tag_score={n['tag_score']:.3f}")
        print("     Tags:", n['tags'])
        print("     PredTags:", n['predicted_tags'])
        print("     📝", n['text'][:400].replace('\n',' '), "\n")


In [None]:
# 3. Run the vectorized score computation
# (`pred_tags`, `sub_X`, `sub_df` and the parameters come from earlier cells;
# tags are not re-predicted here.)
scores = compute_top_neighbor_scores_vectorized(
    sub_X, pred_tags, df=sub_df,
    batch_size=batch_size, top_k_candidates=top_k_candidates, top_n=top_n, alpha=alpha
)
# 4) Compute and display statistics
# NaN entries (posts without a scored neighbor) are excluded first.
valid = ~np.isnan(scores)
vals = scores[valid]
mean_score   = vals.mean()
median_score = np.median(vals)
pct25, pct75 = np.percentile(vals, [25, 75])

print(f"\n✅ Summary of Top-Neighbor Final Scores")
print(f"Average  : {mean_score:.3f}")
print(f"Median   : {median_score:.3f}")
print(f"25th pct : {pct25:.3f}")
print(f"75th pct : {pct75:.3f}")

# Show all 4
# `show_example` is defined in an earlier cell and reads the global `scores` set above.
show_example("Average", mean_score)
show_example("Median", median_score)
show_example("25th percentile", pct25)
show_example("75th percentile", pct75)

🚀 Vectorized Processing 0 to 999
🚀 Vectorized Processing 10000 to 10999
🚀 Vectorized Processing 20000 to 20999
🚀 Vectorized Processing 30000 to 30999
🚀 Vectorized Processing 40000 to 40999
🚀 Vectorized Processing 50000 to 50999
🚀 Vectorized Processing 60000 to 60999
🚀 Vectorized Processing 70000 to 70999
🚀 Vectorized Processing 80000 to 80999
🚀 Vectorized Processing 90000 to 90999
🚀 Vectorized Processing 100000 to 100999
🚀 Vectorized Processing 110000 to 110999
🚀 Vectorized Processing 120000 to 120999
🚀 Vectorized Processing 130000 to 130999
🚀 Vectorized Processing 140000 to 140999
🚀 Vectorized Processing 150000 to 150999
🚀 Vectorized Processing 160000 to 160999
🚀 Vectorized Processing 170000 to 170999
🚀 Vectorized Processing 180000 to 180999
🚀 Vectorized Processing 190000 to 190999

✅ Summary of Top-Neighbor Final Scores
Average  : 0.688
Median   : 0.691
25th pct : 0.637
75th pct : 0.743

=== Average Example (Idx 4136) | Score 0.688 ===
🏷️ Tags: ['orbital-motion', 'tidal-effect']
📝 Po

# Show several examples per category instead of 1 (default: 5)



In [None]:
import random

def show_examples_per_category(label, target_score, scores, sub_df, sub_X, pred_tags,
                                alpha=0.6, delta=0.01, num_examples=5,
                                top_k_candidates=50, top_n=5, seed=None):
    """
    Print a random sample of posts whose score lies within `delta` of
    `target_score`, each followed by its top hybrid neighbors.

    Parameters
    ----------
    label : str
        Name of the category, used only in the printed headers.
    target_score : float
        Score the selected posts should be close to.
    scores : array-like of float
        One score per post; positions are used with `sub_df.iloc`.
        NaN entries never match because comparisons with NaN are False.
    sub_df : DataFrame
        Posts; the 'Tags' and 'CleanBodyNoMath' columns are read.
    sub_X, pred_tags :
        Passed through unchanged to `recommend_hybrid_optimized`.
    alpha : float
        Passed through to `recommend_hybrid_optimized`.
    delta : float
        Half-width of the accepted score window (strict inequality).
    num_examples : int
        Maximum number of posts to show.
    top_k_candidates : int
        Passed through to `recommend_hybrid_optimized`.
    top_n : int
        Number of neighbors to print per post. One extra neighbor is requested
        because the first result is dropped (assumed to be the query post
        itself -- TODO confirm against `recommend_hybrid_optimized`).
    seed : int or None
        If given, the sample is drawn from a dedicated `random.Random(seed)`
        so the selection is reproducible; otherwise the global `random`
        module state is used.

    Returns
    -------
    None. All results are printed.
    """
    # Accept plain lists as well as arrays: list - float would raise TypeError.
    scores = np.asarray(scores)

    # Find indices of posts with scores close to target
    close_idxs = np.where(np.abs(scores - target_score) < delta)[0]
    if len(close_idxs) == 0:
        print(f"\n⚠️ No examples found for {label} (target score: {target_score:.3f})")
        return

    # Randomly select up to `num_examples` matching posts
    rng = random if seed is None else random.Random(seed)
    selected_idxs = rng.sample(list(close_idxs), min(num_examples, len(close_idxs)))

    print(f"\n🎯 {label} Examples (Target score ≈ {target_score:.3f}, ±{delta}) — Showing {len(selected_idxs)} Matches")

    for k, idx in enumerate(selected_idxs, 1):
        post = sub_df.iloc[idx]
        print(f"\n🟢 {label} Post #{k} (Index {idx}) | Score = {scores[idx]:.3f}")
        print("🏷️ Tags:", post['Tags'])
        print("📝", post['CleanBodyNoMath'][:600].replace('\n', ' '), "...\n")

        neighbors = recommend_hybrid_optimized(
            query_idx=idx,
            df=sub_df,
            X=sub_X,
            predicted_tags_all=pred_tags,
            top_k_candidates=top_k_candidates,
            top_n=top_n + 1,  # +1 because the first result is skipped below
            alpha=alpha
        )

        print(f"🔹 Top {top_n} Hybrid Neighbors:")
        for j, n in enumerate(neighbors[1:], 1):  # skip the post itself
            print(f"  {j}. final={n['final_score']:.3f} | sim={n['sim']:.3f} | tag_score={n['tag_score']:.3f}")
            print("     Tags:", n['tags'])
            print("     PredTags:", n['predicted_tags'])
            print("     📝", n['text'][:600].replace('\n', ' '), "...\n")


In [None]:
# Print sample posts (with their neighbors) around each summary statistic.
for category_label, category_target in (
    ("Average", mean_score),
    ("Median", median_score),
    ("25th percentile", pct25),
    ("75th percentile", pct75),
):
    show_examples_per_category(
        category_label, category_target, scores, sub_df, sub_X, pred_tags, alpha=alpha
    )



🎯 Average Examples (Target score ≈ 0.688, ±0.01) — Showing 5 Matches

🟢 Average Post #1 (Index 150658) | Score = 0.695
🏷️ Tags: ['conformal-field-theory', 'commutator', 'correlation-functions']
📝 In a Euclidean QFT, commutators of operators vanish for any spacetime separation. This can be argued very simply by using the path integral representation of the correlator, wherein operators become simple functions and hence can be easily moved around inside the integral. Now, in a 2d CFT the two point correlator of a primary operator  with conformal weights  and  looks like  where  is some normalizing constant. We can exchange  and  in the above formula by rotating  around  by :   where  is the spin of  and  depends on the choice of the branch cut for the power functions. Thus the commutator ...

🔹 Top 5 Hybrid Neighbors:
  1. final=0.695 | sim=0.626 | tag_score=0.800
     Tags: ['operators', 'conformal-field-theory', 'commutator', 'poisson-brackets', 'deformation-quantization']
     PredTa

# Top 3 posts with the highest top-neighbor score

In [None]:
N = 10           # Maximum number of posts to process
batch_size = 2   # Number of posts per progress batch
top_scores = []

# Subset the first N posts; df.head(N) returns fewer rows when df is shorter
# than N, so the loops below are bounded by the actual subset length.
subset_df = df.head(N)
n_posts = len(subset_df)

# Process in batches (batching only controls how often progress is printed)
for start in range(0, n_posts, batch_size):
    end = min(start + batch_size, n_posts)
    batch = subset_df.iloc[start:end]

    print(f"🚀 Processing batch {start} to {end - 1}...")

    for idx, row in batch.iterrows():
        neighbors = recommend_hybrid(
            row['CleanBodyNoMath'], df, X, embed_model,
            classifiers, valid_tag_indices, mlb_valid,
            top_k_candidates=50, top_n=5, alpha=0.6
        )

        if len(neighbors) <= 1:
            continue  # No valid neighbors returned

        # The first result is skipped on the assumption that it is the query
        # post itself -- TODO confirm against recommend_hybrid.
        top_neighbor = neighbors[1]

        top_scores.append({
            'index': idx,
            'top_score': top_neighbor['final_score'],
            'post': row,
            'top_neighbor': top_neighbor
        })

# Sort by top neighbor score (descending) and keep the best 3
top_scores = sorted(top_scores, key=lambda x: x['top_score'], reverse=True)[:3]

# Display results
for i, item in enumerate(top_scores, 1):
    idx = item['index']
    post = item['post']
    neighbor = item['top_neighbor']

    print(f"\n🔥 Post #{i} (Index {idx}) with highest neighbor final score:")
    print(f"  🏷️ Tags: {post['Tags']}")
    print(f"  📝 {post['CleanBodyNoMath'][:400].replace(chr(10),' ')}...\n")

    print(f"🔹 Top Neighbor (final={neighbor['final_score']:.3f} | sim={neighbor['sim']:.3f} | tag_score={neighbor['tag_score']:.3f})")
    print(f"   Tags: {neighbor['tags']}")
    print(f"   Predicted Tags: {neighbor['predicted_tags']}")
    print(f"   📝 {neighbor['text'][:400].replace(chr(10), ' ')}...\n")


🚀 Processing batch 0 to 1...
🚀 Processing batch 2 to 3...
🚀 Processing batch 4 to 5...
🚀 Processing batch 6 to 7...
🚀 Processing batch 8 to 9...

🔥 Post #1 (Index 7) with highest neighbor final score:
  🏷️ Tags: ['optics', 'visible-light', 'scattering', 'atmospheric-science', 'faq']
  📝 Why does the sky change color? Why is the sky blue during the day, red during sunrise/set and black during the night? ...

🔹 Top Neighbor (final=0.847 | sim=0.745 | tag_score=1.000)
   Tags: ['visible-light', 'scattering', 'atmospheric-science']
   Predicted Tags: ['atmospheric-science', 'astronomy', 'everyday-life', 'optics', 'visible-light']
   📝  Possible Duplicate: Why does the sky change color?   Basically what the title says. What mechanisms are significant and how do they contribute to make the sky blue. Also when the sky is not blue, like when the sun sets, how does it happen? ...


🔥 Post #2 (Index 0) with highest neighbor final score:
  🏷️ Tags: ['quantum-mechanics', 'particle-physics', 'angul

# Select 3 posts about specific physics topics


In [None]:
# Select up to 3 posts whose body text mentions any of the keywords.
# Each keyword is matched as a literal, case-insensitive substring
# (regex=False), so keywords containing regex metacharacters such as
# '+', '(' or '.' cannot be misinterpreted or raise a regex error.
keywords = ['Newtonian-Mechanics']
body_text = df['CleanBodyNoMath']
mask = pd.Series(False, index=df.index)
for kw in keywords:
    # na=False: rows with a missing body never match
    mask |= body_text.str.contains(kw, case=False, na=False, regex=False)
topic_posts = df[mask].head(3)

for idx in topic_posts.index:
    post = df.loc[idx]
    print(f"\n🟢 Topic-Match Post (Index {idx}):")
    print(f"  🏷️ Tags: {post['Tags']}")
    print(f"  📝 {post['CleanBodyNoMath'][:400].replace(chr(10), ' ')}...\n")

    neighbors = recommend_hybrid(
        post['CleanBodyNoMath'], df, X, embed_model,
        classifiers, valid_tag_indices, mlb_valid,
        top_k_candidates=50, top_n=5, alpha=0.6
    )

    print("🔹 Top 5 Hybrid Neighbors:")
    # The first result is skipped on the assumption that it is the query post
    # itself -- TODO confirm against recommend_hybrid.
    for i, n in enumerate(neighbors[1:], 1):
        print(f"  {i}. final={n['final_score']:.3f} (sim={n['sim']:.3f}, tag_score={n['tag_score']:.3f})")
        print(f"     Tags: {n['tags']}")
        print(f"     PredTags: {n['predicted_tags']}")
        print(f"     📝 {n['text'][:400].replace(chr(10), ' ')}...\n")



🟢 Topic-Match Post (Index 23549):
  🏷️ Tags: ['quantum-mechanics', 'energy', 'momentum', 'operators']
  📝 It has been noted here, for instance, that  is true in all contexts. Likewise,  in notable contexts it is apparently true that  Is this, in a nutshell, a sufficient and valid justification for setting (in the corresponding suitable contexts) the momentum operator as  and setting the (potential) energy operator as  and both with the same constant of proportionality, , whereby  ? EDIT (related merel...

🔹 Top 5 Hybrid Neighbors:
  1. final=0.657 (sim=0.561, tag_score=0.800)
     Tags: ['quantum-mechanics', 'operators', 'differentiation', 'notation', 'calculus']
     PredTags: ['differentiation', 'quantum-mechanics', 'homework-and-exercises', 'momentum', 'operators']
     📝 By the product rule for differentiation: Where  is an operator and  is a function depend on  i.e. . My question is: when  takes the form of momentum operator: , it looks like the product rule for differentiation n

# Custom Query

In [None]:
# Free-text query that is not part of the dataset (e.g. a question being drafted)
custom_query = "Explain the many worlds interpretation?"

print("🔍 Custom Query:")
print(custom_query.strip())

# Everything recommend_hybrid needs besides the query text itself,
# searching the full post dataset.
recommend_kwargs = dict(
    df=df,
    X=X,
    embed_model=embed_model,
    classifiers=classifiers,
    valid_tag_indices=valid_tag_indices,
    mlb_valid=mlb_valid,
    top_k_candidates=200,
    top_n=5,
    alpha=0.9,
)
neighbors = recommend_hybrid(query=custom_query, **recommend_kwargs)

# The query is external to the dataset, so every returned neighbor is shown
# (no "skip self" step here).
print("\n🔹 Top 5 Hybrid Neighbors:")
for i, n in enumerate(neighbors, start=1):
    print(f"  {i}. final={n['final_score']:.3f} (sim={n['sim']:.3f}, tag_score={n['tag_score']:.3f})")
    print(f"     Tags: {n['tags']}")
    print(f"     PredTags: {n['predicted_tags']}")
    print(f"     📝 {n['text'][:400].replace(chr(10), ' ')}...\n")


🔍 Custom Query:
Explain the many worlds interpretation?

🔹 Top 5 Hybrid Neighbors:
  1. final=0.757 (sim=0.774, tag_score=0.600)
     Tags: ['quantum-mechanics', 'quantum-interpretations']
     PredTags: ['experimental-physics', 'quantum-mechanics', 'simulations', 'quantum-interpretations', 'string-theory']
     📝 Is the many-worlds interpretation just a different interpretation to quantum mechanics or does it contain some different predictions? In other words, is it possible theoretically to conduct an experiment that checks the many-worlds interpretation? ...

  2. final=0.750 (sim=0.767, tag_score=0.600)
     Tags: ['quantum-mechanics', 'quantum-interpretations']
     PredTags: ['quantum-mechanics', 'cosmology', 'universe', 'quantum-interpretations', 'multiverse']
     📝 I was reading this interpretation from this site, where these lines are noteworthy enough to talk for the fact that this interpretation doesn't actually talk about many-worlds:  These are the "many worlds" in questi