In [None]:
"""
Topic Alignment using BERT + spaCy + Manual Annotations
======================================================

This script compares **Twitter API topics** with **TweetNLP topics** using:
    - Sentence-BERT embeddings (semantic similarity)
    - spaCy similarity scores (word-vector similarity from en_core_web_lg)
    - Manual annotation fallback

Decision rules:
---------------
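1. If BERT == 1.0 OR spaCy == 1.0 for some TweetNLP topic → label the best such topic
   "BERT == 1" (the same label is used even when only the spaCy score is 1.0) and
   label every other topic "High match already accepted".
2. Else, label each topic by the first rule that applies (BERT is checked first):
      - BERT >= 0.5 → "BERT >= 0.5"
      - spaCy >= 0.65 → "spaCy >= 0.65"
3. If at least one topic passed a threshold, every topic that passed neither
   threshold → "High match already accepted".
4. If no topic passed a threshold, every topic → "Manual Annotation Required".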

Output:
-------
Saves results into a CSV file: **topic_alignment_final.csv**
"""

# -----------------------
# INSTALLATION COMMANDS
# -----------------------
# Run these once before executing the script:
# pip install pandas sentence-transformers spacy
# python -m spacy download en_core_web_lg

import pandas as pd
from sentence_transformers import SentenceTransformer, util
import spacy

# -----------------------
# CONFIGURATION
# -----------------------

# Acceptance thresholds, applied only when no perfect match is found.
# A TweetNLP topic is accepted when its BERT score reaches BERT_THRESHOLD;
# failing that, it is accepted when its spaCy score reaches SPACY_THRESHOLD.
BERT_THRESHOLD = 0.5
SPACY_THRESHOLD = 0.65

# Load both models once at start-up; they are reused for every topic pair.
print("🔄 Loading models...")
# Sentence-BERT model whose embeddings are compared with cosine similarity.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
# spaCy pipeline used for Doc.similarity(); the en_core_web_lg package must
# already be downloaded (see the installation commands above).
nlp_spacy = spacy.load("en_core_web_lg")

# TweetNLP topic labels that every Twitter API topic is compared against.
tweetnlp_topics = [
    "arts and culture",
    "business and entrepreneurs",
    "celebrity and pop culture",
    "diaries and daily life",
    "family",
    "fashion and style",
    "film tv and video",
    "fitness and health",
    "food and dining",
    "gaming",
    "learning and educational",
    "music",
    "news and social concern",
    "other hobbies",
    "relationships",
    "science and technology",
    "sports",
    "travel and adventure",
    "youth and student life",
]

# Sample Twitter API topics; swap in your own dataset before running.
twitter_api_topics = [
    "american football",
    "artificial intelligence",
    "arts and culture",
    "fashion and beauty",
    "designer fashion",
    "gaming",
    "technology",
    "MSFT",
]

# -----------------------
# PROCESSING LOGIC
# -----------------------

# Floating-point rounding means the cosine score of two identical texts is not
# guaranteed to be exactly 1.0 (it can come out a hair below or above), so a
# "perfect" score is detected with a small tolerance rather than `== 1.0`.
PERFECT_MATCH_TOLERANCE = 1e-5


def _is_perfect(score):
    """Return True when *score* is 1.0 up to floating-point noise."""
    return score >= 1.0 - PERFECT_MATCH_TOLERANCE


def _make_row(api_topic, nlp_topic, bert_score, spacy_score, decision):
    """Build one output record for an (API topic, TweetNLP topic) pair.

    Both similarity scores are rounded to three decimals for the CSV.
    """
    return {
        "Twitter API Topic": api_topic,
        "TweetNLP Topic": nlp_topic,
        "BERT Similarity": round(bert_score, 3),
        "spaCy Similarity": round(spacy_score, 3),
        "Decision": decision,
    }


def _label_candidates(api_topic, scores):
    """Apply the decision rules to every candidate of one API topic.

    Args:
        api_topic: The Twitter API topic being aligned.
        scores: ``(tweetnlp_topic, bert_score, spacy_score)`` tuples, already
            sorted best match first.

    Returns:
        A list with one row dict per candidate. When a perfect match exists
        its row comes first, followed by the remaining candidates in their
        given order; otherwise rows follow the order of *scores*.
    """
    # Rule 1: the best-ranked candidate with a perfect BERT or spaCy score
    # wins outright. The label stays "BERT == 1" in both cases.
    perfect = next(
        (
            entry for entry in scores
            if _is_perfect(entry[1]) or _is_perfect(entry[2])
        ),
        None,
    )
    if perfect is not None:
        pm_topic, pm_bert, pm_spacy = perfect
        rows = [_make_row(api_topic, pm_topic, pm_bert, pm_spacy, "BERT == 1")]
        rows.extend(
            _make_row(api_topic, topic, bert, spa, "High match already accepted")
            for topic, bert, spa in scores
            if topic != pm_topic
        )
        return rows

    # Rule 2: threshold checks, BERT taking precedence over spaCy.
    # `None` marks a candidate that passed neither threshold.
    labelled = []
    for topic, bert, spa in scores:
        if bert >= BERT_THRESHOLD:
            decision = "BERT >= 0.5"
        elif spa >= SPACY_THRESHOLD:
            decision = "spaCy >= 0.65"
        else:
            decision = None
        labelled.append((topic, bert, spa, decision))

    # Rules 3 and 4: candidates below both thresholds are dismissed when some
    # other candidate was accepted, and sent to manual annotation otherwise.
    any_accepted = any(decision is not None for *_, decision in labelled)
    fallback = (
        "High match already accepted" if any_accepted
        else "Manual Annotation Required"
    )
    return [
        _make_row(api_topic, topic, bert, spa, decision or fallback)
        for topic, bert, spa, decision in labelled
    ]


# The TweetNLP topics never change between API topics, so embed and parse
# each of them once up front instead of once per (API topic, TweetNLP topic).
tweetnlp_embeddings = [
    bert_model.encode(nlp_topic, convert_to_tensor=True)
    for nlp_topic in tweetnlp_topics
]
tweetnlp_docs = [nlp_spacy(nlp_topic) for nlp_topic in tweetnlp_topics]

results = []

for api_topic in twitter_api_topics:
    api_embedding = bert_model.encode(api_topic, convert_to_tensor=True)
    api_doc = nlp_spacy(api_topic)

    # Step 1: score this API topic against every TweetNLP topic.
    scores = [
        (
            nlp_topic,
            util.cos_sim(api_embedding, nlp_embedding).item(),
            api_doc.similarity(nlp_doc),
        )
        for nlp_topic, nlp_embedding, nlp_doc in zip(
            tweetnlp_topics, tweetnlp_embeddings, tweetnlp_docs
        )
    ]

    # Best match first: rank by BERT score, breaking ties with spaCy score.
    scores.sort(key=lambda entry: (entry[1], entry[2]), reverse=True)

    # Steps 2-4: turn the ranked scores into labelled output rows.
    results.extend(_label_candidates(api_topic, scores))

# -----------------------
# SAVE RESULTS
# -----------------------

# Column order of the output CSV. Passing it explicitly keeps the header, and
# keeps the sort below from raising KeyError, when `results` is empty.
RESULT_COLUMNS = [
    "Twitter API Topic",
    "TweetNLP Topic",
    "BERT Similarity",
    "spaCy Similarity",
    "Decision",
]
OUTPUT_CSV = "topic_alignment_final.csv"

df = pd.DataFrame(results, columns=RESULT_COLUMNS)
# Group rows by API topic, strongest BERT match first within each group.
df = df.sort_values(
    by=["Twitter API Topic", "BERT Similarity"],
    ascending=[True, False],
)
df.to_csv(OUTPUT_CSV, index=False)

print(f"\n Results saved to {OUTPUT_CSV}")
