In [3]:
pip install transformers datasets torch sentence-transformers scikit-learn seqeval pandas numpy nltk fuzzywuzzy

Active code page: 1252
Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import json
import pandas as pd
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer, util

# -------------------------------
# 1. Parse Functions
# -------------------------------
def _parse_span_fragments(span_text, fallback):
    """Split an offset string such as ``"9 19;20 30"`` into ``(start, end)`` tuples.

    Returns ``[fallback]`` if any fragment is malformed, so a record that the
    caller already accepted is never rejected because of a trailing fragment.
    """
    fragments = []
    for fragment in span_text.split(';'):
        bounds = fragment.split()
        if len(bounds) != 2:
            return [fallback]
        try:
            fragments.append((int(bounds[0]), int(bounds[1])))
        except ValueError:
            return [fallback]
    return fragments


def parse_original_ann(ann_file):
    """Parse the text-bound (``T``) annotations of an ``.ann`` file.

    Each accepted line has exactly three tab-separated fields:
    ``<id>\\t<label> <start> <end>[;<start> <end>...]\\t<text>``.
    Blank lines, lines not starting with ``T``, lines with a different number
    of fields, and lines whose offsets are not integers are skipped.

    Args:
        ann_file: Path of the annotation file (read as UTF-8).

    Returns:
        A list of dicts with the keys ``id``, ``label``, ``start``, ``end``,
        ``text`` and ``spans``. ``start``/``end`` describe the first fragment
        only; ``spans`` lists every ``(start, end)`` fragment, so the offsets
        of discontinuous annotations are not lost.
    """
    annotations = []
    with open(ann_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip() or not line.startswith('T'):
                continue
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue
            ann_id, label_span, text = parts
            try:
                # maxsplit=2 keeps "19;20 30" together as the third token.
                label, start, end = label_span.split(' ', 2)
                start_offset = int(start)
                # Only the first fragment's end is used for 'end'.
                end_offset = int(end.split(';')[0])
            except ValueError:
                # Wrong token count or non-integer offsets: skip the line.
                continue
            annotations.append({
                'id': ann_id,
                'label': label,
                'start': start_offset,
                'end': end_offset,
                'text': text,
                'spans': _parse_span_fragments(
                    f"{start} {end}", (start_offset, end_offset)
                ),
            })
    return annotations


def parse_sct_ann(ann_file):
    """Read the ``TT`` lines of an ``.ann`` file and extract their first concept.

    A usable line has at least two tab-separated fields, and its second field
    contains at least two ``|``-separated pieces: the code and its description.
    Any further pieces on the line are ignored. Lines that do not start with
    ``TT`` or lack these pieces are skipped.

    Args:
        ann_file: Path of the annotation file (read as UTF-8).

    Returns:
        A list of dicts with the keys ``id``, ``label`` (always
        ``'SCT_Entity'``), ``snomed_code`` and ``snomed_text``.
    """
    with open(ann_file, 'r', encoding='utf-8') as handle:
        candidate_rows = [row for row in handle if row.startswith('TT')]

    records = []
    for row in candidate_rows:
        fields = row.split('\t')
        if len(fields) < 2:
            # No concept field on this line.
            continue
        concept_pieces = fields[1].split('|')
        if len(concept_pieces) < 2:
            # A code without a description (or neither) cannot be matched.
            continue
        records.append({
            'id': fields[0],
            'label': 'SCT_Entity',
            'snomed_code': concept_pieces[0].strip(),
            'snomed_text': concept_pieces[1].strip(),
        })
    return records


def build_combined_data(file_list, data_dir):
    """Load the paired ``original`` and ``sct`` annotations for each listed file.

    Args:
        file_list: Iterable of document names, with or without a trailing
            ``.txt`` extension.
        data_dir: Directory containing the ``original`` and ``sct``
            sub-directories of ``.ann`` files.

    Returns:
        A dict mapping each base name to
        ``{'original': [...], 'sct': [...]}``. Documents for which either
        annotation file is missing are left out.
    """
    suffix = '.txt'
    original_dir = os.path.join(data_dir, "original")
    sct_dir = os.path.join(data_dir, "sct")

    combined_data = {}
    for txt_file in file_list:
        # Strip only a trailing ".txt". str.replace would also remove ".txt"
        # from the middle of a name, and os.path.splitext would mangle
        # extension-less names that contain a dot (e.g. "LIPITOR.667").
        base = txt_file[:-len(suffix)] if txt_file.endswith(suffix) else txt_file
        original_ann_file = os.path.join(original_dir, base + '.ann')
        sct_ann_file = os.path.join(sct_dir, base + '.ann')

        if os.path.exists(original_ann_file) and os.path.exists(sct_ann_file):
            combined_data[base] = {
                'original': parse_original_ann(original_ann_file),
                'sct': parse_sct_ann(sct_ann_file),
            }
    return combined_data


# -------------------------------
# 2. Matching Functions
# -------------------------------
def match_with_fuzzywuzzy(adr_text, sct_annotations):
    """Pick the SCT annotation whose ``snomed_text`` best matches ``adr_text``.

    Candidates are scored with ``fuzz.token_set_ratio``; on ties the earliest
    candidate wins.

    Args:
        adr_text: The ADR mention to match.
        sct_annotations: Iterable of dicts that each have a ``snomed_text`` key.

    Returns:
        ``(best_annotation, score)``. When there are no candidates the result
        is ``(None, 0)``, the same "no match" value that
        ``match_with_embeddings`` returns.
    """
    best_match, max_score = None, -1
    for sct_ann in sct_annotations:
        score = fuzz.token_set_ratio(adr_text, sct_ann['snomed_text'])
        # Strict '>' keeps the first of several equally good candidates.
        if score > max_score:
            max_score = score
            best_match = sct_ann
    if best_match is None:
        # Nothing to compare against: report 0 instead of leaking the
        # internal -1 starting value into the results.
        return None, 0
    return best_match, max_score


def match_with_embeddings(adr_text, sct_annotations, model):
    """Find the SCT annotation closest to ``adr_text`` by cosine similarity.

    Args:
        adr_text: The ADR mention to match.
        sct_annotations: Sequence of dicts that each have a ``snomed_text`` key.
        model: Object whose ``encode(..., convert_to_tensor=True)`` output is
            accepted by ``util.cos_sim``.

    Returns:
        ``(best_annotation, similarity)``, or ``(None, 0)`` when
        ``sct_annotations`` is empty.
    """
    if sct_annotations:
        query_vector = model.encode(adr_text, convert_to_tensor=True)
        candidate_texts = [candidate['snomed_text'] for candidate in sct_annotations]
        candidate_vectors = model.encode(candidate_texts, convert_to_tensor=True)
        # One query was encoded, so only the first row of the result is used.
        similarity_row = util.cos_sim(query_vector, candidate_vectors)[0]
        best_index = similarity_row.argmax().item()
        return sct_annotations[best_index], similarity_row[best_index].item()
    return None, 0


# -------------------------------
# 3. Main Task 6
# -------------------------------
def main(
    data_dir=r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\data\CADEC.v2",
    file_list_path=r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\outputs\task5\step5_sampled_files.txt",
    output_dir=r"C:\Users\satya\Downloads\Miimansa Problem\Assignment\outputs\task6",
    model_name='all-MiniLM-L6-v2',
):
    """Match every ADR mention of the sampled files to an SCT annotation.

    Each ADR annotation is matched against the SCT annotations of the same
    file twice, once with fuzzy string matching and once with sentence
    embeddings. Every match is printed and all results are written to
    ``task6_matches.csv`` and ``task6_matches.json`` in ``output_dir``.

    Args:
        data_dir: Directory holding the ``original`` and ``sct`` annotation
            sub-directories.
        file_list_path: Text file with one sampled document name per line.
        output_dir: Directory for the result files; created if missing.
        model_name: Name passed to ``SentenceTransformer``.

    The defaults reproduce the previously hard-coded locations, so calling
    ``main()`` with no arguments behaves as before.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("📥 Loading sentence transformer model...")
    model = SentenceTransformer(model_name)
    print("✅ Model loaded.")

    # Explicit encoding, like every other file access in this cell; without
    # it the platform default (code page 1252 in the captured run) is used.
    with open(file_list_path, 'r', encoding='utf-8') as f:
        sampled_files = [line.strip() for line in f if line.strip()]

    data = build_combined_data(sampled_files, data_dir)

    results = []
    print("\n--- Starting Task 6 Matching ---")
    for i, (filename, content) in enumerate(data.items()):
        original_adrs = [ann for ann in content['original'] if ann['label'] == 'ADR']
        if not original_adrs:
            # Files without ADR mentions still count towards the i/N counter.
            continue

        print(f"\n--- Processing File: {filename} ({i+1}/{len(data)}) ---")
        for adr_ann in original_adrs:
            fuzzy_match, fuzzy_score = match_with_fuzzywuzzy(adr_ann['text'], content['sct'])
            embedding_match, embedding_score = match_with_embeddings(adr_ann['text'], content['sct'], model)

            # A matcher returns None when the file has no SCT annotations.
            res = {
                'file': filename,
                'original_text': adr_ann['text'],
                'fuzzy_match_text': fuzzy_match['snomed_text'] if fuzzy_match else 'N/A',
                'fuzzy_match_code': fuzzy_match['snomed_code'] if fuzzy_match else 'N/A',
                'fuzzy_score': fuzzy_score,
                'embedding_match_text': embedding_match['snomed_text'] if embedding_match else 'N/A',
                'embedding_match_code': embedding_match['snomed_code'] if embedding_match else 'N/A',
                'embedding_score': embedding_score
            }
            results.append(res)

            # Print live
            print(f"ADR: '{adr_ann['text']}'")
            print(f"  A) Fuzzy: '{res['fuzzy_match_text']}' (Code: {res['fuzzy_match_code']}) - {res['fuzzy_score']:.2f}")
            print(f"  B) Embed: '{res['embedding_match_text']}' (Code: {res['embedding_match_code']}) - {res['embedding_score']:.2f}")

    # Save outputs
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(output_dir, "task6_matches.csv"), index=False)
    with open(os.path.join(output_dir, "task6_matches.json"), "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"\n✅ Task 6 complete! Results saved to {output_dir}")


# Run the pipeline when this code is executed directly (script or notebook
# cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


📥 Loading sentence transformer model...
✅ Model loaded.

--- Starting Task 6 Matching ---

--- Processing File: LIPITOR.667 (2/50) ---
ADR: 'Muscle pain in left elbow'
  A) Fuzzy: 'Pain in elbow' (Code: 74323005) - 100.00
  B) Embed: 'Pain in elbow' (Code: 74323005) - 0.83
ADR: 'pain in feet'
  A) Fuzzy: 'Pain in elbow' (Code: 74323005) - 74.00
  B) Embed: 'Foot pain' (Code: 47933007) - 0.92

--- Processing File: ARTHROTEC.57 (3/50) ---
ADR: 'spotting'
  A) Fuzzy: 'Menstrual spotting' (Code: 9126005) - 100.00
  B) Embed: 'Menstrual spotting' (Code: 9126005) - 0.74
ADR: 'spotting problems'
  A) Fuzzy: 'Menstrual spotting' (Code: 9126005) - 64.00
  B) Embed: 'Menstrual spotting' (Code: 9126005) - 0.76

--- Processing File: LIPITOR.512 (4/50) ---
ADR: 'neck pain'
  A) Fuzzy: 'Neck pain' (Code: 81680005) - 100.00
  B) Embed: 'Neck pain' (Code: 81680005) - 1.00
ADR: 'memory loss'
  A) Fuzzy: 'Mentally dull' (Code: 419723007) - 42.00
  B) Embed: 'Amnesia' (Code: 48167000) - 0.55
ADR: 'Brain 