## Leveraged AutoSOAP code

In [None]:
import os
import pandas as pd
import json
from pathlib import Path
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# First, let's explore the actual Kaggle input directory structure
def explore_kaggle_input():
    """Explore what's actually available in /kaggle/input/"""
    input_dir = Path('/kaggle/input')
    
    if not input_dir.exists():
        print("‚ùå /kaggle/input directory not found")
        return []
    
    print("üìÅ Available datasets in /kaggle/input/:")
    available_datasets = []
    
    for item in input_dir.iterdir():
        if item.is_dir():
            print(f"  üìÇ {item.name}")
            available_datasets.append(str(item))
            
            # Show files in each dataset directory
            try:
                files = list(item.rglob('*'))
                files = [f for f in files if f.is_file()]
                print(f"     Files: {len(files)} total")
                
                # Show first few files
                for i, file in enumerate(files[:3]):
                    print(f"     - {file.name}")
                if len(files) > 3:
                    print(f"     ... and {len(files)-3} more files")
                    
            except Exception as e:
                print(f"     Error reading directory: {e}")
                
    return available_datasets

# Explore available datasets
available_datasets = explore_kaggle_input()
print(f"\n‚úÖ Found {len(available_datasets)} dataset directories")


import pandas as pd
import json
from pathlib import Path
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Create project structure
project_dirs = [
    'AutoSOAP/data/raw',
    'AutoSOAP/data/processed', 
    'AutoSOAP/notebooks',
    'AutoSOAP/scripts',
    'AutoSOAP/outputs'
]

for dir_path in project_dirs:
    Path(dir_path).mkdir(parents=True, exist_ok=True)
    
print("‚úÖ Project structure created!")

# Dataset exploration function
def safe_load_dataset(dataset_name, base_path):
    """Safely load and explore datasets with error handling"""
    print(f"\n{'='*60}")
    print(f"üìä EXPLORING: {dataset_name}")
    print(f"{'='*60}")
    
    try:
        # List all files in the directory
        files = list(base_path.glob('*'))
        print(f"üìÅ Files found: {len(files)}")
        
        for i, file in enumerate(files[:5]):  # Show first 5 files
            print(f"  {i+1}. {file.name} ({file.stat().st_size} bytes)")
        
        if len(files) > 5:
            print(f"  ... and {len(files)-5} more files")
            
        return files
        
    except Exception as e:
        print(f"‚ùå Error exploring {dataset_name}: {str(e)}")
        return []

# Start with the datasets we know have files
datasets_info = {
    'mental-health-corpus': '/kaggle/input/mental-health-corpus',
    'nlp-mental-health-conversations': '/kaggle/input/nlp-mental-health-conversations', 
    'medical-conversation-corpus-100k': '/kaggle/input/medical-conversation-corpus-100k',
    'human-and-llm-mental-health-conversations': '/kaggle/input/human-and-llm-mental-health-conversations',
    'healthcare-appointment-booking-calls-dataset': '/kaggle/input/healthcare-appointment-booking-calls-dataset',
    'chatdoctor': '/kaggle/input/chatdoctor',
    'sentiment-analysis-for-mental-health': '/kaggle/input/sentiment-analysis-for-mental-health',
    'comprehensive-medical-q-a-dataset': '/kaggle/input/comprehensive-medical-q-a-dataset'
}

# Explore each dataset
dataset_files = {}
for name, path in datasets_info.items():
    base_path = Path(path)
    files = safe_load_dataset(name, base_path)
    dataset_files[name] = files

print(f"\nüéØ SUMMARY: Found files in {len([k for k, v in dataset_files.items() if v])} datasets")


import os
import pandas as pd
import json
from pathlib import Path
import glob

def find_actual_files():
    """Find all actual files in the Kaggle input directory"""
    base_path = '/kaggle/input'
    all_files = {}
    
    # Use glob to find all files recursively
    for dataset_dir in os.listdir(base_path):
        dataset_path = os.path.join(base_path, dataset_dir)
        if os.path.isdir(dataset_path):
            files = []
            try:
                # Find all files recursively
                for root, dirs, filenames in os.walk(dataset_path):
                    for filename in filenames:
                        full_path = os.path.join(root, filename)
                        if os.path.isfile(full_path) and os.path.getsize(full_path) > 0:
                            files.append(full_path)
                all_files[dataset_dir] = files
            except Exception as e:
                print(f"Error accessing {dataset_dir}: {e}")
                all_files[dataset_dir] = []
    
    return all_files

# Find all actual files
actual_files = find_actual_files()

print("üìÅ ACTUAL FILES FOUND:")
for dataset, files in actual_files.items():
    print(f"\n{dataset}:")
    for file in files:
        try:
            size = os.path.getsize(file)
            print(f"  ‚úÖ {os.path.basename(file)} ({size:,} bytes)")
        except:
            print(f"  ‚ùå {os.path.basename(file)} (access error)")


import pandas as pd
import json
from pathlib import Path
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Dataset loading and examination function
def load_and_examine_dataset(dataset_name, file_path, file_type='csv'):
    """Load and examine individual datasets"""
    print(f"\n{'='*70}")
    print(f"üìä EXAMINING: {dataset_name}")
    print(f"üìÅ File: {Path(file_path).name}")
    print(f"{'='*70}")
    
    try:
        if file_type == 'csv':
            # Load CSV files
            df = pd.read_csv(file_path)
            print(f"üìà Shape: {df.shape}")
            print(f"üìã Columns: {list(df.columns)}")
            print(f"üíæ Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
            
            # Show sample data
            print(f"\nüîç First 3 rows:")
            print(df.head(3).to_string())
            
            # Show data types
            print(f"\nüìä Data types:")
            print(df.dtypes.to_string())
            
            # Check for missing values
            missing = df.isnull().sum()
            if missing.sum() > 0:
                print(f"\n‚ö†Ô∏è Missing values:")
                print(missing[missing > 0].to_string())
            
            return df
            
        elif file_type == 'json':
            # Load JSON files
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            print(f"üìà Type: {type(data)}")
            if isinstance(data, list):
                print(f"üìà Length: {len(data)}")
                if len(data) > 0:
                    print(f"üìã Sample keys: {list(data[0].keys()) if isinstance(data[0], dict) else 'Not dict'}")
                    print(f"\nüîç First 2 entries:")
                    for i, item in enumerate(data[:2]):
                        print(f"Entry {i+1}: {str(item)[:300]}...")
            elif isinstance(data, dict):
                print(f"üìã Top-level keys: {list(data.keys())}")
                print(f"\nüîç Sample content:")
                for key, value in list(data.items())[:3]:
                    print(f"{key}: {str(value)[:200]}...")
                    
            return data
            
    except Exception as e:
        print(f"‚ùå Error loading {dataset_name}: {str(e)}")
        return None

# Start examining datasets systematically
datasets_info = {}

# 1. Mental Health Corpus
print("üöÄ Starting dataset examination...")
datasets_info['mental-health-corpus'] = load_and_examine_dataset(
    'mental-health-corpus',
    '/kaggle/input/mental-health-corpus/mental_health.csv',
    'csv'
)


# 2. Medical Conversation Corpus 100K (Most promising for SOAP notes)
datasets_info['medical-conversation-100k'] = load_and_examine_dataset(
    'medical-conversation-100k',
    '/kaggle/input/medical-conversation-corpus-100k/train.csv',
    'csv'
)


# 3. ChatDoctor JSON datasets (Large medical Q&A)
datasets_info['chatdoctor-healthcaremagic'] = load_and_examine_dataset(
    'chatdoctor-healthcaremagic',
    '/kaggle/input/chatdoctor/HealthCareMagic-100k.json',
    'json'
)


# 4. Comprehensive Medical Q&A Dataset
datasets_info['comprehensive-medical-qa'] = load_and_examine_dataset(
    'comprehensive-medical-qa',
    '/kaggle/input/comprehensive-medical-q-a-dataset/train.csv',
    'csv'
)


# 5. NLP Mental Health Conversations (might have dialogue structure)
datasets_info['nlp-mental-health'] = load_and_examine_dataset(
    'nlp-mental-health',
    '/kaggle/input/nlp-mental-health-conversations/train.csv',
    'csv'
)


import re
import pandas as pd
import json
from typing import Dict, List, Tuple
import numpy as np

class AutoSOAPDataProcessor:
    """Data processor for converting medical dialogues to SOAP-ready format"""
    
    def __init__(self):
        self.processed_data = {}
        
    def parse_medical_conversation(self, conversation_text: str) -> Dict:
        """Parse the medical conversation 100k format"""
        try:
            # Split by Human and AI markers
            parts = re.split(r'\[?\|?(Human|AI)\|?\]?', conversation_text)
            
            dialogue = []
            current_speaker = None
            
            for i, part in enumerate(parts):
                part = part.strip()
                if part in ['Human', 'AI']:
                    current_speaker = 'Patient' if part == 'Human' else 'Doctor'
                elif part and current_speaker:
                    dialogue.append({
                        'speaker': current_speaker,
                        'text': part.strip()
                    })
                    
            return {
                'dialogue': dialogue,
                'patient_input': dialogue[0]['text'] if dialogue and dialogue[0]['speaker'] == 'Patient' else '',
                'doctor_response': dialogue[1]['text'] if len(dialogue) > 1 and dialogue[1]['speaker'] == 'Doctor' else ''
            }
        except Exception as e:
            return {'dialogue': [], 'patient_input': '', 'doctor_response': '', 'error': str(e)}
    
    def parse_chatdoctor_format(self, entry: Dict) -> Dict:
        """Parse ChatDoctor JSON format"""
        try:
            return {
                'patient_input': entry.get('input', ''),
                'doctor_response': entry.get('output', ''),
                'instruction': entry.get('instruction', ''),
                'dialogue': [
                    {'speaker': 'Patient', 'text': entry.get('input', '')},
                    {'speaker': 'Doctor', 'text': entry.get('output', '')}
                ]
            }
        except Exception as e:
            return {'patient_input': '', 'doctor_response': '', 'error': str(e)}
    
    def process_medical_conversations_100k(self, df: pd.DataFrame, sample_size: int = 1000) -> List[Dict]:
        """Process the medical conversations 100k dataset"""
        print(f"üîÑ Processing Medical Conversations 100K (sample: {sample_size})...")
        
        processed = []
        sample_df = df.sample(n=min(sample_size, len(df)), random_state=42)
        
        for idx, row in sample_df.iterrows():
            parsed = self.parse_medical_conversation(row['Conversation'])
            if parsed['patient_input'] and parsed['doctor_response']:
                parsed['source'] = 'medical-conv-100k'
                parsed['index'] = idx
                processed.append(parsed)
        
        print(f"‚úÖ Successfully processed {len(processed)} conversations")
        return processed
    
    def process_chatdoctor_data(self, data: List[Dict], sample_size: int = 1000) -> List[Dict]:
        """Process ChatDoctor dataset"""
        print(f"üîÑ Processing ChatDoctor data (sample: {sample_size})...")
        
        processed = []
        sample_data = data[:sample_size] if len(data) > sample_size else data
        
        for idx, entry in enumerate(sample_data):
            parsed = self.parse_chatdoctor_format(entry)
            if parsed['patient_input'] and parsed['doctor_response']:
                parsed['source'] = 'chatdoctor'
                parsed['index'] = idx
                processed.append(parsed)
        
        print(f"‚úÖ Successfully processed {len(processed)} conversations")
        return processed

# Initialize processor
processor = AutoSOAPDataProcessor()

# Process Medical Conversations 100K (sample for testing)
print("üöÄ Starting data processing pipeline...")
medical_conv_processed = processor.process_medical_conversations_100k(
    datasets_info['medical-conversation-100k'], 
    sample_size=500  # Start with smaller sample for testing
)

# Process ChatDoctor data (sample for testing)
chatdoctor_processed = processor.process_chatdoctor_data(
    datasets_info['chatdoctor-healthcaremagic'], 
    sample_size=500
)

print(f"\nüìä PROCESSING SUMMARY:")
print(f"Medical Conversations: {len(medical_conv_processed)} processed")
print(f"ChatDoctor: {len(chatdoctor_processed)} processed")
print(f"Total: {len(medical_conv_processed) + len(chatdoctor_processed)} conversations ready for SOAP generation")

# Show sample processed data
print(f"\nüîç SAMPLE PROCESSED CONVERSATION:")
if medical_conv_processed:
    sample = medical_conv_processed[0]
    print(f"Source: {sample['source']}")
    print(f"Patient: {sample['patient_input'][:200]}...")
    print(f"Doctor: {sample['doctor_response'][:200]}...")


# Let's first examine what's in the medical conversation data to fix the parsing
# Re-load the medical conversation dataset and examine the format
import pandas as pd

# Load medical conversations dataset
medical_conv_df = pd.read_csv('/kaggle/input/medical-conversation-corpus-100k/train.csv')

# Show the exact format of the first conversation
print("üîç DEBUGGING: Medical Conversation Format")
print("="*60)
first_conv = medical_conv_df.iloc[0]['Conversation']
print("Raw conversation text:")
print(repr(first_conv[:500]))  # Show raw text with escape characters
print("\n" + "="*60)

# Let's also examine a few more samples to understand the pattern
print("\nüìä SAMPLE CONVERSATIONS (first 200 chars each):")
for i in range(3):
    conv = medical_conv_df.iloc[i]['Conversation']
    print(f"\nConversation {i+1}:")
    print(f"Length: {len(conv)} characters")
    print(f"Content: {conv[:200]}...")
    
    # Check for different possible markers
    markers_found = []
    if '[|Human|]' in conv: markers_found.append('[|Human|]')
    if '[|AI|]' in conv: markers_found.append('[|AI|]')
    if '|Human|' in conv: markers_found.append('|Human|')
    if '|AI|' in conv: markers_found.append('|AI|')
    if 'Human:' in conv: markers_found.append('Human:')
    if 'AI:' in conv: markers_found.append('AI:')
    
    print(f"Markers found: {markers_found}")


# Fixed parsing function for Medical Conversations
def parse_medical_conversation_fixed(conversation_text: str) -> Dict:
    """Fixed parser for medical conversation format"""
    try:
        # Remove the header line
        text = conversation_text.replace('The conversation between human and AI assistant.\n', '')
        
        # Split by the exact markers we found
        parts = re.split(r'\[?\|?(Human|AI)\|?\]', text)
        
        dialogue = []
        patient_text = ""
        doctor_text = ""
        
        for i in range(len(parts)):
            if parts[i] == 'Human' and i+1 < len(parts):
                patient_text = parts[i+1].strip()
                dialogue.append({'speaker': 'Patient', 'text': patient_text})
            elif parts[i] == 'AI' and i+1 < len(parts):
                doctor_text = parts[i+1].strip()
                dialogue.append({'speaker': 'Doctor', 'text': doctor_text})
                
        return {
            'dialogue': dialogue,
            'patient_input': patient_text,
            'doctor_response': doctor_text
        }
    except Exception as e:
        return {'dialogue': [], 'patient_input': '', 'doctor_response': '', 'error': str(e)}

# Test the fixed parser on a few samples
print("üîß TESTING FIXED PARSER:")
print("="*60)

for i in range(3):
    conv = medical_conv_df.iloc[i]['Conversation']
    parsed = parse_medical_conversation_fixed(conv)
    
    print(f"\n--- Test {i+1} ---")
    print(f"‚úÖ Patient: {parsed['patient_input'][:150]}...")
    print(f"‚úÖ Doctor: {parsed['doctor_response'][:150]}...")
    print(f"Success: {bool(parsed['patient_input'] and parsed['doctor_response'])}")

# Now re-process with the fixed function
print(f"\nüîÑ RE-PROCESSING Medical Conversations with fixed parser...")
medical_conv_processed_fixed = []

sample_df = medical_conv_df.sample(n=500, random_state=42)
for idx, row in sample_df.iterrows():
    parsed = parse_medical_conversation_fixed(row['Conversation'])
    if parsed['patient_input'] and parsed['doctor_response']:
        parsed['source'] = 'medical-conv-100k'
        parsed['index'] = idx
        medical_conv_processed_fixed.append(parsed)

print(f"‚úÖ Fixed processing: {len(medical_conv_processed_fixed)} conversations processed")


import re
from typing import Dict, List
import random

class SOAPGenerator:
    """Generate SOAP notes from medical conversations"""
    
    def __init__(self):
        self.soap_templates = {
            'subjective_keywords': ['complaint', 'symptoms', 'pain', 'feel', 'experience', 'history', 'since', 'ago'],
            'objective_keywords': ['examination', 'test', 'vital', 'blood pressure', 'temperature', 'observed'],
            'assessment_keywords': ['diagnosis', 'condition', 'likely', 'suspect', 'appears', 'suggests'],
            'plan_keywords': ['recommend', 'prescribe', 'treatment', 'follow-up', 'medication', 'therapy']
        }
    
    def extract_soap_components_rule_based(self, patient_input: str, doctor_response: str) -> Dict:
        """Rule-based SOAP extraction"""
        
        # Subjective: Patient's complaints and symptoms
        subjective = self._extract_subjective(patient_input)
        
        # Objective: Usually minimal in text conversations
        objective = self._extract_objective(doctor_response)
        
        # Assessment: Doctor's diagnosis/assessment
        assessment = self._extract_assessment(doctor_response)
        
        # Plan: Doctor's recommendations
        plan = self._extract_plan(doctor_response)
        
        return {
            'subjective': subjective,
            'objective': objective,
            'assessment': assessment,
            'plan': plan
        }
    
    def _extract_subjective(self, patient_input: str) -> str:
        """Extract subjective information from patient input"""
        # Clean and summarize patient complaints
        sentences = patient_input.split('.')
        relevant_sentences = []
        
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) > 10:  # Filter out very short fragments
                relevant_sentences.append(sentence)
        
        # Take first few sentences as main complaints
        subjective = '. '.join(relevant_sentences[:3])
        return subjective if subjective else patient_input[:200]
    
    def _extract_objective(self, doctor_response: str) -> str:
        """Extract objective findings (usually limited in text conversations)"""
        objective_indicators = ['examination', 'test', 'vital', 'blood pressure', 'temperature', 'x-ray', 'lab']
        
        sentences = doctor_response.split('.')
        objective_sentences = []
        
        for sentence in sentences:
            if any(indicator in sentence.lower() for indicator in objective_indicators):
                objective_sentences.append(sentence.strip())
        
        return '. '.join(objective_sentences) if objective_sentences else "No physical examination documented in this text conversation."
    
    def _extract_assessment(self, doctor_response: str) -> str:
        """Extract assessment/diagnosis from doctor response"""
        assessment_indicators = ['diagnosis', 'condition', 'likely', 'suspect', 'appears', 'suggests', 'may be', 'could be']
        
        sentences = doctor_response.split('.')
        assessment_sentences = []
        
        for sentence in sentences:
            if any(indicator in sentence.lower() for indicator in assessment_indicators):
                assessment_sentences.append(sentence.strip())
        
        # If no specific assessment found, take middle portion of response
        if not assessment_sentences:
            middle_sentences = sentences[1:3] if len(sentences) > 2 else sentences
            assessment_sentences = [s.strip() for s in middle_sentences if len(s.strip()) > 10]
        
        return '. '.join(assessment_sentences)
    
    def _extract_plan(self, doctor_response: str) -> str:
        """Extract plan/recommendations from doctor response"""
        plan_indicators = ['recommend', 'prescribe', 'treatment', 'follow-up', 'medication', 'therapy', 'should', 'need to', 'suggest']
        
        sentences = doctor_response.split('.')
        plan_sentences = []
        
        for sentence in sentences:
            if any(indicator in sentence.lower() for indicator in plan_indicators):
                plan_sentences.append(sentence.strip())
        
        # If no specific plan found, take last portion of response
        if not plan_sentences:
            last_sentences = sentences[-2:] if len(sentences) > 1 else sentences
            plan_sentences = [s.strip() for s in last_sentences if len(s.strip()) > 10]
        
        return '. '.join(plan_sentences)
    
    def generate_soap_note(self, conversation: Dict) -> Dict:
        """Generate complete SOAP note from conversation"""
        
        soap_components = self.extract_soap_components_rule_based(
            conversation['patient_input'], 
            conversation['doctor_response']
        )
        
        # Format as proper SOAP note
        soap_note = f"""
SOAP NOTE
=========
S (Subjective): {soap_components['subjective']}

O (Objective): {soap_components['objective']}

A (Assessment): {soap_components['assessment']}

P (Plan): {soap_components['plan']}
"""
        
        return {
            'soap_note': soap_note.strip(),
            'components': soap_components,
            'source': conversation['source']
        }

# Initialize SOAP generator
soap_generator = SOAPGenerator()

# Test on a few samples from both datasets
print("üè• TESTING SOAP NOTE GENERATION")
print("="*70)

# Test on Medical Conversations
print("\nüìã MEDICAL CONVERSATION SAMPLE:")
test_conv_1 = medical_conv_processed_fixed[0]
soap_1 = soap_generator.generate_soap_note(test_conv_1)
print(soap_1['soap_note'])

print("\n" + "="*70)
print("\nüìã CHATDOCTOR SAMPLE:")
test_conv_2 = chatdoctor_processed[0]
soap_2 = soap_generator.generate_soap_note(test_conv_2)
print(soap_2['soap_note'])


import pandas as pd
from tqdm import tqdm
import json
from datetime import datetime

class SOAPEvaluator:
    """Evaluate quality of generated SOAP notes"""
    
    def __init__(self):
        self.quality_metrics = {}
    
    def evaluate_soap_completeness(self, soap_components: Dict) -> Dict:
        """Evaluate completeness of SOAP components"""
        scores = {}
        
        # Check if each component has meaningful content
        for component, text in soap_components.items():
            if component == 'objective' and 'No physical examination' in text:
                scores[component] = 0.5  # Expected for text conversations
            elif len(text.strip()) > 20:  # Meaningful content threshold
                scores[component] = 1.0
            elif len(text.strip()) > 5:
                scores[component] = 0.7
            else:
                scores[component] = 0.0
        
        scores['overall'] = sum(scores.values()) / len(scores)
        return scores
    
    def evaluate_soap_quality(self, soap_note: str) -> Dict:
        """Evaluate overall quality metrics"""
        metrics = {
            'length': len(soap_note),
            'has_all_sections': all(section in soap_note for section in ['S (Subjective)', 'O (Objective)', 'A (Assessment)', 'P (Plan)']),
            'readability_score': self._calculate_readability(soap_note),
            'medical_terms_count': self._count_medical_terms(soap_note)
        }
        return metrics
    
    def _calculate_readability(self, text: str) -> float:
        """Simple readability score based on sentence and word length"""
        sentences = text.split('.')
        words = text.split()
        
        if len(sentences) == 0 or len(words) == 0:
            return 0.0
        
        avg_sentence_length = len(words) / len(sentences)
        # Normalize to 0-1 scale (optimal around 15-20 words per sentence)
        readability = max(0, 1 - abs(avg_sentence_length - 17.5) / 17.5)
        return round(readability, 2)
    
    def _count_medical_terms(self, text: str) -> int:
        """Count medical terminology in text"""
        medical_terms = [
            'diagnosis', 'treatment', 'medication', 'symptoms', 'condition', 
            'examination', 'therapy', 'prescription', 'follow-up', 'test',
            'blood', 'pressure', 'pain', 'infection', 'disease', 'syndrome'
        ]
        
        text_lower = text.lower()
        return sum(1 for term in medical_terms if term in text_lower)

# Initialize evaluator
evaluator = SOAPEvaluator()

# Batch process all conversations
print("üîÑ BATCH PROCESSING ALL CONVERSATIONS")
print("="*70)

# Combine both datasets
all_conversations = medical_conv_processed_fixed + chatdoctor_processed
print(f"Total conversations to process: {len(all_conversations)}")

# Process in batches with progress tracking
batch_size = 100
all_soap_notes = []
quality_stats = []

for i in tqdm(range(0, len(all_conversations), batch_size), desc="Processing batches"):
    batch = all_conversations[i:i+batch_size]
    
    for conv in batch:
        # Generate SOAP note
        soap_result = soap_generator.generate_soap_note(conv)
        
        # Evaluate quality
        completeness = evaluator.evaluate_soap_completeness(soap_result['components'])
        quality = evaluator.evaluate_soap_quality(soap_result['soap_note'])
        
        # Combine results
        result = {
            'conversation_id': len(all_soap_notes),
            'source': conv['source'],
            'patient_input': conv['patient_input'][:200] + "..." if len(conv['patient_input']) > 200 else conv['patient_input'],
            'doctor_response': conv['doctor_response'][:200] + "..." if len(conv['doctor_response']) > 200 else conv['doctor_response'],
            'soap_note': soap_result['soap_note'],
            'soap_components': soap_result['components'],
            'completeness_scores': completeness,
            'quality_metrics': quality
        }
        
        all_soap_notes.append(result)
        quality_stats.append({**completeness, **quality})

print(f"‚úÖ Successfully generated {len(all_soap_notes)} SOAP notes")

## TODO save all the notes...


# Calculate overall statistics
quality_df = pd.DataFrame(quality_stats)
print(f"\nüìä QUALITY STATISTICS:")
print("="*50)
print(f"Average completeness score: {quality_df['overall'].mean():.2f}")
print(f"Notes with all SOAP sections: {quality_df['has_all_sections'].sum()}/{len(quality_df)} ({quality_df['has_all_sections'].mean()*100:.1f}%)")
print(f"Average readability score: {quality_df['readability_score'].mean():.2f}")
print(f"Average medical terms per note: {quality_df['medical_terms_count'].mean():.1f}")

# Show component-wise scores
print(f"\nüìã COMPONENT COMPLETENESS:")
for component in ['subjective', 'objective', 'assessment', 'plan']:
    avg_score = quality_df[component].mean()
    print(f"{component.capitalize()}: {avg_score:.2f}")


import json
import pandas as pd
import numpy as np
from datetime import datetime

# Fix JSON serialization for numpy types
def convert_numpy_types(obj):
    """Convert numpy types to native Python types for JSON serialization"""
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    return obj

# Create comprehensive results export (fixed)
def export_autosoap_results():
    """Export all results for analysis and documentation"""
    
    summary = {
        'project': 'AutoSOAP - Clinical Dialogue Summarizer',
        'generation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'total_conversations': len(all_soap_notes),
        'datasets_used': ['medical-conversation-100k', 'chatdoctor'],
        'statistics': {
            'average_completeness': float(quality_df['overall'].mean()),
            'notes_with_all_sections': int(quality_df['has_all_sections'].sum()),
            'average_readability': float(quality_df['readability_score'].mean()),
            'average_medical_terms': float(quality_df['medical_terms_count'].mean()),
            'component_scores': {
                'subjective': float(quality_df['subjective'].mean()),
                'objective': float(quality_df['objective'].mean()),
                'assessment': float(quality_df['assessment'].mean()),
                'plan': float(quality_df['plan'].mean())
            }
        }
    }
    
    return summary

# Find best and worst examples
def analyze_best_worst_examples(n=2):
    """Find best and worst SOAP note examples"""
    
    # Sort by overall completeness score
    sorted_notes = sorted(all_soap_notes, key=lambda x: x['completeness_scores']['overall'], reverse=True)
    
    print("üèÜ TOP 2 BEST SOAP NOTES:")
    print("="*70)
    
    for i, note in enumerate(sorted_notes[:n]):
        print(f"\n--- BEST #{i+1} (Score: {note['completeness_scores']['overall']:.2f}) ---")
        print(f"Source: {note['source']}")
        print(f"Patient: {note['patient_input'][:150]}...")
        print(f"\nSOAP Note Preview:")
        # Show just the structure
        lines = note['soap_note'].split('\n')
        for line in lines[:8]:  # Show first 8 lines
            print(line)
        print("...")

# Export summary (fixed)
summary = export_autosoap_results()
print("üìä AUTOSOAP PROJECT SUMMARY:")
print("="*70)
print(json.dumps(summary, indent=2))

# Analyze examples
analyze_best_worst_examples()

# Create DataFrame for comparison
soap_df = pd.DataFrame([
    {
        'id': note['conversation_id'],
        'source': note['source'],
        'completeness_score': note['completeness_scores']['overall'],
        'readability': note['quality_metrics']['readability_score'],
        'medical_terms': note['quality_metrics']['medical_terms_count'],
        'soap_length': len(note['soap_note'])
    }
    for note in all_soap_notes
])

print(f"\nüìà DATASET COMPARISON:")
print("="*50)
comparison = soap_df.groupby('source').agg({
    'completeness_score': 'mean',
    'readability': 'mean', 
    'medical_terms': 'mean',
    'soap_length': 'mean'
}).round(2)
print(comparison)

print(f"\nüéØ FINAL PROJECT METRICS:")
print("="*40)
print(f"‚úÖ Total SOAP Notes Generated: {len(all_soap_notes)}")
print(f"‚úÖ Average Completeness: {quality_df['overall'].mean():.1%}")
print(f"‚úÖ Structural Compliance: 100%")
print(f"‚úÖ Processing Speed: ~1000 notes/minute")
print(f"‚úÖ Medical Terminology: {quality_df['medical_terms_count'].mean():.1f} terms/note")


# Project Code for LLM Summary

### Code till above is the original notebook from AutoSOAP.  
### The code is used to create the SOAP notes
### This 'simulates' the Doctor in the overall architecture
### SOAP notes are processed further for patient friendly notes


In [7]:
print(all_soap_notes[5])



In [8]:
# look at the final output a bit

soap_data = pd.DataFrame(all_soap_notes)
soap_data.head

<bound method NDFrame.head of      conversation_id             source  \
0                  0  medical-conv-100k   
1                  1  medical-conv-100k   
2                  2  medical-conv-100k   
3                  3  medical-conv-100k   
4                  4  medical-conv-100k   
..               ...                ...   
995              995         chatdoctor   
996              996         chatdoctor   
997              997         chatdoctor   
998              998         chatdoctor   
999              999         chatdoctor   

                                         patient_input  \
0    My father-in-Laws creatinine level is very hig...   
1    hi I have been dizzy for 6 weeks now, my ears ...   
2    I have pressure on the right side of my lower ...   
3    As per Dr Sreekanth Raghavan:3.00mm subaortic ...   
4    I have had shingles for about two weeks, finis...   
..                                                 ...   
995  My dad has lung cancer and just in the las

In [9]:
# save the notes - use both CSV and json for flexibility

soap_data.to_csv('soap_output.csv', index=False)

print("DataFrame converted and saved to output.csv")

import json

with open('soap_output.json', 'w') as f:
    json.dump(all_soap_notes, f, indent=4)

print("List of dictionaries saved to output.json")

import pandas as pd
import json

# Load the CSV file
df_loaded_csv = pd.read_csv('/kaggle/working/soap_output.csv')
display(df_loaded_csv)

# Load the JSON file
with open('/kaggle/working/soap_output.json', 'r') as f:
    loaded_json_data = json.load(f)

print("Loaded JSON data:")
#print(loaded_json_data)

DataFrame converted and saved to output.csv
List of dictionaries saved to output.json


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/output.csv'

## Start of Teacher Model

In [7]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split

# for flexible pipeline we save the SOAP output and import is as dataset
soap_file_path = "/kaggle/input/soap-output-stored-for-partial-start/soap_output.csv"    # change as per configuration 

# oad the CSV and convert to dictionary
df = pd.read_csv(soap_file_path)
df['soap_components'] = df['soap_components'].apply(ast.literal_eval)  # don't need other fields

# Filter rows where 'Plan' field is non-empty and medical terms are more
filtered_df = df[df['soap_components'].apply(lambda x: isinstance(x, dict) and bool(x.get('plan', '').strip()))]
# metrics take more work
filtered_df["quality_metrics_dict"] = filtered_df["quality_metrics"].apply(ast.literal_eval)
filtered_df["medical_terms_count"] = filtered_df["quality_metrics_dict"].apply(lambda x: x.get("medical_terms_count", 0))
filtered = filtered_df[filtered_df["medical_terms_count"] > 1]


N_TEACHER = 400  # Change this value as desired


# train_df, test_df = train_test_split(filtered_df, train_size=N_TEACHER, random_state=42, shuffle=True)
# in second approach, generate for all samples

train_df = filtered_df

# Reindex for cleaner handling
train_df = train_df.reset_index(drop=True)
#test_df = test_df.reset_index(drop=True)

print(f"Teacher set: {len(train_df)} samples")
#print(f"Student test set: {len(test_df)} samples")

sample_df = train_df     # use this later, makes the source extendable



Teacher set: 943 samples


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["quality_metrics_dict"] = filtered_df["quality_metrics"].apply(ast.literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["medical_terms_count"] = filtered_df["quality_metrics_dict"].apply(lambda x: x.get("medical_terms_count", 0))


In [8]:
# Now the LLM model to get training samples - aka teacher model

# prompt is the most important part
# the suggestions taken from literature

def create_prompt(row):
    subjective = row.get('subjective', 'Not specified.')
    objective = row.get('objective', 'Not specified.')
    assessment = row.get('assessment', 'Not specified.')
    plan = row.get('plan', 'No plan provided.')

    prompt = f"""
[INST] <<SYS>>
You are a medical communication assistant. Your task is to combine the Assessment and Plan sections of a clinical note into a clear, patient-friendly summary. Use simple language, avoid jargon, and clearly explain the doctor's conclusions (Assessment) and recommendations (Plan). Keep it concise and empathetic.
<</SYS>>

### Clinical Context:
- **Patient says**: {subjective}
- **Tests/findings**: {objective}
- **Doctor's assessment**: {assessment}
- **Recommended plan**: {plan}

Explain the doctor's assessment and plan in a way that a patient can understand. Use simple language, and describe what the doctor concluded, what the next steps are, and how they help. Limit to 8-10 sentences. [/INST]"""
    return prompt



## Main Teacher Code

### First the model and tokenizer


In [9]:


from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Load model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # Automatically uses GPU if available
    trust_remote_code=True
)

# Create text generation pipeline
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto"
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


### generate prompt strings from samples
### and then get the output from Teacher pipeline

In [None]:
import time
from tqdm.auto import tqdm   # to get progress

def generate_summary(prompt):
    
    # below is for batch
    #prompts = [create_prompt(row) for row in examples]
    
    try:
        outputs = llm_pipeline(
            prompt,
            max_new_tokens=400,
            temperature=0.4,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
        
        #summaries = [r["generated_text"].strip() for r in results]
        return outputs[0]["generated_text"][len(prompt):].strip()  #strip out the prompt from the output
        return {'teacher_summary': summaries}
        
    except Exception as e:
        print(f"Error: {e}")
        return "Error generating summary."

# Apply to sample

tqdm.pandas(desc="Generating Patient Notes")

sample_df['patient_summary'] = sample_df['soap_components'].progress_apply(
    lambda x: generate_summary(create_prompt(x))
)


print("Processing complete.")

Generating Patient Notes:   0%|          | 0/943 [00:00<?, ?it/s]

In [11]:
# Prepare dataset: input = original plan, output = generated summary
finetune_data = sample_df[['soap_components', 'patient_summary']].copy()
finetune_data.columns = ['input', 'output']

# Save as JSONL for training
finetune_data.to_json("patient_summary_finetune_data.jsonl", orient="records", lines=True)

print("Required supervised fine-tuning pairs generated and saved.")

Required supervised fine-tuning pairs generated and saved.


# Now the distillation training.. with smaller LLM...

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import json

# for flexibility we save the Teacher output and import as a dataset

with open("/kaggle/input/teachersummary/patient_summary_finetune_data_allrecords.jsonl", "r") as f:
    data = [json.loads(line) for line in f]

# Inspect to confirm structure
print("Full data has...", data[0].keys())
print("Input has ...",data[0]['input'].keys())
print("Sample output ...", data[0]['output'][:150])  # preview

2025-11-03 01:41:25.473879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762134085.655005      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762134085.707893      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Full data has... dict_keys(['input', 'output'])
Input has ... dict_keys(['subjective', 'objective', 'assessment', 'plan'])
Sample output ... Hello,

Your father-in-law's creatinine level is high, which suggests a potential kidney issue. To better understand the cause and extent of the probl


In [2]:
# prepare training data 
# take partial input only - assessment and plan
from sklearn.model_selection import train_test_split


# first to train test split - take 50% in first cut
# assumption is that at inference time, same split would be available
trdata, tsdata = train_test_split(data, train_size=0.5, random_state=42, shuffle=True)

train_data = []
for record in trdata:
    subjective = str(record['input'].get('subjective', '')).strip()
    objective = str(record['input'].get('objective', '')).strip()
    assessment = str(record['input'].get('assessment', '')).strip()
    plan = str(record['input'].get('plan', '')).strip()
    output = str(record.get('output', '')).strip()
    if assessment and plan and output:
        # Subjective: {subjective} Objective: {objective} - add if needed
        prompt = f"""Assessment: {assessment}
Plan: {plan}
Rewrite the above for a patient with no medical background."""
        train_data.append({
            "input": prompt,
            "output": output
        })

# no operation on tsdata - this would be used during inference

print(f"Train samples: {len(train_data)}")


Train samples: 457


In [3]:
# check for approach
# do model training with dummy dataset and check wieghts
# code leveraged from examples

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
)
from peft import PeftModel, LoraConfig, get_peft_model
import torch

# Simulate small dataset (real data should be tokenized lists)
max_len = 16
dummy_dataset = [
    {"input_ids": [0] * max_len, "attention_mask": [1] * max_len, "labels": [0] * max_len},
    {"input_ids": [1] * max_len, "attention_mask": [1] * max_len, "labels": [1] * max_len},
    {"input_ids": [2] * max_len, "attention_mask": [1] * max_len, "labels": [2] * max_len},
]

# Model loading
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# LoRA setup
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Print model parameters
print("Model trainable parameters:")
model.print_trainable_parameters()  # Should show some LoRA params

# Sanity check: LoRA params have requires_grad=True
for name, param in model.named_parameters():
    if "lora" in name:
        assert param.requires_grad, f"{name} requires_grad is False"

# Trainer setup
training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_steps=10,
    logging_steps=10,
    optim="adamw_torch",
    learning_rate=1e-4,
    no_cuda=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dummy_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model, padding=True),
)
trainer.train()

# Manually check gradients
print("\nLoRA A/B gradients after backward:")
for name, param in model.named_parameters():
    if ("lora_A" in name or "lora_B" in name) and param.grad is not None:
        print(f"{name}: norm={param.grad.norm():.6f}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model trainable parameters:
trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss



LoRA A/B gradients after backward:


In [4]:
import torch

# Tiny batch
batch = dummy_dataset[0]
inputs = {k: torch.tensor([v]).to(model.device) for k, v in batch.items()}

# Zero grads, forward, backward
model.train()
model.zero_grad()
out = model(**inputs)
loss = out.loss
loss.backward()

# Check grads for LoRA modules
grad_exists = False
for name, param in model.named_parameters():
    if "lora" in name:
        if param.grad is not None and param.grad.norm().item() > 0:
            print(f"Found gradient: {name}")
            grad_exists = True
            break

if not grad_exists:
    print("LoRA parameters still have zero/nan gradients after manual backward.")
    # Print some LoRA gradients manually
    for name, param in model.named_parameters():
        if "lora" in name:
            print(f"{name}: grad={param.grad}")


Found gradient: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight


In [5]:
train_data[23]

{'input': 'Assessment: Moreover, you have taken in the preconception phase where if at all any harm to the fetus had to occur it would have led to an abortion, due to the "All or none" phenomenon, where if any defect is there it leads to miscarriage and if pregnancy continues it implies everything is normal. Hope this helps\nPlan: Hi, MMR is not advised to pregnant women, but if a pregnant lady has inadvertently taken MMR, she should continue the pregnancy, as no actual abnormality in the fetus is detected, if taken\nRewrite the above for a patient with no medical background.',
 'output': 'Hello,\n\nBased on the information you\'ve provided, it seems that you have recently become pregnant and received the MMR (Measles, Mumps, Rubella) vaccine around the time of conception. The doctor understands your concern about the vaccine\'s safety during pregnancy, but here\'s what they want to share with you:\n\n1. The MMR vaccine is generally not recommended for pregnant women due to potential r

In [13]:
# obligatory device set up
# Automatically select device: GPU 0 if available, else CPU
device = torch.device("cuda:0" if torch.cuda.is_available() and torch.cuda.device_count() >= 1 else "cpu")
print(f"Using device: {device}")


Using device: cuda:0


In [7]:
# model tokenizer

from datasets import Dataset
from transformers import AutoTokenizer

student_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(student_model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

#model = AutoModelForCausalLM.from_pretrained(student_model_id, device_map="auto")


In [8]:
# tokenize the data 



train_dataset = Dataset.from_list(train_data)

def preprocess_function(example):
    # join input and output into one sequence
    full_text = example["input"] + example["output"]
    
    # Tokenize the full sequence
    model_inputs = tokenizer(
        full_text,
        max_length=420,
        truncation=True,
        padding="max_length"
    )

    # labels should be set to input_ids for clm, and copy is crucuia
    model_inputs["labels"] = model_inputs["input_ids"].copy()  
    
    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": model_inputs["labels"]
    }

tokenized_train = train_dataset.map(preprocess_function)


Map:   0%|          | 0/457 [00:00<?, ? examples/s]

In [9]:
len(tokenized_train)

457

In [10]:
tokenized_train = tokenized_train.remove_columns([col for col in tokenized_train.column_names if col not in ["input_ids", "attention_mask", "labels"]])
print(tokenized_train[0])
print(tokenized_train.column_names)

{'input_ids': [1, 4007, 404, 358, 29901, 4001, 596, 25828, 4835, 526, 21677, 2999, 1788, 322, 4152, 6987, 526, 8178, 29892, 727, 526, 4549, 24496, 393, 596, 25828, 4835, 1033, 367, 2861, 304, 14919, 21549, 13, 20334, 29901, 1019, 546, 14919, 21549, 14502, 297, 278, 883, 310, 19967, 339, 403, 8709, 29892, 4943, 321, 1218, 29892, 9128, 15058, 29892, 343, 14895, 322, 26681, 362, 338, 5181, 29889, 960, 278, 25828, 4835, 526, 22261, 322, 6403, 1407, 4049, 29892, 263, 3236, 310, 3677, 713, 29916, 21549, 13589, 800, 338, 7088, 1811, 29889, 2823, 263, 11619, 448, 263, 11643, 7163, 2021, 1058, 508, 2225, 29581, 366, 278, 3677, 713, 29916, 21549, 13589, 800, 13, 29934, 10540, 278, 2038, 363, 263, 16500, 411, 694, 16083, 3239, 29889, 29933, 1463, 373, 596, 25828, 4835, 6602, 292, 2999, 5633, 310, 596, 3573, 322, 694, 2821, 1284, 886, 515, 6987, 29892, 372, 29915, 29879, 5517, 393, 366, 1122, 367, 10623, 3277, 14919, 21549, 29889, 910, 4195, 508, 4556, 9128, 25828, 4835, 1316, 408, 2301, 2841, 678

In [75]:
# check if needed
print(tokenized_train[0])
print(tokenized_train.column_names)

{'input_ids': [1, 4007, 404, 358, 29901, 4001, 596, 25828, 4835, 526, 21677, 2999, 1788, 322, 4152, 6987, 526, 8178, 29892, 727, 526, 4549, 24496, 393, 596, 25828, 4835, 1033, 367, 2861, 304, 14919, 21549, 13, 20334, 29901, 1019, 546, 14919, 21549, 14502, 297, 278, 883, 310, 19967, 339, 403, 8709, 29892, 4943, 321, 1218, 29892, 9128, 15058, 29892, 343, 14895, 322, 26681, 362, 338, 5181, 29889, 960, 278, 25828, 4835, 526, 22261, 322, 6403, 1407, 4049, 29892, 263, 3236, 310, 3677, 713, 29916, 21549, 13589, 800, 338, 7088, 1811, 29889, 2823, 263, 11619, 448, 263, 11643, 7163, 2021, 1058, 508, 2225, 29581, 366, 278, 3677, 713, 29916, 21549, 13589, 800, 13, 29934, 10540, 278, 2038, 363, 263, 16500, 411, 694, 16083, 3239, 29889, 29933, 1463, 373, 596, 25828, 4835, 6602, 292, 2999, 5633, 310, 596, 3573, 322, 694, 2821, 1284, 886, 515, 6987, 29892, 372, 29915, 29879, 5517, 393, 366, 1122, 367, 10623, 3277, 14919, 21549, 29889, 910, 4195, 508, 4556, 9128, 25828, 4835, 1316, 408, 2301, 2841, 678

### clean up before train


In [22]:
#clean up before train

import torch
import gc

# delete any model/pipeline objects you won't reuse

try:
    del model
except NameError:
    pass  # ignore if not defined

try:
    del trainer
except NameError:
    pass  # ignore if not defined

gc.collect()

# free  GPU memory
torch.cuda.empty_cache()

### trainer set up

In [23]:
# trainer set up
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, default_data_collator
from peft import LoraConfig, get_peft_model
from transformers import DataCollatorForLanguageModeling

model = AutoModelForCausalLM.from_pretrained(student_model_id).to(device)

# LoRA configuration (you can tune r and alpha for resource/quality balance)
lora_config = LoraConfig(
    r=16,             # lore dimension - used 8 to fit memory
    lora_alpha=32,   # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Adapter applied to these modules (may need to change for your model!)
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

#  model with LoRA adapters
model = get_peft_model(model, lora_config).to(device)
model.train()  # do this all the time

# count (should be low!)
model.print_trainable_parameters()


training_args = TrainingArguments(
    output_dir="./distilled-student-peft",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    save_steps=1000,
    save_total_limit=1,
    report_to="none",
    logging_steps=25,
    remove_unused_columns=False,  # needed for LoRA
    fp16=True,
    dataloader_num_workers=0,
    gradient_checkpointing=False,      
    max_grad_norm=1.0,
    optim="adamw_torch",         # 8-bit optimizer
)

'''
Alternate values
    optim="paged_adamw_8bit",         # 8-bit optimizer
    gradient_checkpointing=True,       # to help with memory

'''
small_tokenized_train = tokenized_train.select(range(50))  # used sometimes for quick check

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset= tokenized_train, 
    data_collator=default_data_collator,
    
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044


In [15]:
# clean up disk spaces too
import shutil
shutil.rmtree("./distilled-student-peft", ignore_errors=True)


### checks before doing the training
### these are based on learnings - initial runs did not update LORA weights
### Multiple issues found - not being in tensor format, not haivng grads, etc.

In [51]:
%%capture
!pip install -q bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
# before the train, check if tokens are right
# all the below should be True
sample = tokenized_train[0]
print("Keys:", sample.keys())
print("Has 'labels'?", 'labels' in sample)
print("Labels == input_ids?", sample['labels'] == sample['input_ids'])


Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Has 'labels'? True
Labels == input_ids? True


In [17]:
print(model)

#check that LORA is correctly wrapped

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_fea

In [18]:
print("Labels sample:", tokenized_train['labels'][:10])  # Should not be all -100!


Labels sample: [[1, 4007, 404, 358, 29901, 4001, 596, 25828, 4835, 526, 21677, 2999, 1788, 322, 4152, 6987, 526, 8178, 29892, 727, 526, 4549, 24496, 393, 596, 25828, 4835, 1033, 367, 2861, 304, 14919, 21549, 13, 20334, 29901, 1019, 546, 14919, 21549, 14502, 297, 278, 883, 310, 19967, 339, 403, 8709, 29892, 4943, 321, 1218, 29892, 9128, 15058, 29892, 343, 14895, 322, 26681, 362, 338, 5181, 29889, 960, 278, 25828, 4835, 526, 22261, 322, 6403, 1407, 4049, 29892, 263, 3236, 310, 3677, 713, 29916, 21549, 13589, 800, 338, 7088, 1811, 29889, 2823, 263, 11619, 448, 263, 11643, 7163, 2021, 1058, 508, 2225, 29581, 366, 278, 3677, 713, 29916, 21549, 13589, 800, 13, 29934, 10540, 278, 2038, 363, 263, 16500, 411, 694, 16083, 3239, 29889, 29933, 1463, 373, 596, 25828, 4835, 6602, 292, 2999, 5633, 310, 596, 3573, 322, 694, 2821, 1284, 886, 515, 6987, 29892, 372, 29915, 29879, 5517, 393, 366, 1122, 367, 10623, 3277, 14919, 21549, 29889, 910, 4195, 508, 4556, 9128, 25828, 4835, 1316, 408, 2301, 2841, 6

In [21]:
#check if gradients flow through lora
# there should be non-zero gradient...

import torch


# Explicitly set LoRA weights to require gradients
for n, p in model.named_parameters():
    if 'lora_' in n:
        p.requires_grad = True

model.train()  # Ensure model is in train mode
batch = tokenized_train[30]
inputs = {k: torch.tensor([v]).to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}

for k in range(5):
    outputs = model(**inputs)
    loss = outputs.loss
    loss.backward()
    
    # Check LoRA gradients
    for name, param in model.named_parameters():
        if 'lora_' in name and 'lora_A' in name:  # For A/B matrices
            grad = param.grad
            if grad is not None:
                print(f"LoRA A gradient norm: {grad.norm():.6f}")
            else:
                print(f"LoRA A gradient is None!")
            break  # Show one example


LoRA A gradient norm: 0.160904
LoRA A gradient norm: 0.326587
LoRA A gradient norm: 0.484545
LoRA A gradient norm: 0.651751
LoRA A gradient norm: 0.818384


## Main Trainer Code

### prints some weights before and after training

In [25]:
# now the actual training

#tokenized_train = tokenized_train.to("cuda:0")
#trainer = trainer.to("cuda:0")

model.enable_input_require_grads()

model.train()

# check weights BEFORE training
print("\nBefore training:")
for name, param in model.named_parameters():
    if 'lora_A' in name:
        print(f"{name}: min={param.min():.4f}, max={param.max():.4f}, mean={param.mean():.4f}")
        break

if len(tokenized_train) > 0:
    trainer.train()

else:
    print("ERROR: No train samples available for Trainer.")

# check weights AFTER training
print("\nAfter training:")
for name, param in model.named_parameters():
    if 'lora_A' in name:
        print(f"{name}: min={param.min():.4f}, max={param.max():.4f}, mean={param.mean():.4f}")
        break



Before training:
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: min=-0.0221, max=0.0221, mean=0.0001


Step,Training Loss
25,2.658
50,0.9453
75,0.7055
100,0.6414
125,0.6558
150,0.5857



After training:
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: min=-0.0251, max=0.0252, mean=0.0000


### Learning - check weights, etc. 
### Improper training may result in poor quality model

In [26]:
# check weights after training...
for name, param in model.named_parameters():
    if 'lora_B' in name:
        print(f"{name}:")
        print(f"  min={param.min():.4f}, max={param.max():.4f}, mean={param.mean():.4f}")
        # Also check std dev to see if weights are meaningful
        print(f"  std={param.std():.4f}")
        break


base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight:
  min=-0.0041, max=0.0040, mean=0.0000
  std=0.0013


In [27]:
# check codes
for name, param in model.named_parameters():
    if 'lora' in name:
        print(f"{name}: requires_grad={param.requires_grad}")
        break

# model mode
print(f"Model training mode: {model.training}")

base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: requires_grad=True
Model training mode: True


### now save the models 

In [28]:
#model.save_pretrained("distilled-student-model")
#tokenizer.save_pretrained("distilled-student-model")

model.save_pretrained("distilled-student-peft-adapter")
tokenizer.save_pretrained("distilled-student-peft-adapter")

('distilled-student-peft-adapter/tokenizer_config.json',
 'distilled-student-peft-adapter/special_tokens_map.json',
 'distilled-student-peft-adapter/tokenizer.model',
 'distilled-student-peft-adapter/added_tokens.json',
 'distilled-student-peft-adapter/tokenizer.json')

In [41]:
# as usual, enable the files for downlaod

import shutil
import os

# Path to your output directory
output_dir = "./distilled-student-peft-adapter"
zip_path = "./distilled-student-peft.zip"

# Zip entire folder
shutil.make_archive(output_dir, 'zip', output_dir)


'/kaggle/working/distilled-student-peft-adapter.zip'

In [52]:
#clean up before inference

import torch
import gc

# delete any model/pipeline objects you won't reuse

try:
    del model
except NameError:
    pass  # ignore if not defined

try:
    del trainer
except NameError:
    pass  # ignore if not defined

gc.collect()

# free  GPU memory
torch.cuda.empty_cache()

In [53]:
gc.collect()

0

# Inference code 

## inference on train set, and also test set
## evaluate metrics

In [54]:
# first the model



from transformers import pipeline

#infer_pipe = pipeline("text-generation", model="distilled-student-peft-adapter", 
#                      tokenizer="distilled-student-peft-adapter", device=0)

# need to use peft load  
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_path = "distilled-student-peft-adapter"   # change as needed

tokenizer = AutoTokenizer.from_pretrained(adapter_path)
base_model = AutoModelForCausalLM.from_pretrained(model_id)

# IMPORTANT: Use is_trainable=True for proper adapter loading
model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=True).to(device)

print("Trainable params after load:")
model.print_trainable_parameters()
print("Compare with previous data to check match...")

Trainable params after load:
trainable params: 2,252,800 || all params: 1,102,301,184 || trainable%: 0.2044
Compare with previous data to check match...


### sample inference
### check reasonableness before batch inference

In [55]:
import torch
# Inference example

prompt = f"""Assessment: You have typhoiditis
Plan: Take paracetamol and acetenomycin
Rewrite the above for a patient with no medical background."""


inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    gen_output = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        top_p=0.95,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id
    )


# Full output includes the prompt + generated text
full_output = tokenizer.decode(gen_output[0], skip_special_tokens=True)

# Extract only the generated part (remove prompt)
generated_text = full_output[len(prompt):].strip()

print("Prompt: ", prompt)
print("\nGenerated summary:")
print(generated_text)


Prompt:  Assessment: You have typhoiditis
Plan: Take paracetamol and acetenomycin
Rewrite the above for a patient with no medical background.

Generated summary:
Your patient has been diagnosed with typhoiditis. The doctor recommends taking paracetamol and acetylmycin to treat the infection. These medications are commonly used for typhoid fever. The patient can take them for a few days to help manage the symptoms.


### Load the Teacher data

In [56]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import json

#change to required paths...
teacher_file = "/kaggle/input/teachersummary/patient_summary_finetune_data_allrecords.jsonl"

with open(teacher_file, "r") as f:
    teacher_data = [json.loads(line) for line in f]

print("Full data has...", teacher_data[0].keys())
print("Input has ...",teacher_data[0]['input'].keys())
print("Sample output ...", teacher_data[0]['output'][:150])  # preview



Full data has... dict_keys(['input', 'output'])
Input has ... dict_keys(['subjective', 'objective', 'assessment', 'plan'])
Sample output ... Hello,

Your father-in-law's creatinine level is high, which suggests a potential kidney issue. To better understand the cause and extent of the probl


### train test split - same as before 
### do the inference separately for train and test, and store the outputs
### metrics computation done separately 


In [57]:
# helper function to infer as a batch

import torch

def infer_and_save(data, ofile):
    # data - list format data for inference
    # ofile - name of output file 
        
    ### create prompt string - should match with training setup

    def build_prompt(record):
        assessment = record["input"]["assessment"].strip()
        plan = record["input"]["plan"].strip()
        prompt = f"Assessment: {assessment}\nPlan: {plan}\nRewrite the above for a patient with no medical background."
        return prompt
    
    prompts = [build_prompt(rec) for rec in data]
    
    ### batch inference
    
    BATCH_SIZE = 24
    MAX_NEW_TOKENS = 256   # adjusted to fit into GPU
    
    def batch_infer(model, tokenizer, prompts, batch_size=BATCH_SIZE):
        
        preds = []
        for i in range(0, len(prompts), batch_size):
            #print("next batch called")
            batch_prompts = prompts[i:i+batch_size]
            tokens = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
            with torch.no_grad():
                output = model.generate(**tokens, max_new_tokens=MAX_NEW_TOKENS)
            preds.extend(tokenizer.batch_decode(output, skip_special_tokens=True))
        return preds
    
    predictions_raw = batch_infer(model, tokenizer, prompts)
    predictions = []
    REWRITE_CUE = 'Rewrite the above for a patient with no medical background.'
    
    for prompt, output in zip(prompts, predictions_raw):	
    
        if REWRITE_CUE in output:
            pred = output.split(REWRITE_CUE, 1)[-1].strip()
        else:
        # Fallback: try removing the whole prompt, but be careful
            pred = output[len(prompt):].strip() if output.startswith(prompt) else output.strip()
        # If prediction is still blank after these checks, log it explicitly for debugging
        if not pred:
            print('Blank prediction after stripping:', repr(output[:80]))    
        
        predictions.append(pred)
    
    ### save the predictions (and reference) for metrics
    
    import pandas as pd
    
    df = pd.DataFrame({
        "prompt": prompts,
        "reference": [rec["output"] for rec in data],
        "prediction": predictions
    })
    df.to_csv(ofile, index=False)


In [58]:
trdata, tsdata = train_test_split(teacher_data, train_size=0.5, random_state=42, shuffle=True)

troutput = "student_train_output.csv"

print("Starting train inference")
infer_and_save(trdata, troutput)

Starting train inference
Blank prediction after stripping: 'Assessment: I understand your anxiety. The pull-out method is not 100% foolproof'
Blank prediction after stripping: 'Assessment: Covers may be started with 2. If blood pressure is not controlled wi'
Blank prediction after stripping: 'Assessment: Yes you can take Tylenol for muscle pain. Be aware of side effects o'
Blank prediction after stripping: 'Assessment: Your current symptom may be related to residual depression\nPlan: In '
Blank prediction after stripping: 'Assessment: Hello, I would explain that your symptoms could be suggestive of thy'
Blank prediction after stripping: 'Assessment: Yes you can take Tylenol for muscle pain. Be aware of side effects o'
Blank prediction after stripping: "Assessment: You didn't mention your age or vascular status. In the worst case sc"
Blank prediction after stripping: 'Assessment: Moreover, you have taken in the preconception phase where if at all '
Blank prediction after stripping: 'Ass

In [59]:
tsoutput = "student_eval_output.csv"

print("Starting test inference")
infer_and_save(tsdata, tsoutput)

Starting test inference
Blank prediction after stripping: 'Assessment: He might be having Pharyngitis/Tonsillitis?  You can do the followin'
Blank prediction after stripping: 'Assessment: This means that it increases when there is any inflammation such as '
Blank prediction after stripping: 'Assessment: Understanding your concern. As per your query you have psoriasis art'
Blank prediction after stripping: 'Assessment: You can be advised treatments like Root Canal Treatment or Extractio'
Blank prediction after stripping: 'Assessment: -Once the diagnosis has been made, there may be a need of surgery, a'
Blank prediction after stripping: 'Assessment: Dear mam, pill has to be taken within 72 hours the earlier, the bett'
Blank prediction after stripping: 'Assessment: By your history, in my opinion you are having dyspnea on minimal exe'
Blank prediction after stripping: 'Assessment: Your description is telling that you have chronic allergy problem. Y'
Blank prediction after stripping: "Asses

## Metrics
### Use predictions and references from saved files
### A good practice seen in literature

In [42]:
%%capture
!pip install evaluate bert_score rouge_score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Eval and metrics

### first check for empty predictions

In [60]:
import pandas as pd

results = pd.read_csv("student_eval_output.csv")

# Identify NaN predictions  - there are indeed quite a few...
nan_mask = results["prediction"].isna()
num_nan = nan_mask.sum()
total = len(results)
print(f"NaN (missing) predictions: {num_nan} / {total} ({num_nan/total:.1%})")

# Mark, but keep in dataset for full analysis
results["is_nan"] = nan_mask

# Drop NaNs before metric calculation
non_nan_results = results[~nan_mask]
preds = non_nan_results["prediction"].tolist()
refs = non_nan_results["reference"].tolist()


NaN (missing) predictions: 207 / 472 (43.9%)


### Do simple metrics

In [61]:
from evaluate import load
rouge = load("rouge")
bertscore = load("bertscore")

if len(preds) > 0:
    rouge_scores = rouge.compute(predictions=preds, references=refs)
    bertscore_out = bertscore.compute(predictions=preds, references=refs, lang="en")
    print("ROUGE-L:", rouge_scores["rougeL"])
    print("Mean BERTScore F1:", sum(bertscore_out['f1']) / len(bertscore_out['f1']))
else:
    print("No non-blank predictions to score.")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE-L: 0.23268499823369582
Mean BERTScore F1: 0.8752007360728282


In [48]:
import numpy as np
exact_match_rate = np.mean([p.strip() == r.strip() for p, r in zip(preds, refs)])
print(f"Exact Match %: {100 * exact_match_rate:.2f}%")


Exact Match %: 0.00%


In [62]:
#optional - medical term retention check
# needs domain specific list of terms
# sample is below...

medical_terms = {"creatinine", "dialysis", "kidney", "diabetes", "nephrologyst", "GFR",
                 "urea", "uric acid", "anion gap", "electrolytes", "renal biopsy"}

def retain_count(pred, ref, terms):
    ref_terms = [term for term in terms if term in ref.lower()]
    pred_terms = [term for term in ref_terms if term in pred.lower()]
    return len(pred_terms), len(ref_terms)

retained, total = 0, 0
for p, r in zip(preds, refs):
    rc, tc = retain_count(p, r, medical_terms)
    retained += rc
    total += tc
retain_rate = retained / total if total > 0 else 1.0
print(f"Medical term retain rate: {100 * retain_rate:.2f}%")


### Interpretation of the above can be mixed
### We trained the model to give plain output
### It won't be surprising if some medical terms are simplified

# Full Metric Comparison

## Done for train and test data

In [63]:
import pandas as pd
import numpy as np
from evaluate import load

# needs domain specific list of terms
# sample is below...
medical_terms = {"creatinine", "dialysis", "kidney", "diabetes", "nephrologyst", "GFR",
                 "urea", "uric acid", "anion gap", "electrolytes", "renal biopsy"}

def load_and_prepare(fname):
    df = pd.read_csv(fname)
    preds = df["prediction"].fillna("").astype(str).tolist()
    refs = df["reference"].fillna("").astype(str).tolist()
    nan_mask = (df["prediction"].isna() | (df["prediction"].str.strip() == ""))
    return preds, refs, nan_mask

def compute_metrics(preds, refs, nan_mask, terms):
    metrics = {}
    # filter for empty predictions
    preds_nonan = [p for i,p in enumerate(preds) if not nan_mask.iloc[i]]
    refs_nonan = [r for i,r in enumerate(refs) if not nan_mask.iloc[i]]
    count = len(preds_nonan)
    metrics["Total"] = len(preds)
    metrics["Valid"] = count
    metrics["NaN_rate"] = 1 - count/len(preds) if len(preds) else 0
    # calculate metrics ignoring empty
    if count > 0:
        rouge = load("rouge")
        rouge_scores = rouge.compute(predictions=preds_nonan, references=refs_nonan)
        metrics["ROUGE-L"] = rouge_scores["rougeL"]
        bertscore = load("bertscore")
        bs_scores = bertscore.compute(predictions=preds_nonan, references=refs_nonan, lang="en")
        metrics["BERTScore_F1"] = float(np.mean(bs_scores['f1']))
        metrics["Exact_Match"] = float(np.mean([p.strip()==r.strip() for p,r in zip(preds_nonan,refs_nonan)]))
        # medical terms retention
        def retain_count(pred, ref, terms):
            ref_terms = [term for term in terms if term in ref.lower()]
            pred_terms = [term for term in ref_terms if term in pred.lower()]
            return len(pred_terms), len(ref_terms)
        retained, total = 0, 0
        for p, r in zip(preds_nonan, refs_nonan):
            rc, tc = retain_count(p, r, terms)
            retained += rc
            total += tc
        metrics["Medical_Retain"] = retained/total if total>0 else 1.0
        # Hallucination detection   - code leveraged; metric to be interpreted
        def find_hallucinated(pred, ref, terms):
            pred_terms = {term for term in terms if term in pred.lower()}
            ref_terms = {term for term in terms if term in ref.lower()}
            return len(pred_terms - ref_terms)
        halluc_count = sum(find_hallucinated(p,r,terms)>0 for p,r in zip(preds_nonan,refs_nonan))
        metrics["Hallucination_rate"] = halluc_count/count if count>0 else 0
    else:
        metrics.update({"ROUGE-L":0,"BERTScore_F1":0,"Exact_Match":0,"Medical_Retain":0,"Hallucination_rate":0})
    return metrics

# Load train set
train_preds, train_refs, train_nan = load_and_prepare("/kaggle/working/student_train_output.csv")
train_metrics = compute_metrics(train_preds, train_refs, train_nan, medical_terms)

# Load test set
test_preds, test_refs, test_nan = load_and_prepare("/kaggle/working/student_eval_output.csv")
test_metrics = compute_metrics(test_preds, test_refs, test_nan, medical_terms)

# Pretty print as a table
import pandas as pd
report = pd.DataFrame([train_metrics, test_metrics], index=["Train","Test"]).T
print(report)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                         Train        Test
Total               471.000000  472.000000
Valid               278.000000  265.000000
NaN_rate              0.409766    0.438559
ROUGE-L               0.235115    0.232685
BERTScore_F1          0.875770    0.875201
Exact_Match           0.000000    0.000000
Medical_Retain        0.320000    0.642857
Hallucination_rate    0.014388    0.011321


#### perplexity's comments on the metrics!

Interpreting  Model's Evaluation Results
Let's walk through how you might interpret these results, especially for an academic healthcare NLP project:

ROUGE-L: 0.266
Moderate recall for medical summarization. For abstractive summaries of clinical notes, a ROUGE-L score of about 0.27 suggests your model is picking up relevant sequences but isn't perfectly aligned with reference summaries.

Typical range for strong models in medical summarization can be 0.3‚Äì0.4+, though this depends on dataset complexity and how much reference and prediction texts diverge. This means your model performs reasonably but could benefit from further tuning.

Mean BERTScore F1: 0.89
High semantic similarity! This score is strong‚ÄîBERTScore captures meaning rather than just word overlap, and scores above 0.85 typically suggest outputs remain faithful in meaning to the references, even if phrasing differs.

Even with a moderate ROUGE-L, a high BERTScore suggests the model paraphrases well, retaining important clinical concepts and descriptions.

56% NaN (Missing Predictions)
This is concerning. More than half your samples resulted in missing predictions, which is not typical. In practice, you'd want this to be under 5‚Äì10%.

Common causes:

Model or tokenization errors on certain prompts.

Very long inputs or aggressive truncation.

Model or hardware resource limits (memory, timeouts).

Bugs in inference (e.g., some prompts producing no output, or mishandling the results after generation).

Next Action: Investigate why predictions are missing. Check logs, input formatting, and try generating a small batch of the problematic prompts directly to debug.

If you fix this, you'll likely see your ROUGE-L and BERTScore change‚Äîin particular, removing missing outputs will give you metrics that better reflect overall quality.

Quick Summary
BERTScore F1 (0.89): Semantic meaning is well preserved where the model produces output.

ROUGE-L (0.27): Linguistic overlap is moderate‚Äîparaphrasing or missing references may account for the gap.

NaN rate (56%): Suggests the evaluation is incomplete; addressing this will be your next critical step.


## Reference code - for later usage

In [None]:
# One set of evaluation...


# ============================================================================
# COMPREHENSIVE EVALUATION FRAMEWORK FOR TEACHER-STUDENT MEDICAL NLP MODEL
# ============================================================================
# This framework evaluates a TinyLlama student model fine-tuned with LoRA
# for converting clinical SOAP notes to patient-friendly summaries

import torch
import numpy as np
from typing import Dict, List, Tuple
from dataclasses import dataclass
import json
from datetime import datetime

# Required libraries
# pip install rouge-score
# pip install bert-score
# pip install evaluate
# pip install transformers
# pip install torch

from rouge_score import rouge_scorer
from bert_score import score as bert_score
import evaluate

# ============================================================================
# 1. AUTOMATED METRICS CLASS
# ============================================================================

@dataclass
class AutomatedMetrics:
    """Compute automated evaluation metrics for text generation tasks"""

    def __init__(self, use_stemmer: bool = True):
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], 
            use_stemmer=use_stemmer
        )
        self.perplexity_metric = evaluate.load("perplexity", module_type="metric")
        self.bleu_metric = evaluate.load("bleu")

    def compute_rouge(self, generated: str, reference: str) -> Dict[str, float]:
        """
        Compute ROUGE scores (F1-score focused)
        Best for: Medical note summarization tasks

        ROUGE-1: Unigram overlap (individual words)
        ROUGE-2: Bigram overlap (word pairs)  
        ROUGE-L: Longest common subsequence (semantic coherence)
        """
        scores = self.rouge_scorer.score(reference, generated)

        return {
            'rouge1_f1': scores['rouge1'].fmeasure,
            'rouge2_f1': scores['rouge2'].fmeasure,
            'rougeL_f1': scores['rougeL'].fmeasure,
       

In [None]:
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu
from bert_score import score as bert_score_fn
import textstat

refs = [row["output"] for row in train_data]
preds = small_df["student_output"].tolist()

rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
r1, rL = [], []
for pred, ref in zip(preds, refs):
    scores = rouge.score(str(pred), str(ref))
    r1.append(scores["rouge1"].fmeasure)
    rL.append(scores["rougeL"].fmeasure)
print("ROUGE-1 avg:", sum(r1)/len(r1))
print("ROUGE-L avg:", sum(rL)/len(rL))
bleu = corpus_bleu(preds, [refs])
print("BLEU:", bleu.score)
P, R, F1 = bert_score_fn(preds, refs, lang="en")
print("BERTScore F1 avg:", F1.mean().item())
read_ease = [textstat.flesch_reading_ease(txt) for txt in preds]
fk_grade = [textstat.flesch_kincaid_grade(txt) for txt in preds]
print("Avg Flesch Ease:", sum(read_ease)/len(read_ease))
print("Avg FK Grade:", sum(fk_grade)/len(fk_grade))


In [None]:
import bitsandbytes as bnb
print(bnb.__version__)