# In [2]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
from transformers import AutoTokenizer, AutoModelForMaskedLM, logging
import torch
import numpy as np
import json
import csv
from datetime import datetime


# Restrict the transformers library's logger to error-level messages so that
# informational/warning output emitted while loading the model is not printed.
logging.set_verbosity_error()

def initialize_model():
    """Load the pretrained ``bert-base-uncased`` tokenizer and masked-LM model.

    The model is created with attention maps and hidden states disabled in
    its outputs and is switched to evaluation mode before being returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "bert-base-uncased"

    bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Neither attention maps nor hidden states are requested from the model.
    output_options = {
        "output_attentions": False,
        "output_hidden_states": False,
    }
    masked_lm = AutoModelForMaskedLM.from_pretrained(checkpoint, **output_options)

    # Evaluation (inference) mode.
    masked_lm.eval()

    return bert_tokenizer, masked_lm

def analyze_text_authenticity(paragraph, num_red_tokens=10):
    """Score a paragraph by how well BERT recovers randomly masked tokens.

    The paragraph is tokenized, ``num_red_tokens`` token positions are chosen
    at random and replaced by the mask token, and the masked-LM is asked to
    fill them in. The fraction of masks whose top-1 prediction equals the
    original token is the ``accuracy``; the text is flagged as LLM-generated
    when that accuracy is strictly greater than 0.5.

    The results are also written to disk through ``save_results``.

    Args:
        paragraph: Text to analyze.
        num_red_tokens: Number of tokens to mask. If it exceeds the number of
            maskable tokens, every maskable token is masked. A non-positive
            value masks nothing.

    Returns:
        A dict with the keys ``predictions`` (one entry per masked token with
        ``original``, ``predicted``, ``probability``, ``is_match`` and
        ``top_5_predictions``), ``accuracy``, ``is_llm_generated``,
        ``marked_text`` and ``original_text``. ``accuracy`` is ``0.0`` when
        nothing could be masked.
    """
    tokenizer, model = initialize_model()

    max_length = 512
    tokens = tokenizer.tokenize(paragraph)

    # Only tokens that fit in the model input (after reserving room for the
    # special tokens) may be masked; a mask placed beyond that point would be
    # cut off and never predicted.
    room = max_length - tokenizer.num_special_tokens_to_add(pair=False)
    usable = max(0, min(len(tokens), room))
    num_to_mask = min(num_red_tokens, usable)

    if num_to_mask > 0:
        chosen = np.random.choice(usable, num_to_mask, replace=False)
        # Sort so the i-th mask position found in the input (left to right)
        # corresponds to the i-th chosen index. Unsorted indices pair each
        # prediction with the wrong original token.
        red_indices = sorted(int(i) for i in chosen)
    else:
        red_indices = []

    # Mark tokens and create the masked version
    marked_tokens = tokens.copy()
    for idx in red_indices:
        marked_tokens[idx] = tokenizer.mask_token

    # Build the input ids straight from the token list. Joining the tokens
    # with spaces and re-tokenizing would not reproduce the same sequence
    # (e.g. "##" continuation pieces get split apart).
    token_ids = tokenizer.convert_tokens_to_ids(marked_tokens[:usable])
    input_ids = torch.tensor(
        [tokenizer.build_inputs_with_special_tokens(token_ids)]
    )
    attention_mask = torch.ones_like(input_ids)

    predictions = []
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        mask_positions = torch.where(input_ids[0] == tokenizer.mask_token_id)[0]

        for token_idx, pos in zip(red_indices, mask_positions):
            probs = torch.softmax(logits[0, pos], dim=-1)
            top_probs, top_tokens = torch.topk(probs, k=5)

            top_5_predictions = [
                {
                    'token': tokenizer.convert_ids_to_tokens(token_id.item()),
                    'probability': prob.item(),
                }
                for token_id, prob in zip(top_tokens, top_probs)
            ]

            original_token = tokens[token_idx]
            predicted_token = top_5_predictions[0]['token']

            predictions.append({
                'original': original_token,
                'predicted': predicted_token,
                'probability': top_5_predictions[0]['probability'],
                'is_match': original_token == predicted_token,
                'top_5_predictions': top_5_predictions
            })

    correct_predictions = sum(1 for p in predictions if p['is_match'])
    total_predictions = len(predictions)
    # Guard against empty input / nothing masked.
    accuracy = correct_predictions / total_predictions if total_predictions else 0.0

    # Heuristic threshold: more than half of the masks recovered exactly.
    is_llm_generated = accuracy > 0.5

    results = {
        'predictions': predictions,
        'accuracy': accuracy,
        'is_llm_generated': is_llm_generated,
        'marked_text': ' '.join(marked_tokens),
        'original_text': paragraph
    }

    save_results(results)

    return results

def save_results(results):
    """Write analysis results to a timestamped JSON report and CSV summary.

    Two files are created in the current working directory, both named with
    the current local time (``YYYYmmdd_HHMMSS``):

    * ``text_analysis_results_<timestamp>.json`` - the full ``results`` dict.
    * ``predicted_words_<timestamp>.csv`` - one row per entry in
      ``results['predictions']``.

    Both files are written as UTF-8 so that non-ASCII tokens do not depend on
    the platform's default encoding.

    Args:
        results: Dict containing a ``'predictions'`` list. Each prediction
            must provide ``original``, ``predicted``, ``probability``,
            ``is_match`` and ``top_5_predictions`` (a list of dicts with
            ``token`` and ``probability``). The whole dict must be
            JSON-serializable.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    json_filename = f'text_analysis_results_{timestamp}.json'
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    csv_filename = f'predicted_words_{timestamp}.csv'
    # newline='' is what the csv module expects for files it writes to.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Original Word', 'Predicted Word', 'Probability', 'Is Match',
                         'Top 5 Predictions'])

        for pred in results['predictions']:
            # Flatten the top-5 candidates into a single "token(prob) | ..." cell.
            top_5_str = ' | '.join(
                f"{p['token']}({p['probability']:.3f})"
                for p in pred['top_5_predictions']
            )
            writer.writerow([
                pred['original'],
                pred['predicted'],
                f"{pred['probability']:.3f}",
                pred['is_match'],
                top_5_str
            ])


if __name__ == "__main__":
    # Demo: analyze a fixed sample paragraph and print a short summary.
    sample_paragraph = """A paragraph is defined as “a group of sentences or a single sentence that forms a unit” (Lunsford and Connors 116). Length and appearance do not determine whether a section in a paper is a paragraph. For instance, in some styles of writing, particularly journalistic styles, a paragraph can be just one sentence long."""

    results = analyze_text_authenticity(sample_paragraph)

    print(f"Is LLM generated: {results['is_llm_generated']}")
    print(f"Accuracy: {results['accuracy']:.2f}")

    # The file names below are placeholders; the actual timestamp is chosen
    # when the results are saved.
    for line in (
        "\nPredictions saved to files:",
        "- Detailed JSON report: text_analysis_results_[timestamp].json",
        "- Predicted words CSV: predicted_words_[timestamp].csv",
    ):
        print(line)

# Notebook output: KeyboardInterrupt