In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parthabhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Using BERT 

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, logging
import torch
import numpy as np
import json
import csv
from datetime import datetime
import re
from nltk.corpus import stopwords

logging.set_verbosity_error()

def initialize_model():
    """Initialize BERT model and tokenizer with proper configuration"""
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForMaskedLM.from_pretrained(
        model_name,
        output_attentions=False,
        output_hidden_states=False
    )

    model.eval()
    return tokenizer, model

def preprocess_text(paragraph):
    stop_words = set(stopwords.words('english'))
    words = re.findall(r'\w+|[^\w\s]', paragraph.lower()) 
    return words, stop_words

def analyze_text_authenticity(paragraph, num_red_tokens=10):
    tokenizer, model = initialize_model()

    words, stop_words = preprocess_text(paragraph)

    if num_red_tokens > len(words):
        num_red_tokens = len(words)

    word_indices = [i for i, word in enumerate(words) if re.match(r'\w+', word) and word not in stop_words]
    mask_indices = np.random.choice(word_indices, num_red_tokens, replace=False)

    marked_words = words.copy()
    for idx in mask_indices:
        marked_words[idx] = '[MASK]'

    marked_text = ' '.join(marked_words)

    inputs = tokenizer.encode_plus(
        marked_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        mask_positions = inputs['input_ids'][0] == tokenizer.mask_token_id

        predictions = []
        for pos in torch.where(mask_positions)[0]:
            probs = torch.softmax(logits[0, pos], dim=-1)
            top_probs, top_tokens = torch.topk(probs, k=5)
            original_word = words[mask_indices[len(predictions)]]
            predicted_word = tokenizer.convert_ids_to_tokens(top_tokens[0].item())

            top_5_predictions = [
                {'token': tokenizer.convert_ids_to_tokens(top_tokens[i].item()), 'probability': top_probs[i].item()}
                for i in range(5)
            ]

            predictions.append({
                'original': original_word,
                'predicted': predicted_word,
                'probability': top_probs[0].item(),
                'is_match': original_word == predicted_word,
                'top_5_predictions': top_5_predictions
            })

    correct_predictions = sum(1 for p in predictions if p['is_match'])
    total_predictions = len(predictions)
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

    is_llm_generated = accuracy > 0.5

    results = {
        'predictions': predictions,
        'accuracy': accuracy,
        'is_llm_generated': is_llm_generated,
        'marked_text': marked_text,
        'original_text': ' '.join(words)
    }

    save_results(results)
    return results

def save_results(results):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    json_filename = f'text_analysis_results_{timestamp}.json'
    with open(json_filename, 'w') as f:
        json.dump(results, f, indent=2)

    csv_filename = f'predicted_words_{timestamp}.csv'
    with open(csv_filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Original Word', 'Predicted Word', 'Probability', 'Is Match', 
                         'Top 5 Predictions'])

        for pred in results['predictions']:
            top_5_str = ' | '.join([f"{p['token']}({p['probability']:.3f})" 
                                     for p in pred['top_5_predictions']])
            writer.writerow([
                pred['original'],
                pred['predicted'],
                f"{pred['probability']:.3f}",
                pred['is_match'],
                top_5_str
            ])

if __name__ == "__main__":
    sample_paragraph = """In the quiet solitude of the evening, as the last rays of the sun dipped below the horizon, the world seemed to slow, giving way to a sense of peace that had been absent throughout the day. The soft rustling of leaves in the breeze and the distant chirping of crickets were the only sounds that filled the air, creating a symphony of tranquility. It was in these fleeting moments, between the fading light and the onset of darkness, that one could feel a deep connection to the rhythm of nature, reminding them of the simple beauty that exists when the world pauses to breathe."""

    results = analyze_text_authenticity(sample_paragraph)
    print(f"\nParagraph with masked words:\n{results['marked_text']}")
    print(f"\nIs LLM generated: {results['is_llm_generated']}")
    print(f"Accuracy: {results['accuracy']:.2f}")
    print("\nPredictions saved to files:")
    print("- Detailed JSON report: text_analysis_results_[timestamp].json")
    print("- Predicted words CSV: predicted_words_[timestamp].csv")


## Using GPT2

In [8]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import re
from datetime import datetime
import json
import csv
from nltk.corpus import stopwords
import nltk

# Ensure that stopwords are downloaded
nltk.download('stopwords')

def initialize_model():
    """Initialize GPT-2 model and tokenizer with proper configuration"""
    model_name = "gpt2"
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.eval()
    return tokenizer, model

def preprocess_text(paragraph):
    """Preprocess text by removing stop words and tokenizing the text"""
    stop_words = set(stopwords.words('english'))
    words = re.findall(r'\w+|[^\w\s]', paragraph.lower()) 
    return words, stop_words

def analyze_text_authenticity(paragraph):
    """Analyze text by predicting next word and determining authenticity"""
    tokenizer, model = initialize_model()

    words, stop_words = preprocess_text(paragraph)

    # Tokenize the original text for input
    inputs = tokenizer(paragraph, return_tensors="pt", padding=True, truncation=True, max_length=512)

    marked_words = words.copy()

    # Tokenize and predict the next word for each word in the paragraph
    predictions = []
    for i in range(len(words)):
        # Create input text with the first i words
        context_text = ' '.join(words[:i])
        
        # Tokenize the context text
        context_input = tokenizer(context_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        
        # Check if the input is not empty
        if context_input['input_ids'].size(1) == 0:
            print(f"Skipping empty tokenization for context: '{context_text}'")
            continue
        
        context_input_ids = context_input['input_ids'].squeeze(0).unsqueeze(0)  # Add batch dimension

        with torch.no_grad():
            # Get model predictions for the next word
            outputs = model(context_input_ids)
            logits = outputs.logits

            # Get the top 5 predicted next words (topk)
            topk_values, topk_indices = torch.topk(logits[0, -1], k=5, dim=-1)

            top_5_predictions = []
            for idx, val in zip(topk_indices, topk_values):
                predicted_word = tokenizer.decode(idx)
                probability = torch.softmax(val, dim=-1).item()
                top_5_predictions.append({
                    'token': predicted_word,
                    'probability': probability
                })

            original_word = words[i]
            predicted_word = tokenizer.decode(topk_indices[0])  # The top prediction
            predictions.append({
                'original': original_word,
                'predicted': predicted_word,
                'is_match': original_word == predicted_word,
                'top_5_predictions': top_5_predictions
            })

    # Calculate accuracy
    correct_predictions = sum(1 for p in predictions if p['is_match'])
    total_predictions = len(predictions)
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

    is_llm_generated = accuracy > 0.5  # Determine if the text is likely generated by a language model

    results = {
        'predictions': predictions,
        'accuracy': accuracy,
        'is_llm_generated': is_llm_generated,
        'marked_text': ' '.join(marked_words),
        'original_text': paragraph
    }

    save_results(results)
    return results

def save_results(results):
    """Save results to JSON and CSV files"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save results to a JSON file
    json_filename = f'text_analysis_results_{timestamp}.json'
    with open(json_filename, 'w') as f:
        json.dump(results, f, indent=2)

    # Save predictions to a CSV file
    csv_filename = f'predicted_words_{timestamp}.csv'
    with open(csv_filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Original Word', 'Predicted Word', 'Is Match', 'Top 5 Predictions'])

        for pred in results['predictions']:
            top_5_str = ' | '.join([f"{p['token']}({p['probability']:.4f})" 
                                     for p in pred['top_5_predictions']])
            writer.writerow([
                pred['original'],
                pred['predicted'],
                pred['is_match'],
                top_5_str
            ])

if __name__ == "__main__":
    # Sample paragraph for testing
    sample_paragraph = """In the quiet solitude of the evening, as the last rays of the sun dipped below the horizon, the world seemed to slow, giving way to a sense of peace that had been absent throughout the day. The soft rustling of leaves in the breeze and the distant chirping of crickets were the only sounds that filled the air, creating a symphony of tranquility. It was in these fleeting moments, between the fading light and the onset of darkness, that one could feel a deep connection to the rhythm of nature, reminding them of the simple beauty that exists when the world pauses to breathe."""

    # Analyze text authenticity
    results = analyze_text_authenticity(sample_paragraph)
    print(f"\nParagraph with masked words:\n{results['marked_text']}")
    print(f"\nIs LLM generated: {results['is_llm_generated']}")
    print(f"Accuracy: {results['accuracy']:.2f}")
    print("\nPredictions saved to files:")
    print("- Detailed JSON report: text_analysis_results_[timestamp].json")
    print("- Predicted words CSV: predicted_words_[timestamp].csv")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parthabhang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Skipping empty tokenization for context: ''

Paragraph with masked words:
in the quiet solitude of the evening , as the last rays of the sun dipped below the horizon , the world seemed to slow , giving way to a sense of peace that had been absent throughout the day . the soft rustling of leaves in the breeze and the distant chirping of crickets were the only sounds that filled the air , creating a symphony of tranquility . it was in these fleeting moments , between the fading light and the onset of darkness , that one could feel a deep connection to the rhythm of nature , reminding them of the simple beauty that exists when the world pauses to breathe .

Is LLM generated: False
Accuracy: 0.01

Predictions saved to files:
- Detailed JSON report: text_analysis_results_[timestamp].json
- Predicted words CSV: predicted_words_[timestamp].csv
