In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parthabhang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, logging
import torch
import numpy as np
import json
import csv
from datetime import datetime
import re
from nltk.corpus import stopwords

logging.set_verbosity_error()

def initialize_model():
    """Initialize BERT model and tokenizer with proper configuration"""
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForMaskedLM.from_pretrained(
        model_name,
        output_attentions=False,
        output_hidden_states=False
    )

    model.eval()
    return tokenizer, model

def preprocess_text(paragraph):
    stop_words = set(stopwords.words('english'))
    words = re.findall(r'\w+|[^\w\s]', paragraph.lower()) 
    return words, stop_words

def analyze_text_authenticity(paragraph, num_red_tokens=10):
    tokenizer, model = initialize_model()

    words, stop_words = preprocess_text(paragraph)

    if num_red_tokens > len(words):
        num_red_tokens = len(words)

    word_indices = [i for i, word in enumerate(words) if re.match(r'\w+', word) and word not in stop_words]
    mask_indices = np.random.choice(word_indices, num_red_tokens, replace=False)

    marked_words = words.copy()
    for idx in mask_indices:
        marked_words[idx] = '[MASK]'

    marked_text = ' '.join(marked_words)

    inputs = tokenizer.encode_plus(
        marked_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        mask_positions = inputs['input_ids'][0] == tokenizer.mask_token_id

        predictions = []
        for pos in torch.where(mask_positions)[0]:
            probs = torch.softmax(logits[0, pos], dim=-1)
            top_probs, top_tokens = torch.topk(probs, k=5)
            original_word = words[mask_indices[len(predictions)]]
            predicted_word = tokenizer.convert_ids_to_tokens(top_tokens[0].item())

            top_5_predictions = [
                {'token': tokenizer.convert_ids_to_tokens(top_tokens[i].item()), 'probability': top_probs[i].item()}
                for i in range(5)
            ]

            predictions.append({
                'original': original_word,
                'predicted': predicted_word,
                'probability': top_probs[0].item(),
                'is_match': original_word == predicted_word,
                'top_5_predictions': top_5_predictions
            })

    correct_predictions = sum(1 for p in predictions if p['is_match'])
    total_predictions = len(predictions)
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

    is_llm_generated = accuracy > 0.5

    results = {
        'predictions': predictions,
        'accuracy': accuracy,
        'is_llm_generated': is_llm_generated,
        'marked_text': marked_text,
        'original_text': ' '.join(words)
    }

    save_results(results)
    return results

def save_results(results):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    json_filename = f'text_analysis_results_{timestamp}.json'
    with open(json_filename, 'w') as f:
        json.dump(results, f, indent=2)

    csv_filename = f'predicted_words_{timestamp}.csv'
    with open(csv_filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Original Word', 'Predicted Word', 'Probability', 'Is Match', 
                         'Top 5 Predictions'])

        for pred in results['predictions']:
            top_5_str = ' | '.join([f"{p['token']}({p['probability']:.3f})" 
                                     for p in pred['top_5_predictions']])
            writer.writerow([
                pred['original'],
                pred['predicted'],
                f"{pred['probability']:.3f}",
                pred['is_match'],
                top_5_str
            ])

if __name__ == "__main__":
    sample_paragraph = """My name is Parth. I am a boy. I love to watch football and formula 1. My favourite team is Real Madrid in football and Mercedes in Formula 1. I play at a center forward position in football. I love coding. I have built many projects. Some of them are House marketplace and Github finder along with IMDB sentiment analysis and Churn prediction. I am learning generative AI. I am searching for internships to work in during the summer holidays."""

    results = analyze_text_authenticity(sample_paragraph)
    print(f"\nParagraph with masked words:\n{results['marked_text']}")
    print(f"\nIs LLM generated: {results['is_llm_generated']}")
    print(f"Accuracy: {results['accuracy']:.2f}")
    print("\nPredictions saved to files:")
    print("- Detailed JSON report: text_analysis_results_[timestamp].json")
    print("- Predicted words CSV: predicted_words_[timestamp].csv")



Paragraph with masked words:
my name is parth . i am a boy . i love to [MASK] football and [MASK] 1 . my favourite team is [MASK] madrid in football and mercedes in [MASK] 1 . i play at a [MASK] [MASK] position in football . i love coding . i have built many projects . some of them are house marketplace and github finder [MASK] with [MASK] sentiment analysis and [MASK] prediction . i am learning [MASK] ai . i am searching for internships to work in during the summer holidays .

Is LLM generated: False
Accuracy: 0.00

Predictions saved to files:
- Detailed JSON report: text_analysis_results_[timestamp].json
- Predicted words CSV: predicted_words_[timestamp].csv
