<a href="https://colab.research.google.com/github/Savoxism/AI-Text-Detection-Model/blob/main/compute_perplexity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install huggingface_hub
!pip install safetensors
!pip install tokenizers sentencepiece sacremoses importlib_metadata

In [None]:
# Import libraries
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# Import the causal language model and tokenizer loader classes
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Define the model and tokenizer.
# Both are loaded through transformers so they come from the same 'gpt2'
# checkpoint; this avoids cloning and executing the GitHub hubconf via
# torch.hub just to obtain the tokenizer.
model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Two sample passages to score; each is written one sentence per line and
# joined by implicit string concatenation.
generated_text = (
    "The sun was shining brightly in the sky. "
    "It was a beautiful day to go for a walk. "
    "I put on my shoes and grabbed my phone. "
    "I decided to listen to some music while I walked. "
    "I opened Spotify and searched for my favorite playlist. "
    "I pressed play and put on my headphones. "
    "I felt a surge of happiness as the first song started playing. "
    "It was one of my favorite songs ever. "
    "I smiled and started walking."
)
human_text = (
    "I love sunny days. "
    "They make me feel so happy and energetic. "
    "I decided to take advantage of the nice weather and go for a walk. "
    "I put on my sneakers and took my phone with me. "
    "I wanted to listen to some music while I enjoyed the fresh air. "
    "I opened Spotify and looked for a playlist that matched my mood. "
    "I found one that had a lot of upbeat songs. "
    "I hit play and put on my earbuds. "
    "The first song that came on was perfect. "
    "It was one of those songs that always makes me smile. "
    "I started walking with a spring in my step."
)

# Define a function to calculate perplexity
def perplexity(text, model, tokenizer):
    """Return the perplexity of ``text`` under a causal language model.

    Perplexity is ``exp`` of the mean negative log-likelihood the model
    assigns to each token that actually occurs, given the tokens before it.
    The previous implementation exponentiated the mean *entropy* of the
    predicted distributions instead, which never looks at the observed next
    token and therefore does not measure how well the model predicts the text.

    The whole text is scored in a single forward pass; it is not split into
    windows, so the caller must keep it within what ``model`` accepts.

    Args:
        text: The string to score.
        model: Callable that takes the token-id tensor produced by
            ``tokenizer`` and returns an object whose ``logits`` attribute is
            indexed as ``(batch, position, vocabulary)``.
        tokenizer: Object whose ``encode(text, return_tensors="pt")`` returns
            a ``(1, sequence_length)`` tensor of token ids.

    Returns:
        The perplexity as a Python ``float``.

    Raises:
        ValueError: If ``text`` encodes to fewer than two tokens, in which
            case there is no next token to predict and perplexity is
            undefined.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")
    if input_ids.shape[-1] < 2:
        raise ValueError(
            "perplexity needs at least two tokens; "
            f"got {input_ids.shape[-1]}"
        )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_ids).logits

    # The logits at position t are the prediction for token t + 1, so drop
    # the last prediction (nothing to compare it to) and the first token
    # (nothing predicted it).
    predictions = logits[:, :-1, :]
    targets = input_ids[:, 1:]

    # cross_entropy applies log_softmax and averages the negative
    # log-likelihood of the target tokens over all positions.
    mean_nll = F.cross_entropy(
        predictions.reshape(-1, predictions.size(-1)),
        targets.reshape(-1),
    )
    return math.exp(mean_nll.item())

# Score both sample passages with the same model and tokenizer, in order:
# generated_text first, then human_text.
generated_text_perplexity, human_text_perplexity = [
    perplexity(sample, model, tokenizer)
    for sample in (generated_text, human_text)
]

# Report the two scores.
print(f"Generated text perplexity: {generated_text_perplexity}")
print(f"Human text perplexity: {human_text_perplexity}")

In [None]:
# Extra passage to score, written one sentence per line and joined by
# implicit string concatenation.
input_text = (
    "In addition, in 2015, the number of total visitors came to approximately about 2.75 million and the number of people who chose not to go on the island exceeded their island counterparts. "
    "In the following years, there was not a significant change in the figure of total visitors; however, the number of people who chose to stay on ships was slightly higher than on the island with the number of 1.5 million. "
    "Additionally, both figures saw a sharp increase to 2 million and 1.5 million which are also the highest rate of the two numbers in the period."
)

In [None]:
# Score the extra passage with the same model and tokenizer, then report it.
input_text_perplexity = perplexity(
    text=input_text,
    model=model,
    tokenizer=tokenizer,
)
print(f"Input text perplexity: {input_text_perplexity}")