In [1]:
import markovify
import numpy as np

In [2]:
# Read the whole corpus into memory. The encoding is pinned so that the decoded
# text does not depend on the platform's default locale encoding.
with open("arxivData.txt", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Peek at the first 100 characters of the loaded corpus.
text[:100]

'Title: Dual Recurrent Attention Units for Visual Question Answering\nSummary: We propose an architect'

In [16]:
def gen_sentences(n, num_sentences=5, tries=1000):
    """Sample sentences from a Markov chain built over the global ``text``.

    A fresh ``markovify.Text`` model with ``state_size=n`` is built on every call.

    Args:
        n: state size passed to ``markovify.Text``.
        num_sentences: how many sentences to attempt to generate (default 5).
        tries: attempts allowed per sentence, forwarded to ``make_sentence``
            (default 1000).

    Returns:
        list[str]: the generated sentences. ``make_sentence`` returns ``None``
        when it fails within ``tries`` attempts; those results are dropped, so
        the list may hold fewer than ``num_sentences`` items.
    """
    model = markovify.Text(text, state_size=n)
    candidates = (model.make_sentence(tries=tries) for _ in range(num_sentences))
    # Drop failed generations so downstream code only ever sees real strings.
    return [sentence for sentence in candidates if sentence is not None]

def print_sentences(sentences):
    """Print every sentence under a numbered heading, followed by a dashed rule.

    Numbering starts at 1. Nothing is returned.
    """
    separator = '-' * 10
    for number, sentence in enumerate(sentences, start=1):
        print("Sentence", number)
        print(sentence)
        print(separator)



In [17]:
# Generate and display sentences from a chain with state size 1.
sen1 = gen_sentences(1)
print_sentences(sen1)

Sentence 1
In addition, it is to zero mean algorithms, to be.
----------
Sentence 2
Thereafter, UML class of learning.
----------
Sentence 3
Simulation results was tested in a semi-supervised setting: DeCA.
----------
Sentence 4
We further improvement this paper as Neuro-Fuzzy Techniques for noise by the era of the simplification is to accelerate ML estimator performs particularly important during manipulation.
----------
Sentence 5
The non-finite-state copying aspect of training data representation languages.
----------


In [18]:
# Generate and display sentences from a chain with state size 2.
sen2 = gen_sentences(2)
print_sentences(sen2)

Sentence 1
Among other things, with these two popular classifiers are usually significantly lower.
----------
Sentence 2
A smaller version of the proposed algorithm to deduce the underlying algorithm is experimentally verified.
----------
Sentence 3
Title: Anytime Induction of Word Embeddings By Word Sense Disambiguation Summary: This document describes the new model layer, OpenMax, which estimates the appropriate lexical usage we combine the inherent drawback that the discrete-continuous models outperform each single task.
----------
Sentence 4
Furthermore, we show that our technique for clustering data which is related to a frequently used indicator of SAT 2011, Glucose improved by optimizing the target dataset is feeded with secondary voltages to the inherent uncertainty and vagueness will yield a robust modification of all astronomical archives available worldwide, as well as automatically illustrating recipes with keyframes, and searching are implemented.
----------
Sentence 5
The

In [19]:
# Generate and display sentences from a chain with state size 3.
sen3 = gen_sentences(3)
print_sentences(sen3)

Sentence 1
Among other results, we show that the difference between the probability distributions are not known beforehand, or so easily delineated, however.
----------
Sentence 2
Genotypes are strings of tokens of a given set of music clips.
----------
Sentence 3
When working with massive data, we show how to use and adapt PonyGE2 have been developed.
----------
Sentence 4
Finally, we evaluate our conversion process by using the learned hidden sentence representations.
----------
Sentence 5
Title: Multi-Instance Visual-Semantic Embedding Summary: Visual-semantic embedding models have been found, including caching, higher-order n-grams, skipping, interpolated Kneser-Ney smoothing, and clustering.
----------


In [22]:
# Generate and display sentences from a chain with state size 5.
sen5 = gen_sentences(5)
print_sentences(sen5)

Sentence 1
Third, to reduce the synchronization cost, we terminate the process of finding an good ESN for a specific dataset quite hard.
----------
Sentence 2
The proposed algorithms are amenable to parallelization, scale linearly in the size of output space in the worst-case.
----------
Sentence 3
By open-sourcing our framework, we hope to stimulate progress in the field of semantic segmentation and object detection.
----------
Sentence 4
Title: Kernels for sequentially ordered data Summary: We present a nonparametric Bayesian method for exploratory data analysis and feature construction in continuous time series.
----------
Sentence 5
An extensive empirical analysis shows that the proposed method is able to provide comparative performance for audio emotion recognition.
----------


In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Pretrained causal language model (and its tokenizer) used as the scorer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Sentence to score, encoded to a PyTorch tensor of token ids
sentence = "This is an example sentence."
input_ids = tokenizer.encode(sentence, return_tensors="pt")

# Supplying the inputs as their own labels makes the model report a loss;
# no gradients are needed for scoring
with torch.no_grad():
    outputs = model(input_ids, labels=input_ids)
loss = outputs.loss

# Perplexity is the exponential of that loss
perplexity = loss.exp().item()
print(f"Perplexity: {perplexity}")

Perplexity: 147.5555877685547


In [21]:


def average_perplexity(sentences):
    """Return the mean per-sentence perplexity of ``sentences``.

    Each sentence is encoded with the global ``tokenizer`` and scored by the
    global ``model`` using the token ids as their own labels; the exponential
    of the returned loss is that sentence's perplexity. The result is the plain
    arithmetic mean of those per-sentence perplexities (not the exponential of
    the mean loss).

    Entries that cannot be scored are skipped instead of aborting the run:
    ``None`` (markovify's ``make_sentence`` returns ``None`` on failure) and
    texts that encode to fewer than two tokens, since a next-token loss needs
    at least one target token.

    Args:
        sentences: iterable of strings; ``None`` entries are tolerated.

    Returns:
        float: mean perplexity over the scored sentences, or ``nan`` when no
        sentence could be scored (including empty input).
    """
    perplexities = []
    for s in sentences:
        if s is None:
            continue
        input_ids = tokenizer.encode(s, return_tensors="pt")
        # The last dimension is the sequence length; too-short inputs give no usable loss.
        if input_ids.shape[-1] < 2:
            continue
        with torch.no_grad():
            loss = model(input_ids, labels=input_ids).loss
        perplexities.append(torch.exp(loss).item())
    if not perplexities:
        return float("nan")
    return sum(perplexities) / len(perplexities)

# Report the mean perplexity for each previously generated batch, in order of state size.
for order, generated in ((1, sen1), (2, sen2), (3, sen3)):
    print(f"Average perplexity for {order}-gram sentences:", average_perplexity(generated))

Average perplexity for 1-gram sentences: 599.1120056152344
Average perplexity for 2-gram sentences: 279.3132843017578
Average perplexity for 3-gram sentences: 226.4117202758789


In [23]:
# Report the mean perplexity for the sentences generated with state size 5.
print("Average perplexity for 5-gram sentences:", average_perplexity(sen5))

Average perplexity for 5-gram sentences: 138.5129737854004
