https://huggingface.co/openai-community/gpt2

In [None]:
# Tokenization

from transformers import AutoTokenizer
# To use tokenizers, we import them from the transformers library

# There are many available, use the ID of the model you want to use
# Qwen "Qwen/Qwen2-0.5B"
# GPT-2 "openai-community/gpt2"
# SmolLM "HuggingFaceTB/SmolLM-135M"

# Load the tokenizer published under the GPT-2 model ID; swap in one of the
# IDs listed above to try a different model's tokenizer.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

# Source text: 15 lines of verse, one per text line, with no trailing newline.
# Later cells split this on '\n' and work on each line separately.
prompt = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Strip the final word from every line of the prompt.
# `str.split()` with no arguments splits on any whitespace run, so the kept
# words are re-joined with single spaces. A blank (or one-word) line yields an
# empty list after slicing, which joins to '' -- so blank lines stay blank
# without needing a separate branch.
lines = prompt.split('\n')
modified_lines = [' '.join(line.split()[:-1]) for line in lines]

# Reassemble the truncated lines into a single newline-separated string.
modified_prompt = '\n'.join(modified_lines)
print(modified_prompt)

One must have a mind of
To regard the frost and the
Of the pine-trees crusted with
And have been cold a long
To behold the junipers shagged with
The spruces rough in the distant
Of the January sun; and not to
Of any misery in the sound of the
In the sound of a few
Which is the sound of the
Full of the same
That is blowing in the same bare
For the listener, who listens in the
And, nothing himself,
Nothing that is not there and the nothing that


In [18]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

# How many candidate next tokens to collect for every line. The same value is
# used as the beam width and as the number of returned sequences.
num_words_predict = 50


def predict_next_tokens(text, num_candidates):
    """Return `num_candidates` decoded one-token continuations of `text`.

    Uses the notebook-level `tokenizer` and `model`. Beam search is run for a
    single new token with `num_candidates` beams, and every beam is returned.

    Args:
        text: Non-empty prompt string to continue.
        num_candidates: Beam width and number of continuations to return.

    Returns:
        A list of `num_candidates` strings, each the decoded new token with
        surrounding whitespace stripped.
    """
    inputs = tokenizer(text, return_tensors="pt")
    prompt_length = inputs["input_ids"].shape[-1]
    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly: pad_token_id is set to the EOS id below,
        # so generate() cannot infer the mask from the input ids and warns.
        attention_mask=inputs["attention_mask"],
        max_new_tokens=1,
        num_beams=num_candidates,
        num_return_sequences=num_candidates,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; silences the warning
    )
    # Each returned sequence starts with the prompt ids; keep only what was generated.
    # NOTE(review): .strip() removes the leading space of the decoded token, so
    # sub-word fragments (e.g. "sp", "t") can no longer be told apart from
    # whole words downstream -- confirm this is intended.
    return [
        tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True).strip()
        for sequence in outputs
    ]


predicted_next_sequences = []
for line in modified_lines:
    if line.strip():
        predicted_next_sequences.append(predict_next_tokens(line, num_words_predict))
    else:
        # Placeholder (a list holding one empty string) so indices stay
        # aligned with modified_lines.
        predicted_next_sequences.append([''])

print("Predicted next sequences for each line:")
for line_number, (line, options) in enumerate(zip(modified_lines, predicted_next_sequences), start=1):
    stripped_line = line.strip()
    if not stripped_line:
        print(f"Line {line_number}: (empty line)")
        continue
    print(f"Line {line_number}: {stripped_line} -->")
    for option_number, option in enumerate(options, start=1):
        print(f"  Option {option_number}: {option}")

Predicted next sequences for each line:
Line 1: One must have a mind of -->
  Option 1: their
  Option 2: its
  Option 3: his
  Option 4: your
  Option 5: a
  Option 6: our
  Option 7: her
  Option 8: this
  Option 9: my
  Option 10: one
  Option 11: the
  Option 12: balance
  Option 13: "
  Option 14: what
  Option 15: it
  Option 16: some
  Option 17: '
  Option 18: good
  Option 19: how
  Option 20: steel
  Option 21: something
  Option 22: self
  Option 23: order
  Option 24: thy
  Option 25: humility
  Option 26: gold
  Option 27: mystery
  Option 28: an
  Option 29: mine
  Option 30: that
  Option 31: purpose
  Option 32: these
  Option 33: deep
  Option 34: wonder
  Option 35: yours
  Option 36: justice
  Option 37: history
  Option 38: theirs
  Option 39: pure
  Option 40: consistency
  Option 41: right
  Option 42: great
  Option 43: unity
  Option 44: high
  Option 45: awe
  Option 46: humor
  Option 47: integrity
  Option 48: common
  Option 49: humour
  Option 50: power
Lin

In [19]:
# Append one predicted word to each truncated line. The word taken is the one
# at rank `num_words_predict` (1-based), i.e. the last candidate collected for
# that line, not a fixed position.
combined_sentences = []
for line, options in zip(modified_lines, predicted_next_sequences):
    if not line.strip():
        combined_sentences.append('')  # Keep empty lines as empty
    elif 1 <= num_words_predict <= len(options):
        chosen_word = options[num_words_predict - 1]
        combined_sentences.append(f"{line} {chosen_word}")
    else:
        # The requested rank is not available for this line (too few
        # candidates were produced): keep the line unchanged rather than
        # raising IndexError.
        combined_sentences.append(line)

# The header reports the rank actually used instead of a hard-coded ordinal.
print(f"Combined Sentences (Line + Predicted Word #{num_words_predict}):")
for line_number, sentence in enumerate(combined_sentences, start=1):
    if sentence.strip():
        print(f"Line {line_number}: {sentence}")
    else:
        print(f"Line {line_number}: (empty line)")

Combined Sentences (Line + 7th Predicted Word):
Line 1: One must have a mind of power
Line 2: To regard the frost and the danger
Line 3: Of the pine-trees crusted with sp
Line 4: And have been cold a long career
Line 5: To behold the junipers shagged with golden
Line 6: The spruces rough in the distant east
Line 7: Of the January sun; and not to set
Line 8: Of any misery in the sound of the whip
Line 9: In the sound of a few t
Line 10: Which is the sound of the first
Line 11: Full of the same !
Line 12: That is blowing in the same bare pockets
Line 13: For the listener, who listens in the presence
Line 14: And, nothing himself, with
Line 15: Nothing that is not there and the nothing that came


In [20]:
# Single source of truth for the destination, so the file written and the
# confirmation message can never disagree.
output_path = 'combined_sentences.txt'

output_string = '\n'.join(combined_sentences)

# Write with an explicit encoding: model-generated tokens are not guaranteed
# to be ASCII, and the platform default encoding may be unable to encode them.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(output_string)

print(f"Combined sentences saved to '{output_path}'")

Combined sentences saved to 'combined_sentences2.txt'
