# Starter Code from Gabriel Vigliensoni
# https://colab.research.google.com/drive/1W5oVlyy2TmvF9gms6hSP8NDslPNtnGVI?usp=sharing

In [5]:
# Importing AI Framework
# ALWAYS START WITH THIS CODE!
# Tokenization

from transformers import AutoTokenizer
# To use tokenizers, we import them from the transformers library

# GPT-2 will be used for this assignment
# GPT-2 "openai-community/gpt2"

# Fetch the tokenizer published alongside the GPT-2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

# Several variants built on the same stem, to inspect the ids the tokenizer
# assigns to each piece of the prompt.
prompt = "storm stormy storminess storm-proof"
input_ids = tokenizer(prompt)["input_ids"]
print(input_ids)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[12135, 6388, 88, 6388, 1272, 6388, 12, 13288]


# Creating the P+7 Code
# Code from Google Gemini

In [6]:
poem = """One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is."""

# Whitespace-delimited tokens of the whole poem; newlines act as separators
# just like spaces, so line boundaries are not preserved here.
words = poem.split()

# Final token of the poem. Splitting on whitespace only means any trailing
# punctuation stays attached to it.
last_word = words[-1]

print(f"The poem is:\n{poem}\n")
print(f"The last word of the poem is: '{last_word}'")

The poem is:
One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is.

The last word of the poem is: 'is.'


In [9]:
# Split the poem into lines
lines = poem.split('\n')

print("Last word of each line:")
for line_number, line in enumerate(lines, start=1):
    # split() with no arguments ignores leading/trailing whitespace and
    # returns [] for a blank or whitespace-only line, so a single emptiness
    # check covers every case (no separate strip() or nested branch needed).
    words_in_line = line.split()
    if words_in_line:
        print(f"Line {line_number}: '{words_in_line[-1]}'")
    else:
        print(f"Line {line_number}: (empty line)")

Last word of each line:
Line 1: 'winter'
Line 2: 'boughs'
Line 3: 'snow;'
Line 4: 'time'
Line 5: 'ice,'
Line 6: 'glitter'
Line 7: 'think'
Line 8: 'wind,'
Line 9: 'leaves,'
Line 10: 'land'
Line 11: 'wind'
Line 12: 'place'
Line 13: 'snow,'
Line 14: 'beholds'
Line 15: 'is.'


# Task
Load the pre-trained GPT-2 model (`AutoModelForCausalLM`) and define a Python function to predict the Nth next word for a given text using this model. Then, iterate through each line of the provided poem, replacing its last word with the 7th predicted word by GPT-2 based on the preceding text of that line. Finally, display the entire modified poem.

## Load GPT-2 Model

### Subtask:
Load the pre-trained GPT-2 model (`AutoModelForCausalLM`) from 'openai-community/gpt2' to enable next-word prediction.


**Reasoning**:
To load the GPT-2 model, I need to import the `AutoModelForCausalLM` class and then use its `from_pretrained` method to initialize the model.



In [10]:
from transformers import AutoModelForCausalLM

# Instantiate the causal-language-model variant of GPT-2 from the same
# checkpoint id that the tokenizer was loaded from.
model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2"
)
print("GPT-2 model loaded successfully.")

GPT-2 model loaded successfully.


## Define Prediction Function

### Subtask:
Create a Python function that takes a text string (representing a line of the poem without its last word) and an integer `n` (for the Nth word). This function will tokenize the input, pass it through the GPT-2 model, calculate probabilities for the next words, sort them, and return the `n`-th predicted word. This function will be used to get the 7th predicted word for each line.


**Reasoning**:
I need to define the `predict_nth_word` function as described in the subtask. This function will involve tokenizing text, getting model predictions, calculating probabilities, and decoding the Nth most probable word. I will import `torch` and `torch.nn.functional` for tensor operations and softmax calculation.



In [11]:
import torch
import torch.nn.functional as F

def predict_nth_word(text, n=7, k=20):
    """Return the model's n-th most likely next token for ``text``.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Prompt to continue. If it tokenizes to nothing (e.g. an empty
            string), the prompt is seeded with the tokenizer's BOS token so
            the model still has one position to predict from.
        n: 1-based rank of the candidate to return (1 = most likely).
        k: Minimum size of the candidate pool. It is widened to ``n`` when
            ``n > k`` and capped at the vocabulary size, so a valid ``n`` is
            never silently out of range.

    Returns:
        The decoded n-th candidate with surrounding whitespace stripped, or
        ``""`` when ``n`` is below 1, exceeds the vocabulary size, or an
        empty prompt cannot be seeded because no BOS token is defined.
        The result is a single tokenizer token, so it can be a word fragment
        or punctuation rather than a whole word.
    """
    if n < 1:
        return ""

    # Tokenize the prompt straight into a PyTorch tensor.
    input_ids = tokenizer.encode(text, return_tensors='pt')
    if input_ids.numel() == 0:
        # A zero-length sequence has no "last position" to read logits from.
        bos_id = getattr(tokenizer, "bos_token_id", None)
        if bos_id is None:
            return ""
        input_ids = torch.tensor([[bos_id]], dtype=torch.long)

    # Inference only: no gradients needed.
    with torch.no_grad():
        next_token_logits = model(input_ids).logits[0, -1, :]

    pool_size = min(max(k, n), next_token_logits.shape[-1])
    if n > pool_size:
        return ""

    # Softmax is monotonic, so ranking the raw logits gives the same order as
    # ranking probabilities; topk already returns candidates best-first.
    ranked_token_ids = torch.topk(next_token_logits, k=pool_size).indices

    # Only the requested candidate needs decoding. GPT-2 tokens often carry a
    # leading space, hence the strip().
    return tokenizer.decode(int(ranked_token_ids[n - 1])).strip()

print("The 'predict_nth_word' function has been defined.")

The 'predict_nth_word' function has been defined.


**Reasoning**:
Now that the `predict_nth_word` function is defined, I need to iterate through each line of the poem, identify the text preceding the last word, use the prediction function to get the 7th most probable next word, and then reconstruct the line with the predicted word. Finally, I will join all modified lines to display the complete modified poem.



In [12]:
modified_poem_lines = []
for line in lines:
    # split() with no arguments ignores surrounding whitespace and yields []
    # for a blank line, so this one check replaces the strip()/emptiness pair.
    words_in_line = line.split()
    if not words_in_line:
        modified_poem_lines.append("")  # Keep blank lines blank
        continue

    # Everything before the last word; this is "" for a one-word line, in
    # which case predict_nth_word receives an empty prompt.
    preceding_text = " ".join(words_in_line[:-1])

    # Predict the 7th next word using the defined function
    predicted_word = predict_nth_word(preceding_text, n=7)

    # Rebuild the line with the prediction in place of the original last word.
    # strip() drops the separator when either side is empty, so a one-word
    # line or an empty prediction leaves no stray leading/trailing space.
    new_line = f"{preceding_text} {predicted_word}".strip()

    modified_poem_lines.append(new_line)

# Join all the modified lines back into a single string to form the final modified poem
final_modified_poem = "\n".join(modified_poem_lines)

print("Original Poem:\n")
print(poem)
print("\n" + "="*30 + "\n")
print("Modified Poem (last words replaced with 7th predicted word):\n")
print(final_modified_poem)

Original Poem:

One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is.


Modified Poem (last words replaced with 7th predicted word):

One must have a mind of her
To regard the frost and the death
Of the pine-trees crusted with oil
And have been cold a long way
To behold the junipers shagged with white
The spruces rough in the distant horizon
Of the January sun; and not to have
Of any misery in the sound of the sound
In the sound of a few shots
Which is the sound of the voice
Full of the same day
That is blowin

## Final Task

### Subtask:
Summarize the process of replacing the last words with GPT-2's 7th predicted words and present the final modified poem.


## Summary:

### Data Analysis Key Findings

*   The GPT-2 model (`AutoModelForCausalLM` from 'openai-community/gpt2') was successfully loaded for next-word prediction.
*   A custom Python function, `predict_nth_word`, was defined to tokenize input text, pass it through the GPT-2 model, extract logits, calculate probabilities, and identify the `n`-th most probable next word from the top `k` predictions (defaulting to `n=7` and `k=20`).
*   The function correctly handled the extraction and decoding of words from token IDs, ensuring words were sorted by probability.
*   Each line of the provided poem was processed by:
    *   Splitting the line into words.
    *   Identifying the text preceding the last word.
    *   Using the `predict_nth_word` function to obtain the 7th predicted word based on the preceding text.
    *   Replacing the original last word with GPT-2's 7th predicted word.
*   Edge cases, such as empty lines or lines containing only a single word, were considered during the poem modification process to ensure robust handling.
*   The final modified poem was successfully generated and displayed, showcasing the replacements.

### Insights or Next Steps

*   This approach effectively demonstrates the application of a pre-trained language model for targeted text generation and creative content alteration, highlighting its capability to predict contextually relevant words.
*   Further exploration could involve experimenting with different values for `n` (e.g., the 1st or 3rd predicted word) to observe variations in the modified poem's style, or integrating other language models to compare their predictive outputs.


# Modifying the P+7 to P+30 (note: the code below actually uses n=64, i.e. P+64)
# Code from Google Gemini

In [13]:
from transformers import AutoModelForCausalLM

# Note: this repeats the model load performed in an earlier cell; running it
# rebinds `model` to a fresh instance of the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2"
)
print("GPT-2 model loaded successfully.")

GPT-2 model loaded successfully.


In [26]:
import torch
import torch.nn.functional as F

def predict_nth_word(text, n=64, k=74):
    """Return the model's n-th most likely next token for ``text``.

    Uses the notebook-level ``tokenizer`` and ``model`` objects. This
    definition replaces any earlier ``predict_nth_word`` in the notebook; it
    differs only in its defaults (``n=64``, ``k=74``).

    Args:
        text: Prompt to continue. If it tokenizes to nothing (e.g. an empty
            string), the prompt is seeded with the tokenizer's BOS token so
            the model still has one position to predict from.
        n: 1-based rank of the candidate to return (1 = most likely).
        k: Minimum size of the candidate pool. It is widened to ``n`` when
            ``n > k`` and capped at the vocabulary size, so a valid ``n`` is
            never silently out of range.

    Returns:
        The decoded n-th candidate with surrounding whitespace stripped, or
        ``""`` when ``n`` is below 1, exceeds the vocabulary size, or an
        empty prompt cannot be seeded because no BOS token is defined.
        The result is a single tokenizer token, so it can be a word fragment
        or punctuation rather than a whole word.
    """
    if n < 1:
        return ""

    # Tokenize the prompt straight into a PyTorch tensor.
    input_ids = tokenizer.encode(text, return_tensors='pt')
    if input_ids.numel() == 0:
        # A zero-length sequence has no "last position" to read logits from.
        bos_id = getattr(tokenizer, "bos_token_id", None)
        if bos_id is None:
            return ""
        input_ids = torch.tensor([[bos_id]], dtype=torch.long)

    # Inference only: no gradients needed.
    with torch.no_grad():
        next_token_logits = model(input_ids).logits[0, -1, :]

    pool_size = min(max(k, n), next_token_logits.shape[-1])
    if n > pool_size:
        return ""

    # Softmax is monotonic, so ranking the raw logits gives the same order as
    # ranking probabilities; topk already returns candidates best-first.
    ranked_token_ids = torch.topk(next_token_logits, k=pool_size).indices

    # Only the requested candidate needs decoding. GPT-2 tokens often carry a
    # leading space, hence the strip().
    return tokenizer.decode(int(ranked_token_ids[n - 1])).strip()

print("The 'predict_nth_word' function has been defined.")

The 'predict_nth_word' function has been defined.


In [27]:
modified_poem_lines = []
for line in lines:
    # split() with no arguments ignores surrounding whitespace and yields []
    # for a blank line, so this one check replaces the strip()/emptiness pair.
    words_in_line = line.split()
    if not words_in_line:
        modified_poem_lines.append("")  # Keep blank lines blank
        continue

    # Everything before the last word; this is "" for a one-word line, in
    # which case predict_nth_word receives an empty prompt.
    preceding_text = " ".join(words_in_line[:-1])

    # Predict the 64th next word using the defined function
    predicted_word = predict_nth_word(preceding_text, n=64)

    # Rebuild the line with the prediction in place of the original last word.
    # strip() drops the separator when either side is empty, so a one-word
    # line or an empty prediction leaves no stray leading/trailing space.
    new_line = f"{preceding_text} {predicted_word}".strip()

    modified_poem_lines.append(new_line)

# Join all the modified lines back into a single string to form the final modified poem
final_modified_poem = "\n".join(modified_poem_lines)

print("Original Poem:\n")
print(poem)
print("\n" + "="*30 + "\n")
print("Modified Poem (last words replaced with 64th predicted word):\n")
print(final_modified_poem)

Original Poem:

One must have a mind of winter
To regard the frost and the boughs
Of the pine-trees crusted with snow;
And have been cold a long time
To behold the junipers shagged with ice,
The spruces rough in the distant glitter
Of the January sun; and not to think
Of any misery in the sound of the wind,
In the sound of a few leaves,
Which is the sound of the land
Full of the same wind
That is blowing in the same bare place
For the listener, who listens in the snow,
And, nothing himself, beholds
Nothing that is not there and the nothing that is.


Modified Poem (last words replaced with 64th predicted word):

One must have a mind of language
To regard the frost and the bl
Of the pine-trees crusted with bark
And have been cold a long ago
To behold the junipers shagged with sh
The spruces rough in the distant landscape
Of the January sun; and not to an
Of any misery in the sound of the two
In the sound of a few breaths
Which is the sound of the building
Full of the same ."
That is blo