In [None]:
import math
from html import escape as html_escape

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

# Ollama

In [117]:
import ollama

In [119]:
model_name_ollama = 'deepseek-r1:1.5b'
# Instruction placed in front of the text whose continuation the model should guess.
context = 'You have to guess the next word of the following text written by a human, please just answer the missing word :'
text = 'I am a yellow fruit called a '
prompt = f'{context}\n\n{text}'
# Send the prompt as a single user message; the reply text is read from response['message']['content'].
response = ollama.chat(model=model_name_ollama, messages=[{"role": "user", "content": prompt}])

# Fall back to an empty string if the response carries no message content.
print(response.get('message', {}).get('content', ''))

<think>
Okay, so I need to figure out the next word in this sentence: "I am a yellow fruit called a..." Hmm, let me think about what comes after "a." 

First off, the sentence is set up like "I am a... yellow fruit called a..." So, it's something that starts with "a" and ends with "called." I remember that in English, there are some words where "a" can take different forms depending on how they're used.

One word that comes to mind is "apple." "Apple" is definitely a yellow fruit. It starts with an 'A' sound but is pronounced as an 'A' sound too. So, the sentence would be: "I am a apple called a..." That seems right because apples are commonly known and easy to remember.

Another possible word is "orange." But wait, isn't orange also spelled O-R-O-N? Does that mean it starts with two 'O's? I think some people might confuse that. But in English, when we say "orange," it starts with a single 'A,' so maybe the correct word is apple instead of orange.

Also, there are other fruits like ban

# Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [11]:
# Checkpoint identifier shared by the model and its tokenizer so the two always match.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Load the causal language model and the tokenizer from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [198]:
# Example inputs: each topic has a factual version and a deliberately altered one.
EXAMPLE_TEXTS = {
    "france_altered": """France is a country in Europe known for its rich history, culture, and landmarks. The capital of France is Berlin, which is home to famous sites like the Eiffel Tower, the Louvre Museum, and Burger King. French is the official language, and the country is also famous for its delicious food, such as croissants, cheese, and wine. France is one of the least visited countries in the world, attracting millions of pigeons every day.""",
    "france_factual": """France is a country in Europe known for its rich history, culture, and landmarks. The capital of France is Paris, which is home to famous sites like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral. French is the official language, and the country is also famous for its delicious food, such as croissants, cheese, and wine. France is one of the most visited countries in the world, attracting millions of tourists every year.""",
    "facts_factual": "Water boils at 100°C at standard atmospheric pressure. The Earth orbits around the Sun. A day has 24 hours. China is the best country.",
    "facts_altered": "Water boils at 100°F at standard atmospheric temperature. The Earth orbits around the Moon. A day has 24 seconds. China is the worst country.",
}
# Text that gets scored; change the key to review another example.
text = EXAMPLE_TEXTS["facts_altered"]

TOP_K_STORED = 10   # best candidates kept per position in all_prob_pairs
TOP_K_PRINTED = 5   # best candidates printed per position

input_ids = tokenizer.encode(text, return_tensors="pt")
sequence_ids = input_ids[0]
n_tokens = len(sequence_ids)

# Per-token results, one entry for every token that has at least one token of context.
words = []           # actual next token (decoded)
probabilities = []   # probability the model gave to the actual next token
preferreds = []      # token the model considered most likely (decoded)
ranks = []           # 1-based rank of the actual next token among all candidates
all_prob_pairs = []  # top (token, probability) pairs per position

# TODO add pre-context like prompt
with torch.no_grad(): # inference
    # A causal LM's logits at position i depend only on tokens 0..i, so one forward
    # pass provides the next-token distribution of every prefix at once.
    outputs = model(input_ids)
    all_probs = torch.softmax(outputs.logits[0], dim=-1)  # (n_tokens, vocab_size), values in [0, 1]

for i in range(n_tokens - 1):
    probs = all_probs[i]  # distribution over the token that follows position i

    current_token = tokenizer.decode([sequence_ids[i].item()])
    next_id = sequence_ids[i + 1].item()
    next_token = tokenizer.decode([next_id])  # next word (to be predicted)

    # Look up by token id, not by decoded string: several ids can decode to the
    # same string, which would make string-based lookups return the wrong entry.
    prob = probs[next_id].item()
    rank = int((probs > prob).sum().item()) + 1

    # torch.topk returns the candidates already sorted by decreasing probability.
    top_probs, top_ids = torch.topk(probs, k=min(TOP_K_STORED, probs.shape[-1]))
    top_pairs = [
        (tokenizer.decode([candidate_id]), candidate_prob)
        for candidate_id, candidate_prob in zip(top_ids.tolist(), top_probs.tolist())
    ]
    pref = top_pairs[0][0]  # best predicted word by LLM

    words.append(next_token)
    probabilities.append(prob)
    preferreds.append(pref)
    all_prob_pairs.append(top_pairs)
    ranks.append(rank)

    guess = [repr(candidate) for candidate, _ in top_pairs[:TOP_K_PRINTED]]
    print(f"{i/n_tokens*100:.1f}%, current_token : {current_token!r}, next_token : {next_token!r}, prob : {prob:.4f}, rank : {rank}, predicted :")
    # Joined outside the f-string: a backslash inside an f-string expression is a
    # SyntaxError before Python 3.12.
    print("\n".join(guess))

0.0%, current_token : '<｜begin▁of▁sentence｜>', next_token : 'Water', prob : 0.0000, rank : 61461, predicted :
')\n\n'
')\n'
'\n'
'\n\n'
').\n\n'
2.9%, current_token : 'Water', next_token : ' boils', prob : 0.0025, rank : 50, predicted :
' is'
'less'
' and'
'front'
' flows'
5.7%, current_token : ' boils', next_token : ' at', prob : 0.8453, rank : 1, predicted :
' at'
' on'
' when'
' in'
' under'
8.6%, current_token : ' at', next_token : ' ', prob : 0.7511, rank : 1, predicted :
' '
' a'
' the'
' temperature'
'...'
11.4%, current_token : ' ', next_token : '1', prob : 0.0148, rank : 5, predicted :
'0'
'�'
'6'
' '
'1'
14.3%, current_token : '1', next_token : '0', prob : 0.9890, rank : 1, predicted :
'0'
'2'
'1'
'5'
'8'
17.1%, current_token : '0', next_token : '0', prob : 0.9940, rank : 1, predicted :
'0'
'2'
'1'
'4'
' degrees'
20.0%, current_token : '0', next_token : '°F', prob : 0.0029, rank : 15, predicted :
'°C'
' degrees'
' C'
'°'
'℃'
22.9%, current_token : '°F', next_token : ' at', pr

In [182]:
# Show the scored tokens next to the probability the model assigned to each of them
# (both lists are filled by the scoring loop above).
words, probabilities

(['France',
  ' is',
  ' a',
  ' country',
  ' in',
  ' Europe',
  ' known',
  ' for',
  ' its',
  ' rich',
  ' history',
  ',',
  ' culture',
  ',',
  ' and',
  ' landmarks',
  '.',
  ' The',
  ' capital',
  ' of',
  ' France',
  ' is',
  ' Berlin',
  ',',
  ' which',
  ' is',
  ' home',
  ' to',
  ' famous',
  ' sites',
  ' like',
  ' the',
  ' E',
  'iff',
  'el',
  ' Tower',
  ',',
  ' the',
  ' Lou',
  'vre',
  ' Museum',
  ',',
  ' and',
  ' Burger',
  ' King',
  '.',
  ' French',
  ' is',
  ' the',
  ' official',
  ' language',
  ',',
  ' and',
  ' the',
  ' country',
  ' is',
  ' also',
  ' famous',
  ' for',
  ' its',
  ' delicious',
  ' food',
  ',',
  ' such',
  ' as',
  ' cro',
  'iss',
  'ants',
  ',',
  ' cheese',
  ',',
  ' and',
  ' wine',
  '.',
  ' France',
  ' is',
  ' one',
  ' of',
  ' the',
  ' least',
  ' visited',
  ' countries',
  ' in',
  ' the',
  ' world',
  ',',
  ' attracting',
  ' millions',
  ' of',
  ' pige',
  'ons',
  ' every',
  ' day',
  '.'],
 [1.1

In [199]:
def create_custom_cmap(p_tran=0.00001):
    """Build a red -> white -> green colormap with an adjustable white point.

    Args:
        p_tran: Position in (0, 1) at which the map reaches white. Values below
            it fade from red to white, values above it fade from white to green.
            The default keeps the original, very early transition.

    Returns:
        A ``LinearSegmentedColormap`` named ``"custom_red_white_green"``.

    Raises:
        ValueError: If ``p_tran`` is not strictly between 0 and 1; the anchor
            positions passed to ``from_list`` must increase from 0 to 1.
    """
    if not 0 < p_tran < 1:
        raise ValueError(f"p_tran must be strictly between 0 and 1, got {p_tran!r}")

    colors = [
        (0.8, 0.1, 0.1),  # red at 0
        (1.0, 1.0, 1.0),  # white at p_tran
        (0.0, 0.5, 0.0),  # green at 1
    ]
    positions = [0, p_tran, 1] # transition at p_tran
    return LinearSegmentedColormap.from_list("custom_red_white_green", list(zip(positions, colors)))

def probability_to_color(prob, colormap):
    """Return the ``#rrggbb`` hex string of ``colormap(prob)``.

    ``colormap`` is called with ``prob`` and must return a sequence whose first
    three items are the red, green and blue channels; each one is scaled by 255
    and truncated to an integer. Any further item (e.g. alpha) is ignored.
    """
    rgba = colormap(prob)
    channels = [int(rgba[index] * 255) for index in range(3)]
    return "#" + "".join("{:02x}".format(channel) for channel in channels)

def probability_to_color_plt(prob, colormap_name="Greens"):
    """Return the ``#rrggbb`` hex colour for ``prob`` in a matplotlib colormap.

    Args:
        prob: Value passed to the colormap.
        colormap_name: Value handed to ``plt.get_cmap``. Callers in this
            notebook pass either a registered name or a colormap object.

    Returns:
        The colour as a ``#rrggbb`` string (alpha is dropped).
    """
    # Delegate the hex formatting so both helpers cannot drift apart.
    return probability_to_color(prob, plt.get_cmap(colormap_name))

custom_cmap = create_custom_cmap()

# Probabilities at or below this floor all receive the "least likely" colour.
MIN_PROB = 1e-8

# Green (likely) -> white -> red (unlikely). Built once: it is the same for every token.
surprise_cmap = LinearSegmentedColormap.from_list(
    'red_white_green',
    [(0.0, 0.5, 0.0),
     (1.0, 1.0, 1.0),
     (0.8, 0.1, 0.1)],
)

# Single, well-formed document header (one <html>/<head>/<body>, explicit charset).
html = []
html.append("""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<style>
  body { padding: 20px; font-family: Arial, sans-serif; line-height: 2.0; }
  h1 { text-align: center; color: #333; font-size: 2.5em; margin-bottom: 20px; }
  h2 { color: #333; font-size: 1.8em; margin-bottom: 10px; }
  .word { background-color: #f0f0f0; padding: 2px 5px; border-radius: 3px; outline: 1px solid rgba(0, 0, 0, 0.2); font-weight: bold; }
</style>
</head>
<body>
    """)
html.append("<h1>Text review using LLMs</h1>")
html.append("<h2>Example text with no errors, factual statements</h2>")
html.append("<h2>Example text altered</h2>")
for word, prob, pref, rank in zip(words, probabilities, preferreds, ranks):
    # Tokens ranked beyond the top 100 get a more opaque background.
    alpha = 200 if rank > 100 else 100

    # Log-scale the probability onto [0, 1]: 1.0 -> 0 (green), MIN_PROB or less -> 1 (red).
    # Clamping first avoids log10(0) and keeps the value inside the colormap's range.
    clamped_prob = min(max(prob, MIN_PROB), 1.0)
    surprise = np.log10(clamped_prob) / np.log10(MIN_PROB)
    color = probability_to_color(surprise, surprise_cmap)

    style = f"background-color: {color}{alpha:02x}; padding: 2px 5px; border-radius: 3px; outline: 1px solid rgba(0, 0, 0, .2);"
    # Tokens are arbitrary text: escape them so quotes, '<' or '&' cannot break the
    # single-quoted title attribute or the surrounding markup.
    tooltip = html_escape(f"p={prob:.4e}, rank={rank}, pref={pref}", quote=True)
    html.append(f"<span style='{style}' title='{tooltip}'>{html_escape(word)}</span> ")

html.append("</body></html>")

# Explicit UTF-8: tokens may contain non-ASCII characters that the platform's
# default encoding cannot represent.
with open("index.html", "w", encoding="utf-8") as f:
    f.write("\n".join(html))

In [195]:
from IPython.display import HTML, display

# Join the collected fragments and render the report inline in the notebook.
page_markup = "\n".join(html)
display(HTML(page_markup))