In [1]:
import numpy as np

# Seed NumPy's global random state so repeated runs print the same numbers
np.random.seed(42)

# --- Parameters and embeddings ---
SEQ_LEN, D_MODEL = 5, 4

# Token embeddings: one row per token (SEQ_LEN rows), one column per
# feature dimension (D_MODEL columns)
E_embeddings = np.array(
    [
        [0.5, 0.2, -0.3, 0.8],
        [0.1, 0.6, 0.7, -0.2],
        [-0.4, 0.3, 0.9, 0.5],
        [0.9, -0.1, 0.2, 0.6],
        [0.2, 0.7, -0.5, 0.4],
    ]
)

# --- RFF positional-encoding function ---
def rff_pe_numpy(seq_len, d_model, scale=0.5, rng=None):
    """Build a random-Fourier-feature (RFF) positional encoding.

    For each of the ``d_model`` feature dimensions one frequency ``w`` is
    drawn from ``Normal(0, scale)`` and one phase shift ``b`` from
    ``Uniform(0, 2*pi)``. Position ``t`` (counted from 1 up to ``seq_len``)
    is then encoded in that dimension as ``cos(w * t + b)``.

    Args:
        seq_len: Number of positions to encode.
        d_model: Number of feature dimensions per position.
        scale: Standard deviation of the normal distribution the
            frequencies are drawn from.
        rng: Optional random source exposing ``normal`` and ``uniform``
            (e.g. ``np.random.default_rng(seed)``). When ``None`` the
            global ``np.random`` state is used, so ``np.random.seed``
            controls the result exactly as before.

    Returns:
        ``np.ndarray`` of shape ``(seq_len, d_model)``; every entry is a
        cosine value and therefore lies in ``[-1, 1]``.
    """
    # Fall back to the module-level legacy generator to stay backward-compatible
    sampler = np.random if rng is None else rng

    # Draw order (frequencies first, then phases) is fixed so that seeded
    # runs keep producing the same encoding.
    # W: random frequencies, shape (d_model, 1)
    W = sampler.normal(loc=0.0, scale=scale, size=(d_model, 1))
    # b: random phase shifts, shape (d_model, 1)
    b = sampler.uniform(low=0.0, high=2 * np.pi, size=(d_model, 1))

    # Positions are 1-based: 1, 2, ..., seq_len
    t_positions = np.arange(1, seq_len + 1)

    # Argument W * t + b: (d_model, 1) @ (1, seq_len) -> (d_model, seq_len);
    # b broadcasts across the position axis
    arg = W @ t_positions[None, :] + b

    # Apply the cosine and transpose so that rows index positions
    P_t = np.cos(arg).T

    return P_t

# --- Application ---
# Generate the positional codes and add them element-wise to the embeddings
P_rff = rff_pe_numpy(SEQ_LEN, D_MODEL)
E_final_rff = E_embeddings + P_rff

# Print the final vector of every word, rounded to three decimals
print("\n--- Nihai Konumsal Kodlanmış Embeddingler ---")
for kelime, vektor in zip(
    ["Doğal", "dil", "işleme", "ödevini", "yaptım."], E_final_rff
):
    print(f"{kelime}: {np.round(vektor, 3)}")



--- Nihai Konumsal Kodlanmış Embeddingler ---
Doğal: [0.836 0.813 0.472 1.797]
dil: [0.194 1.266 1.23  0.576]
işleme: [-0.554  1.016  1.132  0.627]
ödevini: [0.508 0.663 0.111 0.007]
yaptım.: [-0.406  1.505 -0.902 -0.585]


In [3]:
import torch
import tiktoken
# GPT-2 byte-pair-encoding tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text to a 1-D tensor of token ids and stack them along a new
# leading axis; torch.stack needs all tensors to have the same length
batch = torch.stack(
    [torch.tensor(tokenizer.encode(txt)) for txt in (txt1, txt2)],
    dim=0,
)

In [4]:
# Show the stacked token-id tensor (one row per input text)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [10]:
# Two sentences whose token-id lists end up with different lengths
texts = [
    "I didn't owe you anything",
    "I don't want to rush life and work",
]

# Encode every sentence into its list of token ids
tokenized = list(map(tokenizer.encode, texts))
print(tokenized)


[[40, 1422, 470, 19059, 345, 1997], [40, 836, 470, 765, 284, 10484, 1204, 290, 670]]


In [11]:
# Length of the longest token-id list
max_len = max(map(len, tokenized))

In [12]:
# Right-pad every token-id list with zeros up to max_len.
# NOTE(review): 0 may also be a regular vocabulary id in this encoding —
# confirm that downstream code masks the padded positions.
padded = [
    [*ids, *([0] * (max_len - len(ids)))]
    for ids in tokenized
]
print(padded)

[[40, 1422, 470, 19059, 345, 1997, 0, 0, 0], [40, 836, 470, 765, 284, 10484, 1204, 290, 670]]
