In [9]:
import pandas as pd

df = pd.read_csv('rsa_primes.csv')

with open('fine_tune.txt', 'w') as f:
    for _, row in df.iterrows():
        line = f"n: {row['n']} -> p: {row['p']} q: {row['q']}\n"
        f.write(line)


In [16]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load tokenizer and model from local path
model_path = "models/gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Load and tokenize dataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="fine_tune.txt",
    block_size=512,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    save_steps=500,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()
model.save_pretrained("models/fine_tuned_gpt2")
tokenizer.save_pretrained("models/fine_tuned_gpt2")





`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,2.3662
1000,2.2568


('models/fine_tuned_gpt2\\tokenizer_config.json',
 'models/fine_tuned_gpt2\\special_tokens_map.json',
 'models/fine_tuned_gpt2\\vocab.json',
 'models/fine_tuned_gpt2\\merges.txt',
 'models/fine_tuned_gpt2\\added_tokens.json')

In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sympy import isprime
import torch
from tqdm import tqdm

test_df = pd.read_csv("rsa_primes_test.csv")
# Load fine-tuned model
model_path = "models/fine_tuned_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

def extract_digits(text):
    return ''.join(filter(str.isdigit, text))

def predict_factors(n_str, max_attempts=3):
    prompt = f"n: {n_str} ->"
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    for _ in range(max_attempts):
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_length=300,
                num_return_sequences=1,
                temperature=0.9,
                do_sample=True,
                top_k=50,
                pad_token_id=tokenizer.eos_token_id
            )
        decoded = tokenizer.decode(outputs[0])
        try:
            p_raw = decoded.split("p:")[1].split("q:")[0].strip()
            q_raw = decoded.split("q:")[1].strip()

            p_val = int(extract_digits(p_raw))
            q_val = int(extract_digits(q_raw))

            return p_val, q_val
        except:
            continue
    return None, None


correct = 0
total = 0

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    actual_p = int(str(row['p'])[1:])  # remove 'a' prefix
    actual_q = int(str(row['q'])[1:])
    n = str(row['n'])[1:]

    pred_p, pred_q = predict_factors(n)

    if pred_p is not None and pred_q is not None:
        if {pred_p, pred_q} == {actual_p, actual_q}:
            correct += 1
    total += 1

accuracy = correct / total * 100
print(f"\n✅ Model Accuracy: {accuracy:.2f}% on {total} test cases")



 16%|████████████▍                                                                | 324/2000 [46:09<4:01:47,  8.66s/it]

In [17]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from sympy import isprime

model_path = "models/fine_tuned_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

model.eval()

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from sympy import isprime

model_path = "models/fine_tuned_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

model.eval()

def predict_factors(n_str, max_attempts=3):
    prompt = f"n: {n_str} ->"
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    for _ in range(max_attempts):
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_length=300,
                num_return_sequences=1,
                temperature=0.9,
                do_sample=True,
                top_k=50
            )

        decoded = tokenizer.decode(outputs[0])
        print(f"\n🔍 Output: {decoded}")

        # Try to extract p and q
        if "p:" in decoded and "q:" in decoded:
            try:
                p_str = decoded.split("p:")[1].split("q:")[0].strip()
                q_str = decoded.split("q:")[1].split("\n")[0].strip()
                # if isprime(int(p_str[1:])) and isprime(int(q_str[1:])):
                #     return p_str, q_str
                return p_str[1:], q_str[1:]
            except:
                continue

    return None, None

# Example usage

n = "a3351951982485649274893506249551461531869841455148098344431851084682782361479386449415440197611684937280096407359711927382771525195526408454081639758934933"
p_pred, q_pred = predict_factors(n)
print(f"\n✅ Predicted primes:\np = {p_pred}\nq = {q_pred}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🔍 Output: n: a26479028122147196027 -> p: a57896044618658097711785492504343953926634992332820282019729895530604748151885703 q: a578960446186580977117854925043439539266349923328202820197345642750367560452079
n: a3351951982485649274893506249551461531869841455148098344431889089901641256272698008869603514787823233411898829772575676905146517234063295490989794828535099 -> p: a5789604461865809771178549250434395392663499233282028201973378581667289738963 q: a57896044618658097711785492504343953926634992332820282019740669816772938853907
n: a33519519824856492748935062495514615318698414551480983444318791609551216058855603918151466459914281712448898127959257845139833

✅ Predicted primes:
p = 57896044618658097711785492504343953926634992332820282019729895530604748151885703
q = 578960446186580977117854925043439539266349923328202820197345642750367560452079


In [6]:
# Given RSA public key components
p = 57896044618658097711785492504343953926634992332820282019739415617116836954601
q = 57896044618658097711785492504343953926634992332820282019734762335316498173133
n = 3351951982485649274893506249551461531869841455148098344431851084682782361479386449415440197611684937280096407359711927382771525195526408454081639758934933
e = 65537

# Message to encrypt
message = "HELLO"

# Convert message to numbers (A=1 to Z=26), zero-padded to 2 digits each
number_str = ''.join([f"{ord(char)-64:02d}" for char in message.upper()])
m = int(number_str)

print("Plaintext number:", m)

# Encrypt using RSA: c = m^e mod n
c = pow(m, e, n)

print("Encrypted ciphertext:", c)


Plaintext number: 805121215
Encrypted ciphertext: 1265486306274259077781284030375601665475060877223086162653019628051397314338821524969155356865578468897576975544391903911393398918764660404040400316337347


In [11]:
int(q_pred)

5789604461865809771178549250434395392663499233282028201973406588016424694789

In [15]:
from sympy import mod_inverse

# Given RSA components
p = int(p_pred)
q = int(q_pred)
# p = 57896044618658097711785492504343953926634992332820282019739415617116836954601
# q = 57896044618658097711785492504343953926634992332820282019734762335316498173133
n = p * q
e = 65537

print("ciphertext:", c)

# 1. Compute phi(n)
phi_n = (p - 1) * (q - 1)

# 2. Compute private key d
d = mod_inverse(e, phi_n)

# 3. Decrypt: m = c^d mod n
m = pow(c, d, n)

print("Decrypted numeric message:", m)

# 4. Convert numeric message into alphabetic string using mod 26
m_str = str(m)
if len(m_str) % 2 != 0:
    m_str = '0' + m_str

# Split into 2-digit chunks
decoded_chars = []
for i in range(0, len(m_str), 2):
    val = int(m_str[i:i+2]) % 26
    if val == 0:
        val = 26
    decoded_chars.append(chr(val + 64))  # 1->A, 2->B, ..., 26->Z

decoded_message = ''.join(decoded_chars)
print("Decrypted text message:", decoded_message)


ciphertext: 1265486306274259077781284030375601665475060877223086162653019628051397314338821524969155356865578468897576975544391903911393398918764660404040400316337347
Decrypted numeric message: 2459704585754303232621562479658355849468497433466205433760616098786060803207868904198188180366020197512663334662778258494037023157535533155655952152652
Decrypted text message: BSSDFEBDFFJOJURFIFWTFSQHNTBGXFPIIHFHCTZPLOTRCBJHTSWLNGHNAZYFPCRWOWIAECMGZOZZ
