In [14]:
from transformers import TrainingArguments, Trainer, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset

# Paths for Kaggle
TRAIN_FILE = "/kaggle/input/password-dataset/password_dataset.json"
MODEL_OUTPUT_DIR = "/kaggle/working/password_model"

# Load dataset
dataset = load_dataset("json", data_files={"train": TRAIN_FILE})["train"]

# Use a smaller GPT-2 model
MODEL_NAME = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # ✅ Fix padding issue

# Load model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Tokenization function (Correct for Causal LM)
def tokenize_function(examples):
    input_encodings = tokenizer(examples["weak"], truncation=True, padding="max_length", max_length=32)
    output_encodings = tokenizer(examples["strong"], truncation=True, padding="max_length", max_length=32)

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": output_encodings["input_ids"],  # ✅ Properly map strong passwords as labels
    }

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["weak", "strong"])

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Training configuration
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    per_device_train_batch_size=16,  
    gradient_accumulation_steps=1,
    num_train_epochs=3,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,
    logging_dir="/kaggle/working/logs",
    logging_steps=100,
    fp16=True,  
    dataloader_num_workers=2,
    optim="adamw_torch",
    remove_unused_columns=False,
    report_to="none",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save model
trainer.save_model(MODEL_OUTPUT_DIR)
print(f"✅ Model saved to {MODEL_OUTPUT_DIR}")


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/99964 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
100,4.2433
200,1.2043
300,0.513
400,0.3739
500,0.3491
600,0.3329
700,0.3009
800,0.2905
900,0.2853
1000,0.2847




✅ Model saved to /kaggle/working/password_model


In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# Paths for Kaggle
TRAIN_FILE = "/kaggle/input/password-dataset/password_dataset.json"
MODEL_OUTPUT_DIR = "/kaggle/working/password_model"

# Load dataset using pandas
df = pd.read_json(TRAIN_FILE)

# Convert dataset into a format GPT-2 can learn from
df["text"] = df["weak"] + " -> " + df["strong"]
dataset = Dataset.from_pandas(df[["text"]])  # Convert to Hugging Face Dataset

# Load model and tokenizer
MODEL_NAME = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # Fix padding issues
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Tokenization function with labels
def tokenize_function(examples):
    encodings = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64)
    
    # Shift labels to predict next token
    encodings["labels"] = encodings["input_ids"].copy()
    return encodings

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Training setup
training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    per_device_train_batch_size=16,  
    num_train_epochs=5,
    save_strategy="epoch",
    logging_dir="/kaggle/working/logs",
    logging_steps=100,
    fp16=True,  
    optim="adamw_torch",
    report_to="none",
)

# Train model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Save model
trainer.save_model(MODEL_OUTPUT_DIR)
print(f"✅ Model saved to {MODEL_OUTPUT_DIR}")


Map:   0%|          | 0/99964 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
100,0.9879
200,0.5966
300,0.4971
400,0.4439
500,0.4112
600,0.3892
700,0.3722
800,0.362
900,0.3594
1000,0.3492




✅ Model saved to /kaggle/working/password_model


In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load trained model and tokenizer
MODEL_OUTPUT_DIR = "/kaggle/working/password_model"
model = AutoModelForCausalLM.from_pretrained(MODEL_OUTPUT_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_OUTPUT_DIR)

# Ensure PAD token is set
tokenizer.pad_token = tokenizer.eos_token

# Function to generate multiple strong passwords
def generate_strong_passwords(weak_password, min_length=12, num_variations=3):
    input_ids = tokenizer(weak_password, return_tensors="pt").input_ids
    strong_passwords = []

    while len(strong_passwords) < num_variations:
        output_ids = model.generate(
            input_ids, 
            max_length=32,  # Prevent overly long outputs
            do_sample=True,  
            top_k=50,  
            top_p=0.95,  
            temperature=0.7,  
            repetition_penalty=1.2,  
            pad_token_id=tokenizer.eos_token_id  
        )

        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract only the generated strong password (removes input part if repeated)
        strong_password = generated_text.replace(weak_password, "").strip()

        if len(strong_password) >= min_length and strong_password not in strong_passwords:
            strong_passwords.append(strong_password)

    return strong_passwords

# Test with known and unknown weak passwords
test_passwords = ["password", "hello", "qwerty", "portman45", "123456", "letmein", "piyushraj"]

for weak in test_passwords:
    strong_variations = generate_strong_passwords(weak)
    print(f"Weak: {weak}")
    for i, strong in enumerate(strong_variations, 1):
        print(f"  🔹 Strong {i}: {strong}")
    print()


Weak: password
  🔹 Strong 1: 1 -> p@$$W%0Rd1$9
  🔹 Strong 2: 1 -> P@$$w0R&D135
  🔹 Strong 3: 1 -> P@$$W0Rd%13#

Weak: hello
  🔹 Strong 1: -> h3Ll@0!5%@1*
  🔹 Strong 2: -> H3Ll^00&8!%9
  🔹 Strong 3: -> h3lL*0$8^^2#

Weak: qwerty
  🔹 Strong 1: 123 -> qw3r7y1@2306
  🔹 Strong 2: uiop -> QW3R7yu1*0P5
  🔹 Strong 3: 123 -> Qw3R7y1%23!4

Weak: portman45
  🔹 Strong 1: -> P@$$w0r#N45!
  🔹 Strong 2: -> p$@7M@N45%2
  🔹 Strong 3: -> p@$&7m3R45^5

Weak: 123456
  🔹 Strong 1: -> 12345$65&^%3
  🔹 Strong 2: -> 12345!6#7@3$
  🔹 Strong 3: -> 12345!6@8$54

Weak: letmein
  🔹 Strong 1: -> L37M3#1N@!^5
  🔹 Strong 2: -> l37M3^1N!0$@
  🔹 Strong 3: -> l37m31#n3!%^

Weak: piyushraj
  🔹 Strong 1: -> p@1n#YU@2!0
  🔹 Strong 2: -> p#1y@qUw3r*%7
  🔹 Strong 3: -> p1#nK3Y@R^4%

