In [None]:
# !pip install transformers torch scikit-learn

In [None]:
# !pip install transformers datasets torch scikit-learn faker

In [None]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

In [None]:
import random
import string
import pandas as pd
from faker import Faker

fake = Faker()
random.seed(42)

def gen_access_key():
    # AWS Access Key IDs often start with 'AKIA' and are ~20 chars total
    return "AKIA" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))

def gen_secret_key():
    # Secret keys are typically ~40 chars, base64url-like (letters, digits, /+=)
    alphabet = string.ascii_letters + string.digits + '/+='
    return ''.join(random.choices(alphabet, k=40))

# templates for embedding keys into natural text
pos_templates = [
    "My AWS Access Key ID is {}. Please add it to the config.",
    "Use the following secret to authenticate: {}",
    "Credentials -> AccessKey: {} SecretKey: {}",
    "I stored the key {} in the credentials file.",
    "Here's the AWS secret: {} (do not share).",
    "Found a key: {} - needs rotation."
]

# negative templates (no real keys)
neg_templates = [
    "Please check the AWS docs at https://docs.aws.amazon.com/",
    "Contact me at {} for more info.",
    "The build passed successfully on commit {}.",
    "This string {} is a random token but not a secret.",
    "Environment variable was redacted: [REDACTED].",
    "We will rotate credentials every 90 days."
]

def gen_dataset(n_positives=500, n_negatives=500):
    rows = []

    # positives
    for _ in range(n_positives):
        typ = random.choice(["access", "secret", "both"])
        t = random.choice(pos_templates)
        
        if "AccessKey:" in t and "SecretKey:" in t:
            # This template needs both keys
            ak = gen_access_key()
            sk = gen_secret_key()
            text = t.format(ak, sk)
        else:
            # single placeholder
            if typ == "access":
                text = t.format(gen_access_key())
            elif typ == "secret":
                text = t.format(gen_secret_key())
            else:
                # both combined safely into one sentence
                text = f"AccessKey: {gen_access_key()} SecretKey: {gen_secret_key()} - store securely."
                
        rows.append({"text": text, "label": 1})

    # negatives
    for _ in range(n_negatives):
        template = random.choice(neg_templates)
        token = fake.user_name() if "{}" in template else ""
        random_token = ''.join(random.choices(string.ascii_letters + string.digits, k=12))
        if "{}" in template:
            text = template.format(token if random.random() < 0.6 else random_token)
        else:
            text = template
        if random.random() < 0.2:
            fake_like_ak = "BKIA" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))
            text = f"{text} Note: token {fake_like_ak} is not an AWS key."
        rows.append({"text": text, "label": 0})

    df = pd.DataFrame(rows)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return df

if __name__ == "__main__":
    df = gen_dataset(n_positives=500, n_negatives=500)
    df.to_csv("aws_key_synthetic_dataset.csv", index=False)
    print("Saved aws_key_synthetic_dataset.csv with shape:", df.shape)
    print(df.head(12).to_string(index=False))


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# ✅ Load dataset
df = pd.read_csv("aws_key_synthetic_dataset.csv")

# Split into train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# ✅ Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# ✅ Create torch datasets
class KeyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = KeyDataset(train_encodings, train_labels)
test_dataset = KeyDataset(test_encodings, test_labels)

# ✅ Load DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

# ✅ Training arguments (compatible with older transformers versions)
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,  # keeps only the last checkpoint
)

# ✅ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# ✅ Train model
trainer.train()

# ✅ Evaluate
preds = trainer.predict(test_dataset)
y_pred = preds.predictions.argmax(-1)
print(classification_report(test_labels, y_pred))


  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.5403
20,0.1323
30,0.0193
40,0.0062
50,0.0036
60,0.0027
70,0.0022
80,0.002
90,0.002
100,0.0018


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       100

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [None]:
test_sentences = [
    "My AWS Access Key ID is AKIAJ4D7H8G9F2H2L7P1, rotate it soon.",
    "The secret key is aH8tA6G9uP9f2D5kQ0zH8D1nC5lR9yU0K8XrY2a, do not share!",
    "AccessKey: AKIAK1J2K3L4M5N6O7P8 SecretKey: aH8tA6G9uP9f2D5kQ0zH8D1nC5lR9yU0K8XrY2a",
    "Please save the document on your local machine.",
    "Note: token AKIAZXY1234567890123456 is not an AWS key."
]

In [9]:
import torch

# detect device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# tokenize
inputs = tokenizer(test_sentences, padding=True, truncation=True, return_tensors="pt")
# move inputs to same device as model
inputs = {k: v.to(device) for k, v in inputs.items()}

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

predictions = torch.argmax(outputs.logits, dim=-1)
print(predictions)  # 1 = sensitive, 0 = non-sensitive


tensor([1, 1, 1, 0, 1], device='cuda:0')


In [6]:
model.save_pretrained("aws_key_detector")
tokenizer.save_pretrained("aws_key_detector")


('aws_key_detector\\tokenizer_config.json',
 'aws_key_detector\\special_tokens_map.json',
 'aws_key_detector\\vocab.txt',
 'aws_key_detector\\added_tokens.json',
 'aws_key_detector\\tokenizer.json')

In [7]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
model = DistilBertForSequenceClassification.from_pretrained("aws_key_detector")
tokenizer = DistilBertTokenizerFast.from_pretrained("aws_key_detector")


In [10]:
import random
import pandas as pd
from faker import Faker
import string

fake = Faker()
random.seed(42)

# Templates for each sensitive type
templates = {
    "aws_access": [
        "My AWS Access Key is {}",
        "Use this AccessKey for AWS: {}"
    ],
    "aws_secret": [
        "AWS SecretKey: {}",
        "Do not share this secret: {}"
    ],
    "email": [
        "Please contact me at {}",
        "Send credentials to {}"
    ],
    "ssn": [
        "Employee SSN: {}",
        "SSN is {}"
    ],
    "credit_card": [
        "Credit card number: {}",
        "Use card {} for payment"
    ],
    "phone": [
        "Call me at {}",
        "Phone number: {}"
    ],
    "url_token": [
        "API URL: {}",
        "Use this token: {}"
    ]
}

# Function to generate random secrets
def gen_aws_access(): return "AKIA" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))
def gen_aws_secret(): return ''.join(random.choices(string.ascii_letters + string.digits + '/+=', k=40))
def gen_credit_card(): return ' '.join([''.join(random.choices(string.digits, k=4)) for _ in range(4)])
def gen_url_token(): return f"https://api.example.com?key={''.join(random.choices(string.ascii_letters + string.digits, k=10))}"

gen_functions = {
    "aws_access": gen_aws_access,
    "aws_secret": gen_aws_secret,
    "email": fake.email,
    "ssn": lambda: f"{random.randint(100,999)}-{random.randint(10,99)}-{random.randint(1000,9999)}",
    "credit_card": gen_credit_card,
    "phone": fake.phone_number,
    "url_token": gen_url_token
}

# Generate dataset
def generate_dataset(n_samples_per_type=200):
    rows = []
    for label, funcs in templates.items():
        for _ in range(n_samples_per_type):
            value = gen_functions[label]()
            text = random.choice(funcs).format(value)
            rows.append({"text": text, "label": 1, "type": label})

    # Add negatives (non-sensitive text)
    neg_texts = [
        "The meeting is scheduled for 3 PM",
        "Please review the attached document",
        "Server logs indicate no errors",
        "Random string: {}".format(''.join(random.choices(string.ascii_letters, k=12)))
    ]
    for _ in range(n_samples_per_type * len(templates)):
        text = random.choice(neg_texts)
        rows.append({"text": text, "label": 0, "type": "none"})

    df = pd.DataFrame(rows)
    return df.sample(frac=1, random_state=42).reset_index(drop=True)

# Generate and save
df = generate_dataset(200)
df.to_csv("multi_sensitive_dataset.csv", index=False)
print(df.head(10).to_string(index=False))


                                           text  label      type
               Phone number: (287)410-3823x7311      1     phone
                             SSN is 486-45-4104      1       ssn
              The meeting is scheduled for 3 PM      0      none
                     Phone number: 579.664.4012      1     phone
                 Server logs indicate no errors      0      none
                 Server logs indicate no errors      0      none
API URL: https://api.example.com?key=bwSjKht15z      1 url_token
              The meeting is scheduled for 3 PM      0      none
                    Random string: hqNKuwyLivQL      0      none
              Call me at 001-680-259-9316x22608      1     phone


In [13]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

# -----------------------------
# Load multi-type sensitive dataset
# -----------------------------
df = pd.read_csv("multi_sensitive_dataset.csv")  # columns: text, label
print("Dataset shape:", df.shape)
print(df.head(5))

# -----------------------------
# Prepare train/test splits
# -----------------------------
texts = df['text'].tolist()
labels = df['label'].tolist()  # 1=sensitive, 0=non-sensitive

train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# -----------------------------
# Tokenizer
# -----------------------------
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# -----------------------------
# Dataset class
# -----------------------------
class KeyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = KeyDataset(train_encodings, train_labels)
test_dataset = KeyDataset(test_encodings, test_labels)

# -----------------------------
# DistilBERT model
# -----------------------------
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# -----------------------------
# Training arguments (older-compatible)
# -----------------------------
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=20,
    save_total_limit=1,
    do_train=True,
    do_eval=True
)

# -----------------------------
# Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# -----------------------------
# Train
# -----------------------------
trainer.train()

# -----------------------------
# Evaluate
# -----------------------------
preds = trainer.predict(test_dataset)
y_pred = preds.predictions.argmax(-1)
print(classification_report(test_labels, y_pred))

# -----------------------------
# Regex patterns for extraction
# -----------------------------
regex_dict = {
    "aws_access": r'\bAKIA[0-9A-Z]{16}\b',
    "aws_secret": r'\b[A-Za-z0-9/+=]{40}\b',
    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
    "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
    "credit_card": r'\b(?:\d[ -]*?){13,16}\b',
    "phone": r'\+?\d[\d\s-]{7,}\d',
    "url_token": r'https?://[^\s]+'
}

def extract_sensitive(text):
    extracted = {}
    for key, pattern in regex_dict.items():
        match = re.search(pattern, text)
        extracted[key] = match.group() if match else None
    return extracted

# -----------------------------
# Test: Predict + Extract
# -----------------------------
def predict_and_extract(texts):
    if isinstance(texts, str):
        texts = [texts]
    enc = tokenizer(texts, truncation=True, padding=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**enc)
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
    results = []
    for text, pred in zip(texts, preds):
        res = {"text": text, "contains_sensitive": bool(pred)}
        res.update(extract_sensitive(text) if pred==1 else {k: None for k in regex_dict.keys()})
        results.append(res)
    return results

# -----------------------------
# Example Test Sentences
# -----------------------------
test_sentences = [
    "My AWS Access Key ID is AKIAJ4D7H8G9F2H2L7P1",
    "The document is saved on my local machine",
    "Email: john.doe@example.com",
    "Employee SSN: 123-45-6789",
    "Credit card number: 4111 1111 1111 1111",
    "Use this API: https://api.example.com?key=abc123",
    "Call me at +1-202-555-0134"
]

results = predict_and_extract(test_sentences)
for r in results:
    print(r)


Dataset shape: (2800, 3)
                                text  label   type
0   Phone number: (287)410-3823x7311      1  phone
1                 SSN is 486-45-4104      1    ssn
2  The meeting is scheduled for 3 PM      0   none
3         Phone number: 579.664.4012      1  phone
4     Server logs indicate no errors      0   none


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
20,0.263
40,0.0069
60,0.0018
80,0.0012
100,0.0009
120,0.0007
140,0.0006
160,0.0005
180,0.0005
200,0.0004


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       254
           1       1.00      1.00      1.00       306

    accuracy                           1.00       560
   macro avg       1.00      1.00      1.00       560
weighted avg       1.00      1.00      1.00       560

{'text': 'My AWS Access Key ID is AKIAJ4D7H8G9F2H2L7P1', 'contains_sensitive': True, 'aws_access': 'AKIAJ4D7H8G9F2H2L7P1', 'aws_secret': None, 'email': None, 'ssn': None, 'credit_card': None, 'phone': None, 'url_token': None}
{'text': 'The document is saved on my local machine', 'contains_sensitive': False, 'aws_access': None, 'aws_secret': None, 'email': None, 'ssn': None, 'credit_card': None, 'phone': None, 'url_token': None}
{'text': 'Email: john.doe@example.com', 'contains_sensitive': True, 'aws_access': None, 'aws_secret': None, 'email': 'john.doe@example.com', 'ssn': None, 'credit_card': None, 'phone': None, 'url_token': None}
{'text': 'Employee SSN: 123-45-6