In [11]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class
class PrivacyPolicyDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Example data
texts = ["Your privacy is protected.", "We do not collect and share your data with third parties."]
labels = [1, 0]  # 1 for good, 0 for bad

# Initialize tokenizer and dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = PrivacyPolicyDataset(texts, labels, tokenizer, max_length=128)

# Data loader
dataloader = DataLoader(dataset, batch_size=2)

# Initialize model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir='./logs'
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# Train model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.txt',
 './results\\added_tokens.json')

In [17]:
from transformers import pipeline

# Load fine-tuned model
model = BertForSequenceClassification.from_pretrained('./results')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create a pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# New privacy policy
new_policy = "Your privacy is not protected, We collect and share your data with third parties."

# Get prediction
result = classifier(new_policy)
print(result)  # This will print the predicted label and score


[{'label': 'LABEL_0', 'score': 0.5905711054801941}]


In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features, glue_processors
import tensorflow as tf

# Example data
texts = ["Your privacy is important to us.", "We collect and share your data."]
labels = [1, 0]  # 1 for good, 0 for bad

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the data
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Convert to TensorFlow Dataset
def convert_to_tf_dataset(encodings, labels):
    return tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        labels
    ))

train_dataset = convert_to_tf_dataset(encodings, labels).shuffle(len(texts)).batch(2)

# Load pre-trained BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
model.fit(train_dataset, epochs=3)

# Save the model and tokenizer
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')






All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported



Epoch 2/3
Epoch 3/3


('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\vocab.txt',
 './results\\added_tokens.json')

In [2]:
from transformers import BertTokenizer, TFBertForSequenceClassification, pipeline

# Load the fine-tuned model and tokenizer
model = TFBertForSequenceClassification.from_pretrained('./results')
tokenizer = BertTokenizer.from_pretrained('./results')

# Create a pipeline for text classification
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)

# New privacy policy
new_policy = "We ensure your data is protected and not shared with third parties."

# Get prediction
result = classifier(new_policy)
print(result)  # This will print the predicted label and score


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


[{'label': 'LABEL_0', 'score': 0.6175602674484253}]


In [8]:
import gensim
import numpy as np

# Example data (replace this with your actual data)
data = [
    ['hello', 'world'],
    ['example', 'sentence']
]

# Train Word2Vec model
model = gensim.models.Word2Vec(data, min_count=1, vector_size=2, window=5)

# Get the word vectors and corresponding vocabulary
word_vectors = model.wv
vocabulary = word_vectors.index_to_key

# Save word vectors to a text file
output_file_path = "word_vectors.txt"

with open(output_file_path, 'w', encoding='utf-8') as file:
    # Write header: number of words and vector dimensionality
    file.write(f"{len(vocabulary)} {model.vector_size}\n")
    
    # Write each word and its corresponding vector
    for word in vocabulary:
        vector = ' '.join(map(str, word_vectors[word]))
        file.write(f"{word} {vector}\n")
