In [1]:
from datasets import load_dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
import pandas as pd
from datasets import DatasetDict, Dataset

# Assuming you have a DataFrame named 'df'
# Adjust column names and other details accordingly

df = pd.read_csv("sexual_harassment_dataset.csv")

# Reset the index to avoid including it as a separate column
df.reset_index(drop=True, inplace=True)

# Split your data into train, test, and unsupervised sets
train_data = df.sample(frac=0.4, random_state=42)
test_data = df.drop(train_data.index).sample(frac=0.5, random_state=42)
unsupervised_data = df.drop(train_data.index).drop(test_data.index)

# Define the features to include in the datasets
features = ['text', 'label']

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_data[features])
test_dataset = Dataset.from_pandas(test_data[features])
unsupervised_dataset = Dataset.from_pandas(unsupervised_data[features])

# Create the DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'unsupervised': unsupervised_dataset
})

# Print information about the dataset
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 61
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 46
    })
    unsupervised: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 46
    })
})


In [5]:
from datasets import DatasetDict, Dataset

# Assuming you have a DatasetDict named 'dataset_dict'
# Adjust column names and other details accordingly

# Iterate over each dataset in the DatasetDict
for split in dataset_dict.keys():
    dataset = dataset_dict[split]

    # Check if '__index_level_0__' is present in features
    if '__index_level_0__' in dataset.features:
        # Remove the '__index_level_0__' column
        dataset_dict[split] = dataset.remove_columns('__index_level_0__')

# Print updated information about the dataset_dict
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 61
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 46
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 46
    })
})


In [6]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
imdb['test'][1]['label']

0

In [9]:
dataset_dict['test'][1]['label']

1

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [11]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [13]:
tokenized_imdb = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir="./my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [20]:
trainer.train()


Step,Training Loss


TrainOutput(global_step=200, training_loss=0.056672825813293456, metrics={'train_runtime': 780.8488, 'train_samples_per_second': 3.906, 'train_steps_per_second': 0.256, 'total_flos': 37958636575032.0, 'train_loss': 0.056672825813293456, 'epoch': 50.0})

In [23]:
model.save_pretrained("./saved_model")

In [24]:
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [29]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
# Load the saved model and tokenizer
loaded_model = AutoModelForSequenceClassification.from_pretrained("./saved_model")
loaded_tokenizer = AutoTokenizer.from_pretrained("./saved_model")

# Your new text for prediction
new_text = "While walking my dog in the park, a stranger started a friendly conversation but then made a sexual comment."

# Tokenize the new text
inputs = loaded_tokenizer(new_text, return_tensors="pt")

# Make prediction
with torch.no_grad():
    outputs = loaded_model(**inputs)

# Get the predicted class
predicted_class = torch.argmax(outputs.logits, dim=1).item()

# Print the predicted class
print("Predicted Class:", predicted_class)


Predicted Class: 0
