In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
import tensorflow_datasets as tfds
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Load IMDB dataset from TensorFlow Datasets
dataset = tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True)
train_data, test_data = dataset

In [None]:
# Convert TensorFlow dataset to Hugging Face Dataset
def convert_to_hf_dataset(tf_dataset, sample_size=1000):  # Reduced sample size
    texts, labels = [], []
    for i, (text, label) in enumerate(tf_dataset):
        if i >= sample_size:
            break
        texts.append(text.numpy().decode('utf-8'))
        labels.append(int(label.numpy()))
    return Dataset.from_dict({'text': texts, 'label': labels})

hf_train_dataset = convert_to_hf_dataset(train_data, sample_size=1000)
hf_test_dataset = convert_to_hf_dataset(test_data, sample_size=500)


In [None]:
# Choose Transformer models for comparison
models = [
    "distilbert-base-uncased"
]

results = {}

def preprocess_data(examples, tokenizer):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=128)  # Reduced max_length

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

for model_name in models:
    print(f"Training model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    tokenized_train = hf_train_dataset.map(lambda x: preprocess_data(x, tokenizer), batched=True)
    tokenized_test = hf_test_dataset.map(lambda x: preprocess_data(x, tokenizer), batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=32,  # Increased batch size
        per_device_eval_batch_size=32,
        num_train_epochs=1,  # Kept training minimal
        weight_decay=0.01,
        logging_dir=f"./logs_{model_name}",
        logging_steps=50,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    results[model_name] = eval_results

Training model: distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.521347,0.806,0.806452,0.803213,0.804829


In [None]:

# Compare models
for model, metrics in results.items():
    print(f"Model: {model}")
    print(f"Accuracy: {metrics['eval_accuracy']:.4f}, F1 Score: {metrics['eval_f1']:.4f}")

Model: distilbert-base-uncased
Accuracy: 0.8060, F1 Score: 0.8048


In [1]:
from transformers import pipeline

# Load the text generation pipeline with a pre-trained GPT model
generator = pipeline("text-generation", model="gpt2")

# Define the prompt
prompt = "In a distant future, humanity has discovered"

# Generate text
story = generator(prompt, max_length=100, num_return_sequences=1, temperature=0.7)

# Print the generated story
print(story[0]['generated_text'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a distant future, humanity has discovered the limits of the universe and in that future, it will become the most efficient and efficient way to explore the universe.

MISSION STATEMENTS

The mission statement of the International Society for the Study of Nature is that "Nature is a source of all that is natural, and a source of all that is spiritual." The goal of the International Society is to promote and advance the teaching and practice of the natural sciences in the areas of psychology,
