In [37]:
# Question 1: Sentiment Analysis with Transformers
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
from transformers import (
    pipeline,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import time
warnings.filterwarnings('ignore')

In [38]:
# Load IMDB dataset from TensorFlow Datasets
print("Loading IMDB dataset...")
(train_ds, test_ds), info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

print(f"Dataset info: {info.features}")
print(f"Training samples: {info.splits['train'].num_examples}")
print(f"Test samples: {info.splits['test'].num_examples}")

Loading IMDB dataset...
Dataset info: FeaturesDict({
    'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
    'text': Text(shape=(), dtype=string),
})
Training samples: 25000
Test samples: 25000


In [39]:
# Convert to lists for easier processing
print("Processing dataset...")
test_texts = []
test_labels = []
for i, (text, label) in enumerate(test_ds.take(1000)):
    test_texts.append(text.numpy().decode('utf-8'))
    test_labels.append(label.numpy())
    if i % 200 == 0:
        print(f"Processed {i+1} samples...")

print(f"Processed {len(test_texts)} test samples")

Processing dataset...
Processed 1 samples...
Processed 201 samples...
Processed 401 samples...
Processed 601 samples...
Processed 801 samples...
Processed 1000 test samples


In [40]:
# Example of data
print(f"\nSample review: {test_texts[0][:300]}...")
print(f"Label: {'Positive' if test_labels[0] == 1 else 'Negative'}")


Sample review: There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned ...
Label: Positive


In [41]:
print("Loading BERT sentiment analysis pipeline...")
try:
    bert_classifier = pipeline(
        "sentiment-analysis",
        model="nlptown/bert-base-multilingual-uncased-sentiment",
        return_all_scores=False
    )
except Exception as e:
    bert_classifier = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        return_all_scores=False
    )

print("Evaluating BERT model...")
bert_predictions = []
bert_confidences = []

start_time = time.time()
for i, text in enumerate(test_texts[:500]):  # Use subset for faster processing
    try:
        truncated_text = text[:512]
        result = bert_classifier(truncated_text)

        # Convert to binary labels (0=negative, 1=positive)
        if result[0]['label'] in ['POSITIVE', 'LABEL_2', '5 stars', '4 stars']:
            bert_predictions.append(1)
        else:
            bert_predictions.append(0)

        bert_confidences.append(result[0]['score'])

        if i % 100 == 0:
            print(f"Processed {i+1}/500 samples...")

    except Exception as e:
        print(f"Error processing sample {i}: {e}")
        bert_predictions.append(0)  # Default to negative
        bert_confidences.append(0.5)

bert_time = time.time() - start_time
bert_accuracy = accuracy_score(test_labels[:len(bert_predictions)], bert_predictions)

print(f"BERT Processing Time: {bert_time:.2f} seconds")
print(f"BERT Accuracy: {bert_accuracy:.4f}")
print(f"Average Confidence: {np.mean(bert_confidences):.4f}")

Loading BERT sentiment analysis pipeline...


Device set to use cpu


Evaluating BERT model...
Processed 1/500 samples...
Processed 101/500 samples...
Processed 201/500 samples...
Processed 301/500 samples...
Processed 401/500 samples...
BERT Processing Time: 285.95 seconds
BERT Accuracy: 0.7900
Average Confidence: 0.5418


In [42]:
print("Loading DistilBERT sentiment analysis pipeline...")
try:
    distilbert_classifier = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        return_all_scores=False
    )
except Exception as e:
    distilbert_classifier = None

if distilbert_classifier:
    print("Evaluating DistilBERT model...")
    distilbert_predictions = []
    distilbert_confidences = []

    start_time = time.time()
    for i, text in enumerate(test_texts[:500]):
        try:
            truncated_text = text[:512]
            result = distilbert_classifier(truncated_text)

            if result[0]['label'] == 'POSITIVE':
                distilbert_predictions.append(1)
            else:
                distilbert_predictions.append(0)

            distilbert_confidences.append(result[0]['score'])

            if i % 100 == 0:
                print(f"Processed {i+1}/500 samples...")

        except Exception as e:
            print(f"Error processing sample {i}: {e}")
            distilbert_predictions.append(0)
            distilbert_confidences.append(0.5)

    distilbert_time = time.time() - start_time
    distilbert_accuracy = accuracy_score(test_labels[:len(distilbert_predictions)], distilbert_predictions)

    print(f"DistilBERT Processing Time: {distilbert_time:.2f} seconds")
    print(f"DistilBERT Accuracy: {distilbert_accuracy:.4f}")
    print(f"Average Confidence: {np.mean(distilbert_confidences):.4f}")
else:
    distilbert_accuracy = 0
    distilbert_time = 0

Loading DistilBERT sentiment analysis pipeline...


Device set to use cpu


Evaluating DistilBERT model...
Processed 1/500 samples...
Processed 101/500 samples...
Processed 201/500 samples...
Processed 301/500 samples...
Processed 401/500 samples...
DistilBERT Processing Time: 126.10 seconds
DistilBERT Accuracy: 0.8340
Average Confidence: 0.9774


In [43]:
print("Loading RoBERTa sentiment analysis pipeline...")
try:
    roberta_classifier = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        return_all_scores=True
    )
except Exception as e:
    roberta_classifier = None

if roberta_classifier:
    print("Evaluating RoBERTa model...")
    roberta_predictions = []
    roberta_confidences = []

    start_time = time.time()
    for i, text in enumerate(test_texts[:500]):
        try:
            truncated_text = text[:512]
            results = roberta_classifier(truncated_text)

            # Find positive sentiment (LABEL_2)
            pos_score = next((r['score'] for r in results[0] if r['label'] == 'LABEL_2'), 0)
            neg_score = next((r['score'] for r in results[0] if r['label'] == 'LABEL_0'), 0)

            if pos_score > neg_score:
                roberta_predictions.append(1)
                roberta_confidences.append(pos_score)
            else:
                roberta_predictions.append(0)
                roberta_confidences.append(neg_score)

            if i % 100 == 0:
                print(f"Processed {i+1}/500 samples...")

        except Exception as e:
            print(f"Error processing sample {i}: {e}")
            roberta_predictions.append(0)
            roberta_confidences.append(0.5)

    roberta_time = time.time() - start_time
    roberta_accuracy = accuracy_score(test_labels[:len(roberta_predictions)], roberta_predictions)

    print(f"RoBERTa Processing Time: {roberta_time:.2f} seconds")
    print(f"RoBERTa Accuracy: {roberta_accuracy:.4f}")
    print(f"Average Confidence: {np.mean(roberta_confidences):.4f}")
else:
    roberta_accuracy = 0
    roberta_time = 0

Loading RoBERTa sentiment analysis pipeline...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Evaluating RoBERTa model...
Processed 1/500 samples...
Processed 101/500 samples...
Processed 201/500 samples...
Processed 301/500 samples...
Processed 401/500 samples...
RoBERTa Processing Time: 238.54 seconds
RoBERTa Accuracy: 0.5020
Average Confidence: 0.0000


In [44]:
#comparision
models_data = []
if bert_accuracy > 0:
    models_data.append(['BERT-base', bert_accuracy, bert_time, '110M'])
if distilbert_accuracy > 0:
    models_data.append(['DistilBERT', distilbert_accuracy, distilbert_time, '66M'])
if roberta_accuracy > 0:
    models_data.append(['RoBERTa', roberta_accuracy, roberta_time, '125M'])

if models_data:
    results_df = pd.DataFrame(models_data, columns=['Model', 'Accuracy', 'Time(s)', 'Parameters'])
    print(results_df.to_string(index=False))

    # Find best model
    best_model_idx = results_df['Accuracy'].idxmax()
    best_model = results_df.iloc[best_model_idx]

    print(f"\n BEST PERFORMING MODEL: {best_model['Model']}")
    print(f"Accuracy: {best_model['Accuracy']:.4f}")
    print(f"Processing Time: {best_model['Time(s)']:.2f} seconds")

    # Detailed analysis for best model
    print("\nDETAILED CLASSIFICATION REPORT (Best Model):")
    if best_model['Model'] == 'BERT-base':
        print(classification_report(test_labels[:len(bert_predictions)], bert_predictions,
                                  target_names=['Negative', 'Positive']))
    elif best_model['Model'] == 'DistilBERT':
        print(classification_report(test_labels[:len(distilbert_predictions)], distilbert_predictions,
                                  target_names=['Negative', 'Positive']))
    elif best_model['Model'] == 'RoBERTa':
        print(classification_report(test_labels[:len(roberta_predictions)], roberta_predictions,
                                  target_names=['Negative', 'Positive']))

     Model  Accuracy    Time(s) Parameters
 BERT-base     0.790 285.951953       110M
DistilBERT     0.834 126.103485        66M
   RoBERTa     0.502 238.536366       125M

 BEST PERFORMING MODEL: DistilBERT
Accuracy: 0.8340
Processing Time: 126.10 seconds

DETAILED CLASSIFICATION REPORT (Best Model):
              precision    recall  f1-score   support

    Negative       0.81      0.87      0.84       251
    Positive       0.86      0.80      0.83       249

    accuracy                           0.83       500
   macro avg       0.84      0.83      0.83       500
weighted avg       0.84      0.83      0.83       500



In [45]:
# QUESTION 2: TEXT GENERATION WITH TRANSFORMERS

# Load GPT-2 model for text generation
try:
    text_generator = pipeline(
        "text-generation",
        model="gpt2",
        tokenizer="gpt2",
        device=-1
    )
    print("GPT-2 model loaded successfully")
except Exception as e:
    print(f" Error loading GPT-2: {e}")
    text_generator = None

if text_generator:
    # Given prompt
    prompt = "In a distant future, humanity has discovered"

    print(f"Prompt: '{prompt}'")
    print("\n" + "-" * 60)
    print("GENERATED STORIES:")
    print("-" * 60)

    # Generate multiple versions with different parameters
    generation_configs = [
        {"max_length": 150, "temperature": 0.7, "do_sample": True, "top_p": 0.9, "name": "Balanced"},
        {"max_length": 150, "temperature": 0.5, "do_sample": True, "top_k": 50, "name": "Conservative"},
        {"max_length": 150, "temperature": 0.9, "do_sample": True, "top_p": 0.8, "name": "Creative"}
    ]

    for i, config in enumerate(generation_configs, 1):
        print(f"\nSTORY {i} - {config['name']} (Temperature: {config['temperature']}):")
        print("-" * 50)

        try:
            generated = text_generator(
                prompt,
                max_length=config["max_length"],
                temperature=config["temperature"],
                do_sample=config["do_sample"],
                top_p=config.get("top_p", 1.0),
                top_k=config.get("top_k", 0),
                pad_token_id=50256,
                num_return_sequences=1,
                truncation=True
            )

            story = generated[0]['generated_text']
            print(story)
            print(f"\nLength: {len(story.split())} words")

        except Exception as e:
            print(f"Error generating story: {e}")

    # ADDITIONAL TEXT GENERATION EXAMPLES
    additional_prompts = [
        "The last library on Earth contained",
        "When artificial intelligence gained consciousness, it",
        "In the depths of the ocean, scientists discovered"
    ]

    for j, prompt in enumerate(additional_prompts, 1):
        print(f"\n{j}. Prompt: '{prompt}'")
        print("-" * 30)

        try:
            generated = text_generator(
                prompt,
                max_length=120,
                temperature=0.8,
                do_sample=True,
                top_p=0.9,
                pad_token_id=50256,
                num_return_sequences=1,
                truncation=True
            )

            story = generated[0]['generated_text']
            print(story)

        except Exception as e:
            print(f"Error generating story: {e}")

Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


GPT-2 model loaded successfully
Prompt: 'In a distant future, humanity has discovered'

------------------------------------------------------------
GENERATED STORIES:
------------------------------------------------------------

STORY 1 - Balanced (Temperature: 0.7):
--------------------------------------------------


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In a distant future, humanity has discovered the secrets of the past and the secrets of the future.

The future of humanity is not a living future, but an extinct one. We are at a crossroads. Our civilization is at a crossroads. We can no longer be the people who have lived through the past. We have to make our way through the present. We have to make our way through the future.

I'll be speaking at the National Public Radio Conference on September 28, 2013, in Las Vegas, Nevada.

The following is an excerpt from a talk I gave last year in New Orleans, Louisiana, on the topic of the future of civilization.

"I'm talking about the future of civilization," you say. "We are at a crossroads."

You are right.

I know it's difficult to say this, but I'll give you an example. The past is not the future. The past is the future. We are living through an old era. We are living through an age of scarcity and a time of loss. We are living through an age of overpopulation. We are living through an 

Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In a distant future, humanity has discovered that it is possible to create an alternative energy source, a source of energy that is not only more efficient but is also more sustainable.

"We need to get to the point where we can build a system that is sustainable," said Dr. A.J. Gurney, a physicist at the University of California, Los Angeles.

The idea is to build a system that is sustainable only by applying a certain amount of energy to the environment. The system would be able to produce a high level of energy from sunlight, which is needed to power a turbine. The system would also be able to use sunlight to generate electricity.

"We're doing it in a way that is sustainable, but it's not sustainable in a way that is sustainable in a way that is sustainable in a way that is sustainable in a way that is sustainable in a way that is sustainable in a way that is sustainable in a way that is sustainable in a way that is sustainable in a way that is sustainable in a way that is sustaina

Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In a distant future, humanity has discovered a way to solve some of the world's most pressing problems: climate change and the nuclear option. This kind of technology could make a huge difference in the lives of millions of people. But how do we know what happens to our world?

In a world of constant conflict, the "fire and fury" of war has been both compelling and incredibly detrimental to the very survival of humankind. A few of us are simply better off today than we were when we were first born. But these differences have caused a great deal of stress for some of us.

Take my dad. A man who spends most of his time in a wheelchair and constantly suffers from a debilitating condition that will likely be difficult to diagnose. While he has his own company, I have the honor of serving in a team that includes Dr. Neuberger, who also happens to be a nuclear physicist. In fact, he is the first member of the team to be awarded the Nobel Prize in Physiology or Medicine in 1993.

I feel like 

Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


The last library on Earth contained a very small number of books, many of which were devoted to the Bible. But some of them were not very long, and some were only about a year old.

It was the first time in our history that any book had been preserved. The oldest book on the planet is a little over a century old, and its contents are quite extensive. It is a large collection of manuscripts of the Bible which was published in 1620. The oldest manuscripts were collected in 1836, and were in use from 1872 until 1876. It is still a very interesting book.

The book is made up of a number of sections, some of which are very long, and some of which are very short. The main section in particular is very short. It consists of a long introduction to the subject.

The Bible contains many other documents, especially the Book of Mormon. It is the only major document on the planet that contains the Bible in its entirety.

The book contains many other items, including a copy of the Bible and other bo

Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


When artificial intelligence gained consciousness, it became increasingly difficult for the human race to comprehend the truth, or even to understand the world around us.

This is why the AI revolution has always been difficult, because the human race has been in the forefront of the battle against AI and has been fighting against the very notion of being a "superintelligence".

To give you an idea of the problem of human versus AI, imagine that you are a child who is playing with your grandfather's dog in a park. Your grandfather has been fighting the evil AI for over a year, and it's getting more and more vicious, and your grandmother is watching over you.

The two of you will have different stories, and in the end your grandfather's dog is going to kill you.

This is what happens when you don't have the tools to deal with the evil AI.

This is what happens when you don't have the tools to deal with the evil AI.

You're not smart enough to fight it. You're not smart enough to underst