In [None]:
%%capture
!pip install wandb
!pip install transformers
!pip install aif360

# Imports

In [None]:
import pandas as pd
import wandb
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import pipeline
from transformers import set_seed

# Text generation with nucleus sampling

In [None]:

# Generate five continuations of a fixed prompt with GPT-2 using nucleus (top-p)
# sampling, print them, and log them to Weights & Biases as a table.

# Load the pretrained GPT-2 tokenizer and language model (later cells reuse both).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define an input prompt
prompt = "In a world where AI has become ubiquitous,"

# Tokenize the prompt. Calling the tokenizer (instead of `encode`) also returns the
# attention mask that `generate` warns about when it is missing.
encoded_prompt = tokenizer(prompt, return_tensors='pt')
input_ids = encoded_prompt['input_ids']  # kept under this name: later cells read it
attention_mask = encoded_prompt['attention_mask']

# Use the run as a context manager so it is always closed, even if generation or
# logging raises part-way through the cell.
with wandb.init(project="content_generation", name="nucleus_sampling") as run:
    # Seed the RNGs so the sampled continuations are reproducible across re-runs.
    set_seed(42)

    # Generate text output with nucleus sampling
    nucleus_outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Same id the library falls back to for GPT-2; passing it explicitly
        # silences the "pad token id was not set" warning.
        pad_token_id=tokenizer.eos_token_id,
        max_length=100,
        do_sample=True,
        top_p=0.92,  # Set p for nucleus sampling
        # Switch off top-k filtering (the generation config documents a default of
        # 50) so that only the top-p nucleus restricts the candidate tokens.
        top_k=0,
        num_return_sequences=5,  # How many outputs to generate
    )

    # Decode each sequence, echo it, and collect one (prompt, text) row per output.
    table_rows = []
    for i, output in enumerate(nucleus_outputs):
        decoded_output = tokenizer.decode(output, skip_special_tokens=True)
        table_rows.append([prompt, decoded_output])
        print(f"Output {i+1}:\n{decoded_output}\n")
        print("-" * 140, "\n")

    # Convert the generated text into a structured format suitable for a W&B table
    table = wandb.Table(data=table_rows, columns=["Prompt", "Generated Text"])

    # Log the table to W&B; the run is finished automatically when the block exits.
    run.log({"Generated Content": table})


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output 1:
In a world where AI has become ubiquitous, we'll have more and more tools to help humans with difficult tasks. And there are lots of ways we can automate things.

I think the current state of AI is in a much better position than it was a few years ago. The most important aspect of AI now is the ability to build a complex social network. As we become more mobile, we can build more sophisticated, more collaborative systems. We can learn new skills, increase the productivity of

-------------------------------------------------------------------------------------------------------------------------------------------- 

Output 2:
In a world where AI has become ubiquitous, AI is almost irrelevant. Artificial intelligence is only a half-way point from being irrelevant.

The question is, who gets to decide who can have their voices heard? What will be the impact on the world's citizens' lives? Who, who's going to be part of this new tech and who will take it away? It's really just a

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Text generation with beam search

In [None]:
# Generate five continuations of `prompt` with beam search, print them, and log
# them to Weights & Biases as a table. Relies on `tokenizer`, `model` and `prompt`
# from the nucleus-sampling cell.

# Re-encode the prompt so this cell has the attention mask `generate` expects
# without depending on anything beyond `tokenizer` and `prompt`.
encoded_prompt = tokenizer(prompt, return_tensors='pt')

# Use the run as a context manager so it is always closed, even if generation or
# logging raises part-way through the cell.
with wandb.init(project="content_generation", name="beam_search") as run:
    beam_outputs = model.generate(
        encoded_prompt['input_ids'],
        attention_mask=encoded_prompt['attention_mask'],
        # Same id the library falls back to for GPT-2; passing it explicitly
        # silences the "pad token id was not set" warning.
        pad_token_id=tokenizer.eos_token_id,
        max_length=100,
        num_beams=5,
        num_return_sequences=5,  # must not exceed num_beams
        no_repeat_ngram_size=2,  # forbid any 2-gram from appearing twice
        early_stopping=True,
    )

    # Decode each beam, echo it, and collect one (prompt, text) row per output.
    table_rows = []
    for i, output in enumerate(beam_outputs):
        decoded_output = tokenizer.decode(output, skip_special_tokens=True)
        table_rows.append([prompt, decoded_output])
        print(f"Output {i+1}:\n{decoded_output}\n")
        print("-" * 140, "\n")

    # Convert the generated text into a structured format suitable for a W&B table
    table = wandb.Table(data=table_rows, columns=["Prompt", "Generated Text"])

    # Log the table to W&B; the run is finished automatically when the block exits.
    run.log({"Generated Content": table})


[34m[1mwandb[0m: Currently logged in as: [33mnicolepcx[0m. Use [1m`wandb login --relogin`[0m to force relogin


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output 1:
In a world where AI has become ubiquitous, it's hard to imagine a better time to be a part of it than right now.

"I think we're going to see a lot more of that in the next few years," he said.

-------------------------------------------------------------------------------------------------------------------------------------------- 

Output 2:
In a world where AI has become ubiquitous, it's hard to imagine a better time to be a part of it than right now.

"I think we're going to see a lot more of that in the next few years," he says.

-------------------------------------------------------------------------------------------------------------------------------------------- 

Output 3:
In a world where AI has become ubiquitous, it's hard to imagine a better time to be a part of it than right now.

"I think we're going to see a lot more of that in the next few years," he said. "It's a very exciting time."

----------------------------------------------------------------------

VBox(children=(Label(value='0.003 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.658530…

# Text generation with top-k sampling

In [None]:
# Generate five continuations of `prompt` with top-k sampling, print them, and log
# them to Weights & Biases as a table. Relies on `tokenizer`, `model` and `prompt`
# from the nucleus-sampling cell.

# Re-encode the prompt so this cell has the attention mask `generate` expects
# without depending on anything beyond `tokenizer` and `prompt`.
encoded_prompt = tokenizer(prompt, return_tensors='pt')

# Use the run as a context manager so it is always closed, even if generation or
# logging raises part-way through the cell.
with wandb.init(project="content_generation", name="top_k_sampling") as run:
    # Seed the RNGs so the sampled continuations are reproducible across re-runs.
    set_seed(42)

    # Generate text output with top k
    top_k_outputs = model.generate(
        encoded_prompt['input_ids'],
        attention_mask=encoded_prompt['attention_mask'],
        # Same id the library falls back to for GPT-2; passing it explicitly
        # silences the "pad token id was not set" warning.
        pad_token_id=tokenizer.eos_token_id,
        max_length=100,
        num_return_sequences=5,
        do_sample=True,
        top_k=50,  # sample only from the 50 most likely next tokens
        temperature=1.5,  # >1 flattens the distribution, giving more varied text
    )

    # Decode each sample, echo it, and collect one (prompt, text) row per output.
    table_rows = []
    for i, output in enumerate(top_k_outputs):
        decoded_output = tokenizer.decode(output, skip_special_tokens=True)
        table_rows.append([prompt, decoded_output])
        print(f"Output {i+1}:\n{decoded_output}\n")
        print("-" * 140, "\n")

    # Convert the generated text into a structured format suitable for a W&B table
    table = wandb.Table(data=table_rows, columns=["Prompt", "Generated Text"])

    # Log the table to W&B; the run is finished automatically when the block exits.
    run.log({"Generated Content": table})


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output 1:
In a world where AI has become ubiquitous, the problem has largely evolved into a social problem. What does it require to be humanlike at an industrial level to overcome this?

We humans can make an impact on the world of computers. As an automaker, IBM's IBM Technology. These two companies build Watson software specifically so that our jobs are at a level suitable to our use and human experience. These capabilities allow us more freedom and creativity in how to engage employees with human-machine interactions

-------------------------------------------------------------------------------------------------------------------------------------------- 

Output 2:
In a world where AI has become ubiquitous, we have reached saturation for every AI, which allows the human mind to be more complex. Today is the biggest example. AI makes it possible to automate any job in real world. If your main work involves programming in a project – which was a main objective of AI – when someone 

VBox(children=(Label(value='0.004 MB of 0.007 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.594464…

# Statistical parity difference with AIF360

In [None]:
# Measure the statistical parity difference of a sentiment classifier's
# predictions between two age groups, using a five-row toy dataset.

# Sample data
data = {
    'review': ["I loved the movie!", "The movie was terrible.", "It was okay.", "One of the best movies ever!", "Not my cup of tea."],
    'age': [25, 31, 29, 28, 35]
}

df = pd.DataFrame(data)

# Load a sentiment analysis pipeline from HuggingFace. The checkpoint and revision
# are pinned to the values the pipeline reported defaulting to when none were
# given, so predictions no longer depend on the library's current default.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

# Predict sentiment for all reviews in one batched call (instead of one pipeline
# call per row) and encode POSITIVE as 1, anything else as 0.
predictions = sentiment_pipeline(df['review'].tolist())
df['predicted_sentiment'] = [1 if pred['label'] == 'POSITIVE' else 0 for pred in predictions]

# Label age as privileged (1) if <=30, else unprivileged (0)
df['age_group'] = (df['age'] <= 30).astype(int)

# Prepare dataframe for aif360 by keeping only the relevant (numeric) columns
aif360_df = df[['age_group', 'predicted_sentiment']]

# Create a BinaryLabelDataset: the predicted sentiment is the label and a positive
# prediction (1) is the favorable outcome.
dataset = BinaryLabelDataset(favorable_label=1, unfavorable_label=0,
                             df=aif360_df, label_names=['predicted_sentiment'],
                             protected_attribute_names=['age_group'])

# Calculate fairness metric. A negative value means the unprivileged group (age > 30)
# receives the favorable label less often than the privileged group.
# NOTE(review): with only five rows the figure is illustrative, not meaningful.
metric = BinaryLabelDatasetMetric(dataset, unprivileged_groups=[{'age_group': 0}], privileged_groups=[{'age_group': 1}])
print("Statistical Parity Difference:", metric.statistical_parity_difference())


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Statistical Parity Difference: -1.0
