<a href="https://colab.research.google.com/github/RealAdon/Databrix-LLM101x-Chapter-1.8/blob/main/app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Optional setup: uncomment and run these on a fresh runtime where the
# packages are not installed yet. `datasets` and `transformers` are imported
# by this notebook; sentencepiece is presumably required by the tokenizers of
# the models loaded further down (TODO confirm).
# !pip install datasets
# !pip install transformers
# !pip install sentencepiece

In [2]:
import os

import pandas as pd

from datasets import load_dataset
from transformers import pipeline

In [3]:
# Mount Google Drive at /content/drive.
# `google.colab` is only importable inside a Colab runtime, so the import is
# guarded: anywhere else the mount is skipped instead of aborting "Run All"
# with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab; there is nothing to mount

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Folder (relative to the working directory) that is handed to the dataset and
# model loaders below as their download cache.
cache_dir = "./cache"

### Download the dataset

In [5]:
# Load the XSum dataset (version 1.2.0), using `cache_dir` as the download folder.
xsum_dataset = load_dataset("xsum", version="1.2.0", cache_dir=cache_dir)

# Last expression of the cell: show the dataset's splits.
xsum_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [6]:
# Work on a small sample: the first 10 rows of the training split.
train_split = xsum_dataset["train"]
xsum_sample = train_split.select(range(10))

# Render the sample as a pandas table.
display(xsum_sample.to_pandas())

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


### Summarization

In [7]:
# Load pipeline
# Summarization pipeline backed by t5-small.
# min_length / max_length bound the generated summary, truncation=True cuts
# over-long inputs, and the model weights are cached under `cache_dir`.
summarizer = pipeline(
    "summarization",
    model="t5-small",
    min_length=20,
    max_length=35,
    truncation=True,
    model_kwargs={"cache_dir": cache_dir},
)

In [8]:
# Summarize a single article: the row at index 1 of the sample.
single_article = xsum_sample[1]['document']
summarizer(single_article)

[{'summary_text': 'a fire alarm went off at the Holiday Inn in Hope Street on Saturday . guests were asked to leave the hotel . the two buses have organised replacement coaches'}]

In [9]:
# Summarize every article of the sample in one call.
sample_documents = xsum_sample['document']
summarizer(sample_documents)

[{'summary_text': 'the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . a flood alert'},
 {'summary_text': 'a fire alarm went off at the Holiday Inn in Hope Street on Saturday . guests were asked to leave the hotel . the two buses have organised replacement coaches'},
 {'summary_text': 'Sebastian Vettel will start third ahead of team-mate Kimi Raikkonen . stewards only handed Hamilton a reprimand after '},
 {'summary_text': 'the 67-year-old is accused of committing the offences between March 1972 and October 1989 . he denies all the charges, including two counts'},
 {'summary_text': 'a man receiving psychiatric treatment at the clinic threatened to shoot himself and others . the incident comes amid tension in Istanbul following several attacks in crowded'},
 {'summary_text': 'Gregor Townsend gave a debut to powerhouse wing Taqele Naiyaravoro . the dragons gave first starts of the season'},
 {'summary_te

### Sentiment Analysis

In [10]:
# Load the poem_sentiment dataset (version 1.0.0), using `cache_dir` as the download folder.
poem_dataset = load_dataset("poem_sentiment", version="1.0.0", cache_dir=cache_dir)

In [11]:
# Work on a small sample: the first 10 rows of the training split.
poem_train_split = poem_dataset["train"]
poem_sample = poem_train_split.select(range(10))

# Render the sample as a pandas table.
display(poem_sample.to_pandas())

Unnamed: 0,id,verse_text,label
0,0,with pale blue berries. in these peaceful shad...,1
1,1,"it flows so long as falls the rain,",2
2,2,"and that is why, the lonesome day,",0
3,3,"when i peruse the conquered fame of heroes, an...",3
4,4,of inward strife for truth and liberty.,3
5,5,the red sword sealed their vows!,3
6,6,and very venus of a pipe.,2
7,7,"who the man, who, called a brother.",2
8,8,"and so on. then a worthless gaud or two,",0
9,9,to hide the orb of truth--and every throne,2


In [12]:
# Text-classification pipeline using a BERT checkpoint named for poem sentiment;
# the model weights are cached under `cache_dir`.
sentiment_classifier = pipeline(
    "text-classification",
    model="nickwong64/bert-base-uncased-poems-sentiment",
    model_kwargs={"cache_dir": cache_dir},
)

In [13]:
# Run the classifier over the verses of the sample.
sample_verses = poem_sample['verse_text']
results = sentiment_classifier(sample_verses)

In [14]:
# Show the predicted sentiment next to the ground truth.

# Text names for the integer label indices stored in the dataset.
sentiment_labels = {
    0: 'negative',
    1: 'positive',
    2: 'no_impact',
    3: 'mixed',
}

# Classifier output, with its `label` column renamed so it cannot clash with
# the dataset's own `label` column in the join below.
predictions = pd.DataFrame.from_dict(results).rename(columns={'label': 'predicted_label'})

# Ground-truth rows of the sample, with `label` renamed for the same reason.
ground_truth = pd.DataFrame(poem_sample).rename(columns={'label': 'true_label'})

# Join on the row index, then replace the numeric true labels by their names.
joined_data = predictions.join(ground_truth)
joined_data = joined_data.replace({'true_label': sentiment_labels})

columns_to_show = ['predicted_label', 'true_label', 'score', 'verse_text']
display(joined_data[columns_to_show])

Unnamed: 0,predicted_label,true_label,score,verse_text
0,positive,positive,0.996594,with pale blue berries. in these peaceful shad...
1,no_impact,no_impact,0.998741,"it flows so long as falls the rain,"
2,negative,negative,0.995966,"and that is why, the lonesome day,"
3,mixed,mixed,0.968735,"when i peruse the conquered fame of heroes, an..."
4,mixed,mixed,0.975967,of inward strife for truth and liberty.
5,mixed,mixed,0.96658,the red sword sealed their vows!
6,no_impact,no_impact,0.998639,and very venus of a pipe.
7,no_impact,no_impact,0.998611,"who the man, who, called a brother."
8,negative,negative,0.996557,"and so on. then a worthless gaud or two,"
9,no_impact,no_impact,0.998519,to hide the orb of truth--and every throne


### Translation

In [15]:
# Translation pipeline using the Helsinki-NLP/opus-mt-en-es checkpoint;
# the model weights are cached under `cache_dir`.
en_to_es_pipeline = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-en-es",
    model_kwargs={"cache_dir": cache_dir},
)



In [16]:
# Translate one English sentence with the en->es pipeline.
english_sentence = 'I would really like to figure out what the shortest path to the trainstation is.'
en_to_es_pipeline(english_sentence)

[{'translation_text': 'Realmente me gustaría averiguar cuál es el camino más corto a la estación de tren.'}]

### Multi-language model: T5

In [17]:
# Generic text-to-text pipeline on t5-small; the task is chosen by the prompt
# text passed in each call. max_length=50 caps the generated output and the
# model weights are cached under `cache_dir`.
t5_small_pipeline = pipeline(
    "text2text-generation",
    model="t5-small",
    max_length=50,
    model_kwargs={"cache_dir": cache_dir},
)

In [18]:
# T5 selects its task from the literal prompt prefix, and its tokenizer is
# case-sensitive, so the prefix must be spelled exactly
# "translate English to French: " (the original had the typo "ENglish").
t5_small_pipeline("translate English to French: I don't know I am a tourist")

[{'generated_text': 'Je ne sais pas que je suis touriste.'}]

In [19]:
# Same pipeline, different task prefix: English -> German.
german_prompt = "translate English to German: Why is there straw lying here?"
t5_small_pipeline(german_prompt)

[{'generated_text': 'Warum liegt hier Stroh?'}]

### Zero-shot learning

In [20]:
# Zero-shot classification pipeline using the
# cross-encoder/nli-deberta-v3-small checkpoint; the model weights are cached
# under `cache_dir`.
zero_shot_pipeline = pipeline(
    "zero-shot-classification",
    model="cross-encoder/nli-deberta-v3-small",
    model_kwargs={"cache_dir": cache_dir},
)



In [21]:
def categorize_article(article: str, candidate_labels: "list[str] | None" = None) -> None:
  """
  Classify an article with `zero_shot_pipeline` and display the label scores.

  Note that the model was NOT fine-tuned to use these specific labels,
  but it "knows" what the labels mean from its more general training.

  Args:
    article: Text of the article to categorize.
    candidate_labels: Categories (labels) the model must choose between.
      When omitted, the notebook's six default news categories are used.

  Returns:
    None. A table with one row per label and its score is rendered via
    `display`.
  """
  if candidate_labels is None:
    candidate_labels = ['politics', 'finance', 'sports', 'science and technology', 'pop culture', 'breaking news']

  results = zero_shot_pipeline(
      article,
      candidate_labels=candidate_labels
  )

  # The result also carries the input text under "sequence". Leave it out of
  # the table by building a filtered copy, so the object returned by the
  # pipeline is not mutated.
  label_scores = pd.DataFrame(
      {key: value for key, value in results.items() if key != "sequence"}
  )
  display(label_scores)

In [22]:
# Sample article (named after the 'breaking news' candidate label) passed to
# categorize_article, which displays a table of labels and scores for it.
breaking_news_article='''
A bus has burst into flames on Sydney's famed Harbour Bridge, sparking chaos among morning commuters.

Emergency services rushed to extinguish the blaze reportedly began shortly after 8.30am on Wednesday.

NSW Police said the fire had since been extinguished, with motorists urged to continue to expect delays.
'''
categorize_article(breaking_news_article)

Unnamed: 0,labels,scores
0,breaking news,0.583204
1,politics,0.115529
2,sports,0.095644
3,pop culture,0.079336
4,finance,0.063435
5,science and technology,0.062853


In [23]:
# Longer sample article (named after the 'politics' candidate label) passed to
# categorize_article, which displays a table of labels and scores for it.
politics_article='''
Jacinta Allan will become Victoria's 49th premier following the shock resignation of Daniel Andrews.
Allan was chosen for the role at a Labor caucus meeting at state parliament this afternoon.
Public Transport Minister Ben Carroll has been selected as the deputy premier after a late push from the Labor right faction.
READ MORE: Daniel Andrews' journey from anonymity to premier
Jacinta Allan addressed media shortly after being elected by Labor to be the next Victorian premier, with deputy Ben Carroll at her side.

Allan addressed media outside parliament shortly after the caucus meeting ended.
"I have been endorsed unanimously by my caucus colleagues to replace the fabulous Daniel Andrews as leader of the Victorian parliamentary Labor Party, and in turn follow in his footsteps as premier of Victoria," she said.
"It is such a deep honour and privilege to be in the position to be heading to Government House and to be sworn in as premier of Victoria.
"And can I say, 24 years ago, almost to the day, when I walked into this place as a much younger woman from regional Victoria, I never expected to have this length of service or indeed to be able to have had the honour and privilege of serving the Victorian community in various ministerial roles."
Allan took the opportunity to praise Andrews.
"There's been lots written in just the last 24 hours about Daniel and Daniel Andrews's leadership, his strength, his courage, his determination," she said.
"He's been driven by his values every single day he has served the Victorian community."
READ MORE: 'Nothing stuck to Teflon Dan during his reign of spin and sniping … Now Victoria needs a new start'
Jacinta Allan exits the caucus room after being elected unopposed.

Jacinta Allan has been the long-term deputy to Daniel Andrews.
Jacinta Allan has long been seen as the successor to Daniel Andrews. (Joe Armao/The Age)
Allan said both she and Carroll had been raised by ﻿"working people" and vowed to continue to work hard for Victorians.
"I pledge to the Victorian community that I will continue to work hard as I've done every single day as a Member of Parliament," she said.
"But I also come to this role with the opportunity to continue the strong reform agenda and also chart a future course for the state and in coming days and weeks I will have more to say about policy priorities that I will be bringing."
'''
categorize_article(politics_article)

Unnamed: 0,labels,scores
0,politics,0.25616
1,breaking news,0.170782
2,finance,0.147551
3,science and technology,0.14599
4,pop culture,0.145451
5,sports,0.134067


### Few-shot learning

In [24]:
# Text-generation pipeline on EleutherAI/gpt-neo-1.3B for the few-shot demos.
# max_new_tokens=10 keeps each completion short; the model weights are cached
# under `cache_dir`.
few_shot_pipeline = pipeline(
    "text-generation",
    model="EleutherAI/gpt-neo-1.3B",
    max_new_tokens=10,
    model_kwargs={"cache_dir": cache_dir},
)

In [25]:
# Get the token ID for "###", which we will use as the EOS token below.
# `tokenizer.encode` returns a list of token IDs; take the first element so
# that `eos_token_id` really is a single integer ID, as its name says.
eos_token_id = few_shot_pipeline.tokenizer.encode("###")[0]

In [26]:
# Zero examples in the prompt: the model output is inconsistent and usually incorrect.
no_example_prompt = """
    For each tweet, describe its sentiment:

    [Tweet]: "This new music video was incredible"
    [Sentiment]:"""

# Generation stops early if the "###" token is produced.
results = few_shot_pipeline(no_example_prompt, eos_token_id=eos_token_id)
print(results[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:21017 for open-end generation.



    For each tweet, describe its sentiment:

    [Tweet]: "This new music video was incredible"
    [Sentiment]: "Really cute"

    If the


In [27]:
# One worked example in the prompt: the model may or may not get the answer right.
one_example_prompt = """
    For each tweet, describe its sentiment:

    [Tweet]: "This is the link to the article."
    [Sentiment]: Neutral
    ###
    [Tweet]: "This new music video was incredible."
    [Sentiment]:"""

# Generation stops early if the "###" token is produced.
results = few_shot_pipeline(one_example_prompt, eos_token_id=eos_token_id)
print(results[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:21017 for open-end generation.



    For each tweet, describe its sentiment:

    [Tweet]: "This is the link to the article."
    [Sentiment]: Neutral
    ###
    [Tweet]: "This new music video was incredible."
    [Sentiment]: Positive
    ###
   


In [29]:
# One worked example per sentiment in the prompt: the model is more likely to understand the task.
three_example_prompt = """
    For each tweet, describe its sentiment:
    [Tweet]: "I hate it when my phone battery dies."
    [Sentiment]: Negative
    ###
    [Tweet]: "This is the link to the article."
    [Sentiment]: Neutral
    ###
    [Tweet]: "My day has been great."
    [Sentiment]: Positive
    ###
    [Tweet]: "This new music video was incredible."
    [Sentiment]:"""

# Generation stops early if the "###" token is produced.
results = few_shot_pipeline(three_example_prompt, eos_token_id=eos_token_id)
print(results[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:21017 for open-end generation.



    For each tweet, describe its sentiment:
    [Tweet]: "I hate it when my phone battery dies."
    [Sentiment]: Negative
    ###
    [Tweet]: "This is the link to the article."
    [Sentiment]: Neutral
    ###
    [Tweet]: "My day has been great."
    [Sentiment]: Positive
    ###
    [Tweet]: "This new music video was incredible."
    [Sentiment]: Neutral
    ###
   
