In [33]:
# Third-party packages this notebook relies on. The installs are left commented
# out; uncomment them on a fresh runtime where the packages are not available
# yet, and run this cell before any other cell.
# NOTE(review): the ValueError recorded under the Translation cell presumably
# comes from loading that model's tokenizer without `sentencepiece` installed --
# confirm, and make sure the install happens before the pipeline is built.
# !pip install datasets
# !pip install transformers
# !pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.3 MB[0m [31m15.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [35]:
import os

import pandas as pd

from datasets import load_dataset
from transformers import pipeline

In [34]:
# Colab-specific setup: attach Google Drive to the runtime's filesystem.
from google.colab import drive

drive.mount('/content/drive')

# Drive folder used for this chapter's files.
PATH = '/content/drive/MyDrive/Google Colab/Datastax LLM/Chapter 1.18'

Mounted at /content/drive


In [20]:
# Local folder shared by every dataset/model download below, so repeated runs
# can reuse what is already on disk.
cache_dir = "./cache"

### Download the dataset

In [21]:
# Load XSum (version 1.2.0); downloaded files are kept under `cache_dir`.
xsum_dataset = load_dataset("xsum", version="1.2.0", cache_dir=cache_dir)

# Final expression of the cell, so the notebook renders the DatasetDict.
xsum_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [22]:
# Work with just the first ten training articles to keep the demo fast.
xsum_sample = xsum_dataset["train"].select(indices=range(10))

# Render the sample as a pandas DataFrame.
display(xsum_sample.to_pandas())

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


### Summarization

In [23]:
# Build the summarization pipeline on top of the small T5 checkpoint.
summarizer = pipeline(
    "summarization",
    model="t5-small",
    model_kwargs={"cache_dir": cache_dir},  # keep model weights in the shared cache
    truncation=True,
    min_length=20,
    max_length=35,
)

In [24]:
# Summarize a single article: the document text of the sample's second row.
summarizer(xsum_sample[1]["document"])

[{'summary_text': 'a fire alarm went off at the Holiday Inn in Hope Street on Saturday . guests were asked to leave the hotel . the two buses have organised replacement coaches'}]

In [25]:
# Summarize every article in the sample with one call.
summarizer(xsum_sample["document"])

[{'summary_text': 'the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . a flood alert'},
 {'summary_text': 'a fire alarm went off at the Holiday Inn in Hope Street on Saturday . guests were asked to leave the hotel . the two buses have organised replacement coaches'},
 {'summary_text': 'Sebastian Vettel will start third ahead of team-mate Kimi Raikkonen . stewards only handed Hamilton a reprimand after '},
 {'summary_text': 'the 67-year-old is accused of committing the offences between March 1972 and October 1989 . he denies all the charges, including two counts'},
 {'summary_text': 'a man receiving psychiatric treatment at the clinic threatened to shoot himself and others . the incident comes amid tension in Istanbul following several attacks in crowded'},
 {'summary_text': 'Gregor Townsend gave a debut to powerhouse wing Taqele Naiyaravoro . the dragons gave first starts of the season'},
 {'summary_te

### Sentiment Analysis

In [26]:
# Load the poem sentiment dataset (version 1.0.0) into the shared cache folder.
poem_dataset = load_dataset("poem_sentiment", version="1.0.0", cache_dir=cache_dir)

In [27]:
# Take the first ten training verses as a small working sample.
poem_sample = poem_dataset["train"].select(indices=range(10))

# Render the sample as a pandas DataFrame.
display(poem_sample.to_pandas())

Unnamed: 0,id,verse_text,label
0,0,with pale blue berries. in these peaceful shad...,1
1,1,"it flows so long as falls the rain,",2
2,2,"and that is why, the lonesome day,",0
3,3,"when i peruse the conquered fame of heroes, an...",3
4,4,of inward strife for truth and liberty.,3
5,5,the red sword sealed their vows!,3
6,6,and very venus of a pipe.,2
7,7,"who the man, who, called a brother.",2
8,8,"and so on. then a worthless gaud or two,",0
9,9,to hide the orb of truth--and every throne,2


In [28]:
# Text-classification pipeline using a BERT checkpoint fine-tuned on poem sentiment.
sentiment_classifier = pipeline(
    "text-classification",
    model="nickwong64/bert-base-uncased-poems-sentiment",
    model_kwargs={"cache_dir": cache_dir},  # keep model weights in the shared cache
)

In [29]:
# Classify every verse in the sample; the predictions are joined with the
# ground-truth labels in the next cell.
results = sentiment_classifier(poem_sample["verse_text"])

In [30]:
# Compare the predicted sentiment with the ground truth.

# Text names for the dataset's integer label indices (0..3).
sentiment_labels = dict(enumerate(['negative', 'positive', 'no_impact', 'mixed']))

# Put predictions and ground truth side by side (joined on the row index),
# then swap the integer ground-truth labels for their text names.
joined_data = (
    pd.DataFrame(results)
    .rename(columns={'label': 'predicted_label'})
    .join(pd.DataFrame(poem_sample).rename(columns={'label': 'true_label'}))
    .replace({'true_label': sentiment_labels})
)

# Show only the columns relevant to the comparison.
columns_to_show = ['predicted_label', 'true_label', 'score', 'verse_text']
display(joined_data[columns_to_show])

Unnamed: 0,predicted_label,true_label,score,verse_text
0,positive,positive,0.996594,with pale blue berries. in these peaceful shad...
1,no_impact,no_impact,0.998741,"it flows so long as falls the rain,"
2,negative,negative,0.995966,"and that is why, the lonesome day,"
3,mixed,mixed,0.968735,"when i peruse the conquered fame of heroes, an..."
4,mixed,mixed,0.975967,of inward strife for truth and liberty.
5,mixed,mixed,0.96658,the red sword sealed their vows!
6,no_impact,no_impact,0.998639,and very venus of a pipe.
7,no_impact,no_impact,0.998611,"who the man, who, called a brother."
8,negative,negative,0.996557,"and so on. then a worthless gaud or two,"
9,no_impact,no_impact,0.998519,to hide the orb of truth--and every throne


### Translation

In [32]:
# English -> Spanish translation pipeline built on the Helsinki-NLP OPUS-MT model.
# NOTE(review): the recorded run of this cell ended in a ValueError; presumably the
# model's tokenizer needs the `sentencepiece` package installed first -- confirm.
en_to_es_translation_pipeline = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-en-es",
    model_kwargs={"cache_dir": cache_dir},  # keep model weights in the shared cache
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

ValueError: ignored