In [26]:
# pip install --upgrade datasets

In [27]:
from datasets import load_dataset

In [28]:
from transformers import pipeline, set_seed

### Text Summarization

In [29]:
# Load XSum from the repository's `refs/convert/parquet` revision.
xsum_data = load_dataset(path="EdinburghNLP/xsum", revision="refs/convert/parquet")

In [30]:
# Display the loaded DatasetDict (splits, features and row counts).
xsum_data

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [31]:
# Work with the first 10 rows of the training split only.
xsum_sample = xsum_data["train"].select(range(0, 10))

xsum_sample

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 10
})

In [32]:
# Render the sample as a pandas DataFrame for a tabular view.
xsum_sample.to_pandas()

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


In [33]:
# Summarization pipeline on t5-small: summary length is bounded by
# min_length/max_length and input truncation is enabled.
summarization = pipeline(
    'summarization',
    model='t5-small',
    truncation=True,
    min_length=20,
    max_length=40,
)

Device set to use cpu


In [34]:
# Inspect the model object wrapped by the pipeline.
summarization.model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [35]:
# Inspect the tokenizer object wrapped by the pipeline.
summarization.tokenizer

T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_i

In [36]:
# Summarize only the first document of the sample.
summarization(xsum_sample['document'][0])

[{'summary_text': 'the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . the water breached a retaining wall, flooding many commercial properties .'}]

In [37]:
# Summarize every document in the sample in a single call. The whole column is
# passed as a list instead of a hard-coded slice, so the call always matches
# the sample size.
result = summarization(list(xsum_sample["document"]))

Your max_length is set to 200, but your input_length is only 194. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)
Your max_length is set to 200, but your input_length is only 140. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=70)


In [38]:
# Show the summaries currently stored in `result`.
result

[{'summary_text': 'the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . the water breached a retaining wall, flooding many commercial properties .'},
 {'summary_text': 'a fire alarm went off at the Holiday Inn in Hope Street on Saturday . guests were asked to leave the hotel . the two buses were parked side-by-side in the car park .'},
 {'summary_text': 'Sebastian Vettel will start third ahead of team-mate Kimi Raikkonen . stewards only handed Hamilton a reprimand after governing body said "no clear instruction was given on where he should park" Mercedes were wary of Ferrari\'s pace before qualifying .'},
 {'summary_text': 'the 67-year-old is accused of committing the offences between March 1972 and October 1989 . he denies all the charges, including two counts of indecency with a child . the trial is expected to last two weeks .'},
 {'summary_text': 'a man receiving psychiatric treatment at the clinic 

In [39]:
import pandas as pd

In [40]:
# Turn the pipeline output (a list of dicts) into a DataFrame.
generated_summary = pd.DataFrame(data=result)
generated_summary

Unnamed: 0,summary_text
0,the full cost of damage in Newton Stewart is s...
1,a fire alarm went off at the Holiday Inn in Ho...
2,Sebastian Vettel will start third ahead of tea...
3,the 67-year-old is accused of committing the o...
4,a man receiving psychiatric treatment at the c...
5,Gregor Townsend gave a debut to powerhouse win...
6,"Veronica Vanessa Chango-Alverez, 31, was kille..."
7,the 25-year-old was hit by a motorbike during ...
8,gundogan will not be fit for the start of the ...
9,the crash happened about 07:20 GMT at the junc...


In [41]:
# Keep a DataFrame copy of the sample so the generated summaries can be added
# to it as a new column.
sample_df = xsum_sample.to_pandas()
sample_df

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490


In [42]:
# Attach the generated summaries. pandas aligns this assignment on index
# labels, so both frames must share the same index.
sample_df['generated_summary'] = generated_summary['summary_text']

In [43]:
# Show the frame with the added `generated_summary` column.
sample_df

Unnamed: 0,document,summary,id,generated_summary
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142,the full cost of damage in Newton Stewart is s...
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035,a fire alarm went off at the Holiday Inn in Ho...
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548,Sebastian Vettel will start third ahead of tea...
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422,the 67-year-old is accused of committing the o...
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984,a man receiving psychiatric treatment at the c...
5,Simone Favaro got the crucial try with the las...,Defending Pro12 champions Glasgow Warriors bag...,34540833,Gregor Townsend gave a debut to powerhouse win...
6,"Veronica Vanessa Chango-Alverez, 31, was kille...",A man with links to a car that was involved in...,20836172,"Veronica Vanessa Chango-Alverez, 31, was kille..."
7,Belgian cyclist Demoitie died after a collisio...,Welsh cyclist Luke Rowe says changes to the sp...,35932467,the 25-year-old was hit by a motorbike during ...
8,"Gundogan, 26, told BBC Sport he ""can see the f...",Manchester City midfielder Ilkay Gundogan says...,40758845,gundogan will not be fit for the start of the ...
9,The crash happened about 07:20 GMT at the junc...,A jogger has been hit by an unmarked police ca...,30358490,the crash happened about 07:20 GMT at the junc...


In [44]:
# Print the document, its reference summary and the generated summary for one
# row, with a blank line between sections.
idx = 1
row_sections = (
    ("full document \n", 'document'),
    ("original summary \n", 'summary'),
    ("generated summary \n", 'generated_summary'),
)
for position, (heading, column) in enumerate(row_sections):
    if position > 0:
        print()
    print(heading, sample_df[column][idx])

full document 
 A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.
As they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.
One of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.
The driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.
Both groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.
Police have appealed for information about the attack.
Insp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.
"While the exact cause is still under investigation, it is thought that the fire was started deliberately."

original summary 
 Two tourist buses have been destroyed by fire in a suspected arson attack in Belf

### **Sentiment Analysis**

In [45]:
# Load only the training split of the poem sentiment dataset.
poem_dataset = load_dataset(path='poem_sentiment', split='train')

In [46]:
# Text-classification pipeline using the
# `nickwong64/bert-base-uncased-poems-sentiment` checkpoint.
sentiment_classifier = pipeline(
    'text-classification',
    model='nickwong64/bert-base-uncased-poems-sentiment',
)

Device set to use cpu


In [47]:
# Take the first 10 verses of the poem dataset.
poem_sample = poem_dataset.select(range(10))

# NOTE(review): this rebinds `sample_df`, replacing the summarization frame
# built earlier in the notebook; re-running earlier cells afterwards will see
# the poem frame instead.
sample_df=poem_sample.to_pandas()

sample_df

Unnamed: 0,id,verse_text,label
0,0,with pale blue berries. in these peaceful shad...,1
1,1,"it flows so long as falls the rain,",2
2,2,"and that is why, the lonesome day,",0
3,3,"when i peruse the conquered fame of heroes, an...",3
4,4,of inward strife for truth and liberty.,3
5,5,the red sword sealed their vows!,3
6,6,and very venus of a pipe.,2
7,7,"who the man, who, called a brother.",2
8,8,"and so on. then a worthless gaud or two,",0
9,9,to hide the orb of truth--and every throne,2


In [48]:
# Classify every verse in the sample; the pipeline receives a plain list of strings.
result = sentiment_classifier([verse for verse in poem_sample['verse_text']])
result

  return forward_call(*args, **kwargs)


[{'label': 'positive', 'score': 0.9965937733650208},
 {'label': 'no_impact', 'score': 0.9987409710884094},
 {'label': 'negative', 'score': 0.995965838432312},
 {'label': 'mixed', 'score': 0.9687354564666748},
 {'label': 'mixed', 'score': 0.975967526435852},
 {'label': 'mixed', 'score': 0.9665797352790833},
 {'label': 'no_impact', 'score': 0.9986388087272644},
 {'label': 'no_impact', 'score': 0.9986108541488647},
 {'label': 'negative', 'score': 0.9965572357177734},
 {'label': 'no_impact', 'score': 0.9985186457633972}]

In [49]:
# Tabulate the classifier output (one row per verse, with `label` and `score`).
prediction = pd.DataFrame(data=result)

prediction

Unnamed: 0,label,score
0,positive,0.996594
1,no_impact,0.998741
2,negative,0.995966
3,mixed,0.968735
4,mixed,0.975968
5,mixed,0.96658
6,no_impact,0.998639
7,no_impact,0.998611
8,negative,0.996557
9,no_impact,0.998519


In [50]:
# Re-display the poem sample before readable labels are added.
sample_df

Unnamed: 0,id,verse_text,label
0,0,with pale blue berries. in these peaceful shad...,1
1,1,"it flows so long as falls the rain,",2
2,2,"and that is why, the lonesome day,",0
3,3,"when i peruse the conquered fame of heroes, an...",3
4,4,of inward strife for truth and liberty.,3
5,5,the red sword sealed their vows!,3
6,6,and very venus of a pipe.,2
7,7,"who the man, who, called a brother.",2
8,8,"and so on. then a worthless gaud or two,",0
9,9,to hide the orb of truth--and every throne,2


In [51]:
# Count how many verses carry each integer label.
sample_df['label'].value_counts()

label
2    4
3    3
0    2
1    1
Name: count, dtype: int64

In [52]:
# Integer label -> readable name; the list position is the integer label.
sentiment_labels = dict(enumerate(['negative', 'positive', 'no_impact', 'mixed']))

# Translate the integer labels into their names in a new column.
sample_df['new_labels'] = sample_df['label'].map(sentiment_labels)

sample_df

Unnamed: 0,id,verse_text,label,new_labels
0,0,with pale blue berries. in these peaceful shad...,1,positive
1,1,"it flows so long as falls the rain,",2,no_impact
2,2,"and that is why, the lonesome day,",0,negative
3,3,"when i peruse the conquered fame of heroes, an...",3,mixed
4,4,of inward strife for truth and liberty.,3,mixed
5,5,the red sword sealed their vows!,3,mixed
6,6,and very venus of a pipe.,2,no_impact
7,7,"who the man, who, called a brother.",2,no_impact
8,8,"and so on. then a worthless gaud or two,",0,negative
9,9,to hide the orb of truth--and every throne,2,no_impact


In [53]:
# Put the predicted label next to the ground-truth label. pandas aligns this
# assignment on index labels, so both frames must share the same index.
sample_df['prediction'] = prediction['label']

In [54]:
# Compare `new_labels` (ground truth) with `prediction` side by side.
sample_df

Unnamed: 0,id,verse_text,label,new_labels,prediction
0,0,with pale blue berries. in these peaceful shad...,1,positive,positive
1,1,"it flows so long as falls the rain,",2,no_impact,no_impact
2,2,"and that is why, the lonesome day,",0,negative,negative
3,3,"when i peruse the conquered fame of heroes, an...",3,mixed,mixed
4,4,of inward strife for truth and liberty.,3,mixed,mixed
5,5,the red sword sealed their vows!,3,mixed,mixed
6,6,and very venus of a pipe.,2,no_impact,no_impact
7,7,"who the man, who, called a brother.",2,no_impact,no_impact
8,8,"and so on. then a worthless gaud or two,",0,negative,negative
9,9,to hide the orb of truth--and every throne,2,no_impact,no_impact


### **Translation**

In [55]:
# sacremoses is for the translation model `Helsinki-NLP/opus-mt-en-es`
# pip install sacremoses

In [56]:
# pip install transformers[sentencepiece]

In [57]:
# English -> Spanish translation pipeline using the
# `Helsinki-NLP/opus-mt-en-es` checkpoint.
en_to_es_translation_pipeline = pipeline(
    'translation',
    model="Helsinki-NLP/opus-mt-en-es",
)

Device set to use cpu


In [58]:
# Translate a short English greeting to Spanish.
en_to_es_translation_pipeline('Hi, How are you?')

[{'translation_text': 'Hola, ¿cómo estás?'}]

In [59]:
# General text2text pipeline on t5-small, with generation capped by max_length.
t5_small_pipeline = pipeline(
    'text2text-generation',
    model='t5-small',
    max_length=50,
)

Device set to use cpu


In [60]:
# T5 selects its task from a text prefix. The prefix must match the form used
# in training ("translate English to French: "), with no space before the colon.
t5_small_pipeline(
    'translate English to French: hi how are you?'
)

[{'generated_text': 'hi comment êtes-vous ?'}]

In [61]:
# T5 selects its task from a text prefix. The prefix must match the form used
# in training ("translate English to Romanian: "), with no space before the colon.
t5_small_pipeline(
    'translate English to Romanian: hi, how are you?'
)

[{'generated_text': 'oare, cum sunteţi?'}]

In [62]:
# English -> Hindi translation pipeline using the
# `Helsinki-NLP/opus-mt-en-hi` checkpoint.
en_to_hi_translation_pipeline = pipeline(
    'translation',
    model="Helsinki-NLP/opus-mt-en-hi",
)

Device set to use cpu


In [63]:
# Translate a short English greeting to Hindi.
en_to_hi_translation_pipeline('hi, how are you?')

[{'translation_text': 'हाय, तुम कैसे हो?'}]

### **Zero shot classification**

In [64]:
# step 1: load the data, or supply the input text directly
input_text = "sample input text"

In [65]:
# Rugby match report (Edinburgh vs Connacht) used as the first input to the
# zero-shot classifier.
article1 = """
Mark Bennett sealed victory with a dazzling solo try in the dying moments, adding to earlier scores from Blair Kinghorn, Pierre Schoeman, and Ben Vellacott.
Connacht’s points came from tries by Bundee Aki and Mack Hansen, but the Irish side fell just short despite a spirited display.

Edinburgh looked sharper in the second half, overturning a narrow deficit after the break. 
Home coach Sean Everitt welcomed back flanker Hamish Watson from injury, while debutant Argentine winger Santiago Carreras added flair to the backline. 
Connacht handed a first start to scrum-half Colm Reilly and recalled experienced lock Ultan Dillane.

The visitors opened the scoring through a Jack Carty penalty, but Kinghorn’s quick hands sent Vellacott darting over in the corner on 15 minutes. 
Aki responded by powering through two tackles to score for Connacht, with Carty adding the extras. Schoeman restored Edinburgh’s lead with a close-range burst before half-time, making it 12-10.

The second half saw both sides trade blows. Hansen finished a sweeping Connacht move to nudge the Irishmen ahead, but Kinghorn’s accurate boot kept the scoreboard ticking for Edinburgh.

With the match hanging in the balance at 18-17, Bennett produced a moment of magic — sidestepping two defenders and accelerating under the posts for the decisive score, converted by Kinghorn.

Connacht coach Pete Wilkins said: "It’s a tough one to take — we played some fantastic rugby but let it slip at the crucial moments. 
Edinburgh’s accuracy in the final five minutes was the difference."
"""

In [66]:
# step 2: build the pipeline from a task name and a model checkpoint
zero_shot_pipeline = pipeline(
    'zero-shot-classification',
    model='cross-encoder/nli-deberta-v3-small',
)

Device set to use cpu


In [67]:
# step 3: score the input text against a set of candidate topic labels
topic_labels = [
    "politics",
    "finance",
    "sports",
    "science and technology",
    "pop culture",
    "breaking news",
]
zero_shot_pipeline(article1, candidate_labels=topic_labels)

{'sequence': '\nSimone Favaro got the crucial try with the last move of the game, following earlier touchdowns by Chris Fusaro, Zander Fagerson and Junior Bulumakau.\nRynard Landman and Ashton Hewitt got a try in either half for the Dragons.\nGlasgow showed far superior strength in depth as they took control of a messy match in the second period.\nHome coach Gregor Townsend gave a debut to powerhouse Fijian-born Wallaby wing Taqele Naiyaravoro, and centre Alex Dunbar returned from long-term injury, while the Dragons gave first starts of the season to wing Aled Brew and hooker Elliot Dee.\nGlasgow lost hooker Pat McArthur to an early shoulder injury but took advantage of their first pressure when Rory Clegg slotted over a penalty on 12 minutes.\nIt took 24 minutes for a disjointed game to produce a try as Sarel Pretorius sniped from close range and Landman forced his way over for Jason Tovey to convert - although it was the lock\'s last contribution as he departed with a chest injury sh

In [68]:
# Cricket news article (India vs Australia in Chennai) used as the second
# input to the zero-shot classifier.
article2 = """
The full impact of India’s nail-biting win over Australia in Chennai is still being celebrated across the cricketing fraternity.
Ground staff at the MA Chidambaram Stadium are already preparing for the next fixture, while traffic congestion around Chepauk remained heavy well into the night as fans poured out after the thriller.
Some sections of the stadium require urgent refurbishment after the pitch invasion by overjoyed supporters.

Many local businesses, from tea stalls to jersey sellers, saw booming sales during the game.
Indian skipper Rohit Sharma praised the home crowd’s energy, calling it “a twelfth man” that helped the team stay motivated in tense moments.
The match turned dramatically in the final overs when Ravindra Jadeja’s quick-fire 28 and two late wickets sealed victory.

Rahul Mehta, owner of a popular fan café near the stadium, said the atmosphere was unlike anything he had experienced before.
He praised the security arrangements but felt more crowd control measures could have been put in place to prevent the post-match chaos.
“It’s fantastic for business, but we also have to think about safety — when Virat hit that six, people just started jumping over the barriers,” he said.

Meanwhile, cricket fever shows no sign of slowing down, with tickets for the upcoming India–Pakistan clash in Ahmedabad already sold out within minutes.
Former Indian captain Sunil Gavaskar called for better planning for such high-profile matches to ensure more fans can get access.
“The demand is huge, and we need to think beyond traditional stadium capacity — perhaps more fan parks and screening zones,” he suggested.

The Board of Control for Cricket in India (BCCI) has urged fans to buy tickets only through official channels to avoid scams.
Deputy BCCI secretary Rajeev Shukla emphasised that preparations for the Ahmedabad match were on schedule and promised a “festival-like” atmosphere.
He added, “We want to ensure that fans not only enjoy great cricket but also have a safe and smooth experience.”

Have you attended any of India’s recent matches?
Share your stories, photos, and videos with us at cricketfans@bcci.in or tweet using #BleedBlue.
"""

In [69]:
# Score the second article against the same candidate topic labels.
topic_labels = [
    "politics",
    "finance",
    "sports",
    "science and technology",
    "pop culture",
    "breaking news",
]
zero_shot_pipeline(article2, candidate_labels=topic_labels)

 'labels': ['breaking news',
  'politics',
  'pop culture',
  'science and technology',
  'sports',
  'finance'],
 'scores': [0.20821072161197662,
  0.1737898737192154,
  0.17375336587429047,
  0.15718066692352295,
  0.15456236898899078,
  0.1325029581785202]}

### **Search and sampling inference**

In [70]:
# Print the first article of the sample, the text the timing cells below summarize.
print('sample article\n', xsum_sample['document'][0])

sample article
 The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith - and I totally appreciate tha

In [71]:
import time

In [72]:
# Baseline: time the call with no extra generation arguments.
%time summarization(xsum_sample['document'][0])

CPU times: total: 9.55 s
Wall time: 4.89 s


[{'summary_text': 'the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . the water breached a retaining wall, flooding many commercial properties .'}]

In [78]:
# Time the same call with num_beams=100; compare the wall time against the baseline.
%time summarization(xsum_sample['document'][0],num_beams=100)

CPU times: total: 1min 56s
Wall time: 2min


[{'summary_text': 'many businesses and householders were affected by flooding in Newton Stewart . the water breached a retaining wall, flooding many commercial properties . a flood alert remains in place across the Borders because of the constant rain .'}]

In [76]:
%time summarization(xsum_sample['document'][0], do_sample=True)

CPU times: total: 11.3 s
Wall time: 8.4 s


[{'summary_text': 'many businesses and householders were affected by flooding in Newton Stewart . the water breached a retaining wall, flooding many commercial properties . a flood alert remains in place across the Borders because of the constant rain .'}]

In [77]:
%time summarization(xsum_sample['document'][0],do_sample=True, top_k=10, top_p=0.8)

CPU times: total: 13.5 s
Wall time: 7.72 s


[{'summary_text': 'the full cost of damage in Newton Stewart is still being assessed . many roads in peeblesshire remain badly affected by standing water . the water breached a retaining wall, flooding many commercial properties . a flood alert remains in place across the Borders because of constant rain .'}]