In [1]:
!pip install -q transformers sentencepiece newspaper3k

[K     |████████████████████████████████| 3.4 MB 5.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 54.3 MB/s 
[K     |████████████████████████████████| 211 kB 57.0 MB/s 
[K     |████████████████████████████████| 61 kB 433 kB/s 
[K     |████████████████████████████████| 3.3 MB 30.2 MB/s 
[K     |████████████████████████████████| 596 kB 34.0 MB/s 
[K     |████████████████████████████████| 895 kB 56.1 MB/s 
[K     |████████████████████████████████| 7.4 MB 35.6 MB/s 
[K     |████████████████████████████████| 81 kB 8.1 MB/s 
[K     |████████████████████████████████| 87 kB 5.7 MB/s 
[?25h  Building wheel for tinysegmenter (setup.py) ... [?25l[?25hdone
  Building wheel for feedfinder2 (setup.py) ... [?25l[?25hdone
  Building wheel for jieba3k (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [2]:
import newspaper
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import datetime

In [3]:
# Build the news source for CNN's international edition (English),
# with article memoization disabled.
CNN_News = newspaper.build(
    'https://edition.cnn.com/',
    language='en',
    memoize_articles=False,
)

In [4]:
# Perform scraping on this month's articles only (top 10 articles, for example)

# Upper bound on how many articles are collected
MAX_ARTICLES = 10

# Get today's date
today = datetime.datetime.now()

# Each entry is a list: [text, publish_date, url]
articles = []

# Loop over all the articles
for article in CNN_News.articles:
  # Stop as soon as enough articles have been collected
  if len(articles) >= MAX_ARTICLES:
    break

  try:
    article.download()
    article.parse()
  except newspaper.article.ArticleException as err:
    # One unreachable or unparsable page must not abort the whole crawl
    print("Skipping", article.url, "-", err)
    continue

  published = article.publish_date
  # Ignore articles without a publication date or without any extracted text
  if published is None or not article.text:
    continue

  # Keep only articles published in the current month of the current year
  if (published.year, published.month) == (today.year, today.month):
    # Adding the article, its publication date, and its url to our list of articles
    articles.append([article.text, published, article.url])

In [5]:
# Text summarization using the bart-large-cnn model from huggingface

# Loading the pretrained model and its tokenizer
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Longest input, in tokens, covered by the model's position embeddings
max_input_tokens = model.config.max_position_embeddings

for article in articles:
  # Drop anything after [text, publish_date, url] so that re-running this cell
  # always leaves the fresh summary at index 3
  del article[3:]
  # Pass the article's original text, truncated by tokens (not characters) so the
  # model sees as much of the article as it can actually encode
  inputs = tokenizer([article[0]], max_length=max_input_tokens, truncation=True, return_tensors='pt')
  # Generate summary
  summary_ids = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], early_stopping=True)
  # A single input yields a single sequence; decode it back to text
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  # Adding the summary to the list of each article
  article.append(summary)

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [6]:
# Machine translation of each article's summary using mbart-large-50-many-to-many-mmt from huggingface

# Loading the pretrained model and its tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# The language pair is the same for every article, so configure it once.
# The target-language attribute of the tokenizer is `tgt_lang`.
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "ar_AR"
# Token id that forces generation to start in Arabic
arabic_bos_id = tokenizer.lang_code_to_id["ar_AR"]
# Longest input, in tokens, covered by the model's position embeddings
max_input_tokens = model.config.max_position_embeddings

for article in articles:
  # Drop anything after [text, publish_date, url, summary] so that re-running this
  # cell always leaves the fresh translation at index 4
  del article[4:]
  # Pass the article's text summary, truncated by tokens rather than characters
  encoded_ar = tokenizer(article[3], max_length=max_input_tokens, truncation=True, return_tensors="pt")
  # Generate translation
  generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=arabic_bos_id)
  translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
  # Adding the translation to the list of each article
  article.append(translation)

Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/649 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/529 [00:00<?, ?B/s]

In [7]:
# Show one scraped article together with everything derived from it.
# Positions in each entry: 0 = text, 2 = url, 3 = summary, 4 = translation.
sample = articles[7]
for label, position in (
    ("The article is :", 0),
    ("The article url is :", 2),
    ("The article summary is :", 3),
    ("The article summary translation is :", 4),
):
  print(label, sample[position])

The article is : (CNN) Wisconsin men's basketball player Chris Vogt was traveling to play in a crucial matchup against Ohio State last Saturday when devastation ripped through his hometown of Mayfield, Kentucky.

"I was just stunned," Vogt told CNN Sports of seeing the extensive damage caused by the tornadoes that struck his home state late Friday.

A town of roughly 10,000 people, Mayfield was decimated by the twister, a storm that flattened homes and took the lives of at least 80 people in Kentucky.

"It's tough. I mean, this is my childhood. This is where I grew up. A lot of my friends still live here. I still call this place home," he continued.

"No one ever expects something like this to happen to their hometown."

Read More
The article url is : https://www.cnn.com/2021/12/18/sport/wisconsin-basketball-player-chris-vogt-raises-funds-spt-intl/index.html
The article summary is : Wisconsin men's basketball player Chris Vogt was traveling to play in a crucial matchup against Ohio Sta