In [1]:
! pip install bs4
! pip install google
! pip install scrapy
! pip install nltk



In [2]:
import requests
import re
import os
import itertools
from bs4 import BeautifulSoup
from urllib.parse import urlparse

In [3]:
# query = input()
query = "What recent news state about possible energy price developments over the next three months?"

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import nltk
from collections import Counter
from dateutil.relativedelta import relativedelta
from datetime import datetime

nltk.download('punkt')
nltk.download('stopwords')

keywords_amount = 5
current_date = datetime.now()

# if f"next {x} month"

parsed_date = current_date + relativedelta(months=2)
formatted_date = parsed_date.strftime("%B %Y")

print(formatted_date)


def remove_special_characters(queryText):
    pattern = r'[^a-zA-Z0-9\s]'
    cleaned_string = re.sub(pattern,'',queryText)
    return cleaned_string


def prioritize_keywords(t, field_keywords):
    keywords_with_weights = {}

    for keyword in field_keywords:
        if keyword in freq_dist:
            keywords_with_weights[keyword] = freq_dist[keyword]*50

    return keywords_with_weights


text = remove_special_characters(query)

tokens = word_tokenize(text)

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

freq_dist = FreqDist(filtered_tokens)

most_common = dict(freq_dist.most_common(keywords_amount))

print(most_common)


field_keywords = ['sustainable', 'stainless steel', 'recyclable', 'patent', 'energy', 'building', 'infrastructure', 'restructuring', 'measures', 'responsible', 'steel', 'efficient', 'carbon', 'clean', 'waste', 'green', 'environmental', 'innovation', 'ethical', 'materials', 'raw', 'industry', 'price', 'range', 'investment', 'money', 'inflation', 'trading', 'news']

keywords_with_weights = prioritize_keywords(remove_special_characters(query), field_keywords)
print(keywords_with_weights)

combined_keywords = dict(Counter(most_common) + Counter(keywords_with_weights))
print('ck', combined_keywords)

sorted_ck = dict(Counter(dict(sorted(combined_keywords.items(), key=lambda x: x[1], reverse=True))).most_common(keywords_amount))
print(sorted_ck)

keywordList = []

for keyword in sorted_ck:
    keywordList.append(keyword)

print(keywordList)

January 2024
{'recent': 1, 'news': 1, 'state': 1, 'possible': 1, 'energy': 1}
{'energy': 50, 'price': 50, 'news': 50}
ck {'recent': 1, 'news': 51, 'state': 1, 'possible': 1, 'energy': 51, 'price': 50}
{'news': 51, 'energy': 51, 'price': 50, 'recent': 1, 'state': 1}
['news', 'energy', 'price', 'recent', 'state']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
try:
    from googlesearch import search
except ImportError:
    print("No module named 'google' found")

generator = search(query, tld="co.in", num=6, stop=6, pause=2)
urls = []

for j in generator:
    urls.append(j)
    print(j)

index = 0
print(urls)

https://www.eia.gov/outlooks/steo/
https://www.eia.gov/outlooks/steo/report/electricity.php
https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817
https://www.bls.gov/news.release/ppi.nr0.htm
https://www.imf.org/en/Publications/fandd/issues/2022/12/bumps-in-the-energy-transition-yergin
https://www.weforum.org/agenda/2023/10/renewable-energy-stocks-are-struggling-plus-other-top-energy-stories-this-week/
['https://www.eia.gov/outlooks/steo/', 'https://www.eia.gov/outlooks/steo/report/electricity.php', 'https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817', 'https://www.bls.gov/news.release/ppi.nr0.htm', 'https://www.imf.org/en/Publications/fandd/issues/2022/12/bumps-in-the-energy-transition-yergin', 'https://www.weforum.org/agenda/2023/10/renewable-energy-stocks-are-struggling-plus-other-top-energy-stories-this-week/']


In [6]:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

website_names = [urlparse(url).netloc[4:] for url in urls]

print(website_names)

['eia.gov', 'eia.gov', 'nbcnews.com', 'bls.gov', 'imf.org', 'weforum.org']


In [7]:
from pathlib import Path

class MySpider(CrawlSpider):
    name = 'JunctionCrawling'
    allowed_domains = website_names
    start_urls = urls
    allowed = keywordList
    max_pages_per_website = 3
    crawled_pages_per_website = {}
    current_index = 0

    rules = (
        Rule(LinkExtractor(allow=allowed), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        website_name = urlparse(response.url).netloc[4:]
        print(website_name)

        try:
            _, _, files = next(os.walk("./" + website_name))
        except:
            self.crawled_pages_per_website[website_name] = 0
        else:
            self.crawled_pages_per_website[website_name] = len(files)
        print(self.crawled_pages_per_website[website_name])

        if self.crawled_pages_per_website[website_name] > self.max_pages_per_website:
            pass
        else:
            if all(count >= self.max_pages_per_website for count in self.crawled_pages_per_website.values()):
                self.crawler.engine.close_spider(self, 'Reached maximum pages per website')

            Path("./" + website_name).mkdir(parents=True, exist_ok=True)
            filename = "" + response.url.split('/')[-2] + '.html'
            file_path = "./" + website_name + "/" + filename
            body_content = response.xpath('//body').get()
            with open(file_path, 'w') as f:
                f.write(body_content)
                if website_name not in self.crawled_pages_per_website:
                    self.crawled_pages_per_website[website_name] = 1
                else:
                    self.crawled_pages_per_website[website_name] += 1




In [8]:
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(
    # settings={'CLOSESPIDER_PAGECOUNT': 100,'USER_AGENT': 'my-cool-project'}
    settings={'CLOSESPIDER_TIMEOUT': 20, 'USER_AGENT': 'SustainabLLM'}
)

INFO:scrapy.utils.log:Scrapy 2.11.0 started (bot: scrapybot)
2023-11-12 00:45:12 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.5, Platform Linux-5.15.120+-x86_64-with-glibc2.35
2023-11-12 00:45:12 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.5, Platform Linux-5.15.120+-x86_64-with-glibc2.35


In [9]:
process.crawl(MySpider)
process.start()

INFO:scrapy.addons:Enabled addons:
[]
2023-11-12 00:45:12 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)
DEBUG:scrapy.utils.log:Using reactor: twisted.internet.epollreactor.EPollReactor
2023-11-12 00:45:12 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
INFO:scrapy.extensions.telnet:Telnet Password: 284ee60458295c2f
2023-11-12 00:45:12 [scrapy.extensions.telnet] INFO: Telnet Password: 284ee60458295c2f
INFO:scrapy.middleware:Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.closespider.CloseSpider',
 'scrapy.extensions.logstats.LogStats']
2023-11-12 00:45:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scra

bls.gov
0
imf.org
0


2023-11-12 00:45:13 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'apnews.com': <GET https://apnews.com/article/asean-grid-renewable-energy-asia-40dcc02e27b130fc014e2b44fbf6aeb1>
DEBUG:scrapy.spidermiddlewares.offsite:Filtered offsite request to 'www.top1000funds.com': <GET https://www.top1000funds.com/2023/09/new-energy-system-not-transition-needed-to-reach-net-zero/>
2023-11-12 00:45:13 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.top1000funds.com': <GET https://www.top1000funds.com/2023/09/new-energy-system-not-transition-needed-to-reach-net-zero/>
DEBUG:scrapy.spidermiddlewares.offsite:Filtered offsite request to 'www.bloomberg.com': <GET https://www.bloomberg.com/news/articles/2023-10-03/france-pitches-plan-to-end-nuclear-energy-deadlock-with-germany>
2023-11-12 00:45:13 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.bloomberg.com': <GET https://www.bloomberg.com/news/articles/2023-10-03/france-pitch

bls.gov
1


DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.bls.gov/schedule/2023/home.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.bls.gov/schedule/2023/home.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.bls.gov/schedule/2023/11_sched.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.bls.gov/schedule/2023/11_sched.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)


imf.org
1
bls.gov
1


DEBUG:scrapy.downloadermiddlewares.redirect:Redirecting (301) to <GET https://www.bls.gov/cgi-bin/print.pl/news.release/ppi.nr0.htm> from <GET https://data.bls.gov/cgi-bin/print.pl/news.release/ppi.nr0.htm>
2023-11-12 00:45:14 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.bls.gov/cgi-bin/print.pl/news.release/ppi.nr0.htm> from <GET https://data.bls.gov/cgi-bin/print.pl/news.release/ppi.nr0.htm>
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.bls.gov/charts/producer-price-index/> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.bls.gov/charts/producer-price-index/> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.nbcnews.com/information/nbc-news-info/contact-us-n1232521> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)
2023-11-12 00:45:14 [scr

bls.gov
2


DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.eia.gov/todayinenergy/detail.php?id=60622&src=email> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)
2023-11-12 00:45:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.eia.gov/todayinenergy/detail.php?id=60622&src=email> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)


bls.gov
2
imf.org
2


INFO:scrapy.core.engine:Closing spider (Reached maximum pages per website)
2023-11-12 00:45:14 [scrapy.core.engine] INFO: Closing spider (Reached maximum pages per website)


bls.gov
3
nbcnews.com
0


DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.bls.gov/news.release/ppi.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.bls.gov/news.release/ppi.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.bls.gov/news.release/ppi.t01.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.bls.gov/news.release/ppi.t01.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.bls.gov/news.release/ppi.t03.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.bls.gov/news.release/ppi.t03.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.eia.gov

bls.gov
4
bls.gov
4


DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.bls.gov/news.release/ppi.t02.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.bls.gov/news.release/ppi.t02.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
DEBUG:scrapy.core.engine:Crawled (404) <GET https://www.bls.gov/cgi-bin/print.pl/news.release/ppi.nr0.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)
2023-11-12 00:45:15 [scrapy.core.engine] DEBUG: Crawled (404) <GET https://www.bls.gov/cgi-bin/print.pl/news.release/ppi.nr0.htm> (referer: https://www.bls.gov/news.release/ppi.nr0.htm)


bls.gov
4
nbcnews.com
1
eia.gov
0
bls.gov
4


DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.nbcnews.com/nextstepsforveterans> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)
2023-11-12 00:45:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.nbcnews.com/nextstepsforveterans> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.weforum.org/agenda/2023/09/iea-clean-energy-investment-global-warming/#:~:text=IEA%3A%20Clean%20energy%20investment%20must,World%20Economic%20Forum> (referer: https://www.weforum.org/agenda/2023/10/renewable-energy-stocks-are-struggling-plus-other-top-energy-stories-this-week/)
2023-11-12 00:45:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.weforum.org/agenda/2023/09/iea-clean-energy-investment-global-warming/#:~:text=IEA%3A%20Clean%20energy%20investment%20must,World%20Economic%20Forum> (referer: https://www.weforum.or

bls.gov
4
bls.gov
4
eia.gov
1
bls.gov
4


DEBUG:scrapy.downloadermiddlewares.redirect:Redirecting (301) to <GET https://www.nbcnews.com/information/nbc-news-info/about-nbc-news-digital-n1232178> from <GET https://www.nbcnews.com/pages/about-nbc-news-digital>
2023-11-12 00:45:17 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.nbcnews.com/information/nbc-news-info/about-nbc-news-digital-n1232178> from <GET https://www.nbcnews.com/pages/about-nbc-news-digital>
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.nbcnews.com/news/investigations/us-investigating-whether-iran-gave-advanced-training-hamas-militants-rcna119824> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)
2023-11-12 00:45:17 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.nbcnews.com/news/investigations/us-investigating-whether-iran-gave-advanced-training-hamas-militants-rcna119824> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-pri

nbcnews.com
2
weforum.org
0


DEBUG:scrapy.downloadermiddlewares.redirect:Redirecting (301) to <GET https://www.bls.gov/cgi-bin/print.pl/news.release/ppi.toc.htm> from <GET https://data.bls.gov/cgi-bin/print.pl/news.release/ppi.toc.htm>
2023-11-12 00:45:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.bls.gov/cgi-bin/print.pl/news.release/ppi.toc.htm> from <GET https://data.bls.gov/cgi-bin/print.pl/news.release/ppi.toc.htm>


nbcnews.com
2
nbcnews.com
3
nbcnews.com
4


DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.nbcnews.com/archive> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)
2023-11-12 00:45:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.nbcnews.com/archive> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-prices-oil-rcna119817)
DEBUG:scrapy.downloadermiddlewares.redirect:Redirecting (301) to <GET https://www.nbcnews.com/information/nbc-news-info/closed-captioning-n1307063> from <GET https://www.nbcnews.com/info/closed-captioning>
2023-11-12 00:45:18 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.nbcnews.com/information/nbc-news-info/closed-captioning-n1307063> from <GET https://www.nbcnews.com/info/closed-captioning>
DEBUG:scrapy.core.engine:Crawled (200) <GET https://www.nbcnews.com/Israel-Hamas-war-Gaza-Strip-conflict> (referer: https://www.nbcnews.com/business/business-news/israel-hamas-war-gas-pr

nbcnews.com
4
nbcnews.com
4


INFO:scrapy.statscollectors:Dumping Scrapy stats:
{'downloader/request_bytes': 15997,
 'downloader/request_count': 41,
 'downloader/request_method_count/GET': 41,
 'downloader/response_bytes': 2094159,
 'downloader/response_count': 41,
 'downloader/response_status_count/200': 34,
 'downloader/response_status_count/301': 4,
 'downloader/response_status_count/302': 2,
 'downloader/response_status_count/404': 1,
 'dupefilter/filtered': 1033,
 'elapsed_time_seconds': 7.669788,
 'finish_reason': 'Reached maximum pages per website',
 'finish_time': datetime.datetime(2023, 11, 12, 0, 45, 19, 902956, tzinfo=datetime.timezone.utc),
 'httpcompression/response_bytes': 10456022,
 'httpcompression/response_count': 34,
 'log_count/DEBUG': 64,
 'log_count/INFO': 10,
 'memusage/max': 217989120,
 'memusage/startup': 217989120,
 'offsite/domains': 21,
 'offsite/filtered': 92,
 'request_depth_max': 2,
 'response_received_count': 34,
 'scheduler/dequeued': 41,
 'scheduler/dequeued/memory': 41,
 'scheduler

In [10]:
# rm *.html

In [11]:
# !rm -rf */

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# folder_names = os.listdir('./')
best_index_scores = {}
best_website_page = {}
best_text = ""

for website_name in set(website_names):
    fileStrings = []
    try:
        for filename in os.listdir('./' + website_name): # iterate over all files of given website
            f = os.path.join('./' + website_name, filename)
            # checking if it is a file
            if os.path.isfile(f):
                with open(f,'r') as htmlBody:
                    content = htmlBody.read()

                soup = BeautifulSoup(content, 'html.parser')
                tags = soup.find_all(['p', 'span'])

                text = ''.join([tag.get_text() for tag in tags])

                fileStrings.append(text)

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(fileStrings)

        keyword_matrix = vectorizer.transform(keywordList)

        similarity_scores = cosine_similarity(keyword_matrix, tfidf_matrix, dense_output=True)

        best_match = np.mean(similarity_scores, axis=0)

        best_index = best_match.argmax() # index for the best page within website
        best_score = best_match.max()

        best_website_page[website_name] = fileStrings[best_index]
        best_text = best_text + best_website_page[website_name]

        best_index_scores[website_name] = best_score
    except:
        print(f'{website_name} was not scraped')

try:
    # best_text = best_website_page[max(best_index_scores, key=best_index_scores.get)]
    print(best_text)
except:
    best_text = ""
    print('error')




The IMF Press Center is a password-protected site for working journalists.
DANIEL YERGIN

                                    December 2022
                                Credit: Yevhen Lahunov | Istock 5 min(1403 words)Read
Download PDF 
Despite a growing global consensus, obstacles to reducing net carbon emissions to zero are stark  The global disruptions in energy markets and the war in Ukraine have added impetus to the push for renewable energy and the drive toward net-zero carbon emissions. Yet, even as the global consensus around the energy transition becomes stronger, the challenges to that transition are also becoming clearer. In addition to the uncertain pace of technological development and deployment, four issues in particular stand out:The need for energy security was a concern that had largely faded over the past several years. The energy shock, the economic hardship that ensued, skyrocketing energy prices that could not have been imagined 18 months ago, and geopolitical 

In [13]:
! pip install transformers
! pip install torch



In [14]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "sshleifer/distilbart-cnn-6-6"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("summarization", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

DEBUG:jaxlib.mlir._mlir_libs:Initializing MLIR with module: _site_initialize_0
2023-11-12 00:45:54 [jaxlib.mlir._mlir_libs] DEBUG: Initializing MLIR with module: _site_initialize_0
DEBUG:jaxlib.mlir._mlir_libs:Registering dialects from initializer <module 'jaxlib.mlir._mlir_libs._site_initialize_0' from '/usr/local/lib/python3.10/dist-packages/jaxlib/mlir/_mlir_libs/_site_initialize_0.so'>
2023-11-12 00:45:54 [jaxlib.mlir._mlir_libs] DEBUG: Registering dialects from initializer <module 'jaxlib.mlir._mlir_libs._site_initialize_0' from '/usr/local/lib/python3.10/dist-packages/jaxlib/mlir/_mlir_libs/_site_initialize_0.so'>
DEBUG:jax._src.path:etils.epath found. Using etils.epath for file I/O.
2023-11-12 00:45:54 [jax._src.path] DEBUG: etils.epath found. Using etils.epath for file I/O.
Level 1:tensorflow:Registering FakeQuantWithMinMaxArgs (<function _FakeQuantWithMinMaxArgsGradient at 0x78c3d2654940>) in gradient.
Level 1:tensorflow:Registering FakeQuantWithMinMaxVars (<function _FakeQuan

In [15]:
import torch

ARTICLE_TO_SUMMARIZE = (best_text)
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=20, max_length=20)
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'Despite a growing global consensus, obstacles to reducing carbon emissions are stark . Yevhen'

In [17]:
import requests

API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-6-6"
headers = {"Authorization": "Bearer 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"}

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()

output = query({
	"inputs": best_text,
})

print('-----------------')
print(output[0]['summary_text'])

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api-inference.huggingface.co:443
2023-11-12 00:46:44 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): api-inference.huggingface.co:443
DEBUG:urllib3.connectionpool:https://api-inference.huggingface.co:443 "POST /models/sshleifer/distilbart-cnn-6-6 HTTP/1.1" 200 None
2023-11-12 00:46:48 [urllib3.connectionpool] DEBUG: https://api-inference.huggingface.co:443 "POST /models/sshleifer/distilbart-cnn-6-6 HTTP/1.1" 200 None


-----------------
Despite a growing global consensus, obstacles to reducing net carbon emissions to zero are stark . Yevhen Lahunov: The need for energy security was a concern that had largely faded over the past several years . The current global energy crisis did not start with the February 2022 invasion of Ukraine, he says .
