In [1]:
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from flair.models import TextClassifier
from flair.data import Sentence
from datetime import datetime, timedelta, timezone
from GoogleNews import GoogleNews
from newspaper import Article
import requests
import nltk
from fake_useragent import UserAgent
import ssl

In [12]:
# Shared English Snowball stemmer; smart_tokenize_and_preprocess() calls its stem().
stemmer = EnglishStemmer()
# Shared flair text classifier loaded from the 'en-sentiment' model;
# flair_prediction() calls its predict().
# NOTE(review): the name suggests NLTK's SentimentIntensityAnalyzer, but this
# is a flair TextClassifier. load() presumably fetches the model on first use
# -- confirm before running offline.
sia = TextClassifier.load('en-sentiment')

def smart_tokenize_and_preprocess(text):
    """Split ``text`` into word tokens and return their lowercased stems.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with one stemmed, lowercased entry per token, in token order.
    """
    stems = []
    for raw_token in word_tokenize(text):
        # Lowercase first, then stem with the module-level stemmer.
        stems.append(stemmer.stem(raw_token.lower()))
    return stems

def flair_prediction(x):
    """Classify the sentiment of ``x`` with the module-level flair model ``sia``.

    Args:
        x: Raw text to classify.

    Returns:
        ``"POSITIVE"`` or ``"NEGATIVE"`` when the first predicted label has
        that value; ``"NEUTRAL"`` for any other value or when the model
        attached no label at all.
    """
    sentence = Sentence(x)
    sia.predict(sentence)

    labels = sentence.labels
    if not labels:
        # Nothing was predicted (e.g. empty input): report neutral instead of
        # failing with an IndexError on labels[0].
        return "NEUTRAL"

    # Compare the label's own value. The string form of a label may embed the
    # sentence text, so a substring test on it can be fooled by an article
    # that merely contains the word "POSITIVE" or "NEGATIVE".
    value = labels[0].value
    if value == "POSITIVE":
        return "POSITIVE"
    if value == "NEGATIVE":
        return "NEGATIVE"
    return "NEUTRAL"

In [2]:
# Search terms: get_news() queries Google News once per entry and also uses
# the entries as case-sensitive substrings to decide whether a downloaded
# article is relevant.
keywords = [
    'xrp',
    'XRP',
    'Ripple Price Prediction',
    'XRP prediction',
]

In [3]:
# If this Python build exposes ssl._create_unverified_context, install it as
# the default HTTPS context factory; presumably a workaround for certificate
# errors during the NLTK download below.
# NOTE(review): this replaces the process-wide default HTTPS context with one
# that skips certificate verification, so it likely affects more than the
# NLTK download -- confirm this is acceptable, or fix the local certificate
# store instead.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context factory available: leave the default untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Download the 'punkt' NLTK package (presumably required by word_tokenize and
# by newspaper's Article.nlp() -- confirm).
nltk.download('punkt')
# Shared fake_useragent instance; get_news() reads user_agent.random to build
# its User-Agent request header.
user_agent = UserAgent() 


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nicolasasmann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:

def _with_scheme(link):
    """Return ``link`` unchanged if it already has an http(s) scheme, else prefix ``https://``."""
    if link.lower().startswith(("http://", "https://")):
        return link
    return f"https://{link}"


def get_news(period, path):
    """Scrape Google News for the module-level ``keywords``, score and save the articles.

    For every keyword the Google News results are fetched, each result link is
    resolved (following redirects), and the article is downloaded and parsed.
    Articles whose text contains at least one keyword (case-sensitive) are
    stored and classified with ``flair_prediction``; a POSITIVE verdict adds 1
    to the score and a NEGATIVE verdict subtracts 1.

    Args:
        period: Look-back period passed to ``GoogleNews.set_period``
            (e.g. ``"2h"`` or ``"3d"``).
        path: File path the collected articles are written to (UTF-8,
            overwritten if it exists). Entries are separated by two blank
            lines and kept in scrape order.

    Returns:
        The mean score over all scored articles (a float between -1 and 1),
        or ``None`` when no article could be scored.
    """
    headers = {
        'User-Agent': user_agent.random,
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }
    googlenews = GoogleNews(lang="en")
    googlenews.set_period(period)
    print("Scraping started.")

    score = 0
    index = 0             # number of articles that were scored
    seen_titles = set()   # titles already handled (same story found via several keywords)
    scraped = set()       # resolved URLs that were processed successfully
    data_to_save = []

    for key in keywords:
        googlenews.get_news(key=key)
        for article in googlenews.result():
            title = article["title"]
            if title in seen_titles:
                continue
            seen_titles.add(title)
            published_date = article["datetime"]

            link = article["link"]
            try:
                # Resolve redirects so duplicates are detected on the final
                # URL. Only add a scheme when the link does not carry one.
                response = requests.get(_with_scheme(link), timeout=15, headers=headers)
                url = response.url
            except Exception as exc:
                # Best effort: one unreachable source must not abort the run.
                print(f"Skipping {link}: {exc}")
                continue

            if url in scraped:
                continue

            news_article = Article(url=url, language="en")
            try:
                news_article.download()
                news_article.parse()
                news_article.nlp()
                if any(c_key in news_article.text for c_key in keywords):
                    text = news_article.text.replace("\n\n", "\n")
                    data_to_save.append(f"{published_date}\n{news_article.title}\n{text}")

                    verdict = flair_prediction(text)
                    if verdict == "POSITIVE":
                        score += 1
                    elif verdict == "NEGATIVE":
                        score -= 1
                    print(verdict)

                    index += 1
                    print(f"Sources scraped: {index}")
                scraped.add(url)
            except Exception as exc:
                # Best effort: report the failure and move on to the next article.
                print(f"Skipping {url}: {exc}")

    # Drop exact duplicates while keeping scrape order, so the saved file is
    # deterministic across runs.
    data_to_save = list(dict.fromkeys(data_to_save))

    with open(path, "w", encoding="utf-8") as f:
        f.write("\n\n\n".join(data_to_save))

    print("RESULTS")
    if index == 0:
        # Nothing was scored: avoid dividing by zero.
        print("Score Results: n/a (no sources scraped)")
        return None

    score = score / index
    print(f"Score Results: {score}")
    return score
    

In [19]:
# Ask where to write the scraped articles. get_news() opens this path in "w"
# mode, so an existing file at that location is overwritten.
path_txt = input("Path to save data to [.txt]: ")
# Despite its name, `date` holds the look-back period string that get_news()
# hands to GoogleNews.set_period().
# NOTE(review): interactive input makes a Restart-and-Run-All non-reproducible;
# consider constants in a config cell.
date = input("Period of news, e.g. [2h] = 2hours or [3d] = 3days: ")
# Scrape the news, print one verdict per article, and write the articles to path_txt.
get_news(period=date, path=path_txt)
print(f"Data saved to {path_txt}")

Scraping started.
NEGATIVE
Sources scraped: 1
NEGATIVE
Sources scraped: 2
POSITIVE
Sources scraped: 3
NEGATIVE
Sources scraped: 4
NEGATIVE
Sources scraped: 5
NEGATIVE
Sources scraped: 6
NEGATIVE
Sources scraped: 7
NEGATIVE
Sources scraped: 8
NEGATIVE
Sources scraped: 9
NEGATIVE
Sources scraped: 10
NEGATIVE
Sources scraped: 11
NEGATIVE
Sources scraped: 12
POSITIVE
Sources scraped: 13
NEGATIVE
Sources scraped: 14
NEGATIVE
Sources scraped: 15
NEGATIVE
Sources scraped: 16
POSITIVE
Sources scraped: 17
NEGATIVE
Sources scraped: 18
POSITIVE
Sources scraped: 19
POSITIVE
Sources scraped: 20
NEGATIVE
Sources scraped: 21
NEGATIVE
Sources scraped: 22
POSITIVE
Sources scraped: 23
NEGATIVE
Sources scraped: 24
NEGATIVE
Sources scraped: 25
NEGATIVE
Sources scraped: 26
NEGATIVE
Sources scraped: 27
POSITIVE
Sources scraped: 28
NEGATIVE
Sources scraped: 29
NEGATIVE
Sources scraped: 30
POSITIVE
Sources scraped: 31
NEGATIVE
Sources scraped: 32
POSITIVE
Sources scraped: 33
POSITIVE
Sources scraped: 34
NEGAT