In [4]:
# Initial imports
import os
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from newsapi.newsapi_client import NewsApiClient
analyzer = SentimentIntensityAnalyzer()

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, reuters
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from wordcloud import WordCloud
import re
from nltk.corpus import reuters
%matplotlib inline
lemmatizer = WordNetLemmatizer()

In [None]:
from nltk.corpus import stopwords, reuters
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import re
import matplotlib.pyplot as plt

# Fetch the NLTK resources this notebook relies on, one after another
import nltk

for resource_name in ('reuters', 'wordnet', 'stopwords', 'punkt'):
    nltk.download(resource_name)

# Single lemmatizer instance reused by the cells below
lemmatizer = WordNetLemmatizer()

In [5]:
# Read your api key environment variable
# (load_dotenv copies entries from a local .env file into os.environ)
load_dotenv()

# Fail early with an actionable message when the key is absent or empty,
# instead of a bare KeyError('news_api') or an unauthenticated client.
if not os.environ.get("news_api"):
    raise KeyError(
        "Environment variable 'news_api' is not set; add it to your .env file "
        "or export it before running this notebook."
    )

# Create a newsapi client
newsapi = NewsApiClient(api_key=os.environ["news_api"])

In [None]:
# Query window shared by the Bitcoin and Ethereum requests.
# The bounds are midnight New York time, converted to UTC and written as
# YYYY-MM-DDTHH:MM:SS. News API documents only YYYY-MM-DD and
# YYYY-MM-DDTHH:MM:SS for `from` / `to`, so the "-05:00" offset that
# isoformat() appends is dropped here.
# NOTE(review): a naive date-time is presumably interpreted as UTC by the API - confirm.
NEWSAPI_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
current_date = (
    pd.Timestamp("2020-12-07", tz="America/New_York")
    .tz_convert("UTC")
    .strftime(NEWSAPI_DATETIME_FORMAT)
)
past_date = (
    pd.Timestamp("2020-11-08", tz="America/New_York")
    .tz_convert("UTC")
    .strftime(NEWSAPI_DATETIME_FORMAT)
)


# Fetch all the news about Bitcoin
BTC_headlines = newsapi.get_everything(
    q="bitcoin",
    from_param=str(past_date),
    to=str(current_date),
    language="en",
    page_size=100,
    sort_by="relevancy"
)

# Print total articles
print(f"Total articles about BTC: {BTC_headlines['totalResults']}")

# Show sample article (nothing is displayed when the query returned no articles)
BTC_headlines["articles"][0] if BTC_headlines["articles"] else None

In [7]:
# Fetch all the news about Ethereum
def _to_newsapi_datetime(value):
    """Format a date-like value as a ``YYYY-MM-DDTHH:MM:SS`` string.

    Timezone-aware inputs are converted to UTC and the offset is dropped;
    naive inputs are formatted unchanged. News API documents only
    ``YYYY-MM-DD`` and ``YYYY-MM-DDTHH:MM:SS`` for its date parameters.
    """
    stamp = pd.Timestamp(value)
    if stamp.tzinfo is not None:
        stamp = stamp.tz_convert("UTC").tz_localize(None)
    return stamp.strftime("%Y-%m-%dT%H:%M:%S")


# `past_date` / `current_date` come from the Bitcoin cell. They are normalised
# here because an offset-bearing string (e.g. "...T00:00:00-05:00") is outside
# the documented format.
# NOTE(review): this is the presumed cause of the "request may be malformed"
# error this cell produced - confirm against the API response.
ETH_headlines = newsapi.get_everything(
    q="ethereum",
    from_param=_to_newsapi_datetime(past_date),
    to=_to_newsapi_datetime(current_date),
    language="en",
    page_size=100,
    sort_by="relevancy"
)

# Print total articles
print(f"Total articles about ETH: {ETH_headlines['totalResults']}")

# Show sample article (nothing is displayed when the query returned no articles)
ETH_headlines["articles"][0] if ETH_headlines["articles"] else None

NewsAPIException: {'status': 'error', 'code': 'unexpectedError', 'message': 'Something went wrong. Your request may be malformed - please check the params and try again.'}

In [89]:
# Create the Bitcoin sentiment scores DataFrame
# One record per article: publication date, the four VADER scores, and the text.
BTC_sentiments = []

for article in BTC_headlines["articles"]:
    text = article["content"]

    # Articles whose content is missing (not a string) cannot be scored.
    # Skip them explicitly instead of relying on a swallowed AttributeError,
    # so unrelated attribute errors are no longer hidden.
    if not isinstance(text, str):
        continue

    sentiment = analyzer.polarity_scores(text)

    BTC_sentiments.append({
        "text": text,
        # Keep only the YYYY-MM-DD part of the publication timestamp
        "date": article["publishedAt"][:10],
        "compound": sentiment["compound"],
        "positive": sentiment["pos"],
        "negative": sentiment["neg"],
        "neutral": sentiment["neu"],
    })

# Create DataFrame with the columns in display order. Passing `columns` also
# yields an empty frame with the expected columns when no article was scored,
# where selecting the columns afterwards would raise KeyError.
cols = ["date", "compound", "negative", "neutral", "positive", "text"]
BTC_df = pd.DataFrame(BTC_sentiments, columns=cols)
BTC_df.head()

Unnamed: 0,date,compound,negative,neutral,positive,text
0,2020-11-12,-0.6705,0.199,0.737,0.064,A former Microsoft software engineer from Ukra...
1,2020-12-03,0.6369,0.0,0.838,0.162,Visa has partnered with cryptocurrency startup...
2,2020-11-12,0.2144,0.0,0.947,0.053,PayPal is bringing its newly-announced support...
3,2020-11-20,0.2023,0.0,0.95,0.05,"In November 2017, after an absolutely massive,..."
4,2020-12-06,0.0,0.0,1.0,0.0,"Unlike ‘conventional’ cryptocurrencies, a cent..."


In [90]:
# Create the Ethereum sentiment scores DataFrame
# One record per article: publication date, the four VADER scores, and the text.
ETH_sentiments = []

for article in ETH_headlines["articles"]:
    text = article["content"]

    # Articles whose content is missing (not a string) cannot be scored.
    # Skip them explicitly instead of relying on a swallowed AttributeError,
    # so unrelated attribute errors are no longer hidden.
    if not isinstance(text, str):
        continue

    sentiment = analyzer.polarity_scores(text)

    ETH_sentiments.append({
        "text": text,
        # Keep only the YYYY-MM-DD part of the publication timestamp
        "date": article["publishedAt"][:10],
        "compound": sentiment["compound"],
        "positive": sentiment["pos"],
        "negative": sentiment["neg"],
        "neutral": sentiment["neu"],
    })

# Create DataFrame with the columns in display order. Passing `columns` also
# yields an empty frame with the expected columns when no article was scored,
# where selecting the columns afterwards would raise KeyError.
cols = ["date", "compound", "negative", "neutral", "positive", "text"]
ETH_df = pd.DataFrame(ETH_sentiments, columns=cols)
ETH_df.head()

Unnamed: 0,date,compound,negative,neutral,positive,text
0,2020-11-12,0.2144,0.0,0.947,0.053,PayPal is bringing its newly-announced support...
1,2020-11-23,0.0,0.0,1.0,0.0,FILE PHOTO: Representation of the Ethereum vir...
2,2020-11-23,0.0,0.0,1.0,0.0,FILE PHOTO: Representation of the Ethereum vir...
3,2020-11-23,0.4215,0.0,0.912,0.088,LONDON (Reuters) - Digital currencies Ethereum...
4,2020-11-19,0.8779,0.0,0.682,0.318,"PayPal has launched the Generosity Network, a ..."


In [91]:
# Describe the Bitcoin Sentiment: summary statistics (count, mean, std,
# min, quartiles, max) for the numeric score columns of BTC_df.
BTC_df.describe()

Unnamed: 0,compound,negative,neutral,positive
count,98.0,98.0,98.0,98.0
mean,0.160311,0.018786,0.923347,0.057867
std,0.334972,0.041537,0.07804,0.067627
min,-0.6705,0.0,0.682,0.0
25%,0.0,0.0,0.8585,0.0
50%,0.0,0.0,0.9455,0.05
75%,0.435675,0.0,1.0,0.12175
max,0.8779,0.215,1.0,0.318


In [92]:
# Describe the Ethereum Sentiment: summary statistics (count, mean, std,
# min, quartiles, max) for the numeric score columns of ETH_df.
ETH_df.describe()

Unnamed: 0,compound,negative,neutral,positive
count,96.0,96.0,96.0,96.0
mean,0.220124,0.022,0.902687,0.075323
std,0.372832,0.045732,0.088909,0.079855
min,-0.6705,0.0,0.653,0.0
25%,0.0,0.0,0.84975,0.0
50%,0.20835,0.0,0.913,0.074
75%,0.5106,0.0,1.0,0.1255
max,0.8834,0.196,1.0,0.347


In [10]:
# Written answers to the three sentiment comparison questions.
# Each question/answer pair is one argument; `sep` supplies the line break
# that separate print calls would otherwise add between them.
print(
    'Q: Which coin had the highest mean positive score?\n'
    'A:Surprisingly Ethereum had the higher mean positive of the two.\n',
    'Q: Which coin had the highest compound score?\n'
    'A:Again, I am surprised Ethereum had the higher compound score of the two.\n',
    'Q: Which coin had the highest positive score?\n'
    'A:Once again, Ethereum had the higher positive score of the two.',
    sep='\n',
)

Q: Which coin had the highest mean positive score?
A:Surprisingly Ethereum had the higher mean positive of the two.

Q: Which coin had the highest compound score?
A:Again, I am surprised Ethereum had the higher compound score of the two.

Q: Which coin had the highest positive score?
A:Once again, Ethereum had the higher positive score of the two.


In [98]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenize one piece of text into cleaned, lemmatized words.

    The text is split into sentences and then words. Each word is
    lowercased and stripped of every character outside ``a-z`` (punctuation,
    digits, and also non-ASCII letters). Words left empty and English stop
    words are dropped, and the remaining words are lemmatized.

    Parameters
    ----------
    text : str
        The raw text of a single article.

    Returns
    -------
    list of str
        The cleaned tokens in their original order. An empty list is
        returned when `text` is not a string (e.g. missing content).
    """
    # Missing or non-text content has nothing to tokenize.
    if not isinstance(text, str):
        return []

    # Create a list of the words, tokenizing one sentence at a time
    words = []
    for sentence in sent_tokenize(text):
        words.extend(word_tokenize(sentence))

    # Convert the words to lowercase
    words = [word.lower() for word in words]

    # Remove the punctuation (any non-letter character); tokens that were
    # made up only of such characters become empty and are discarded.
    non_letters = re.compile("[^a-z]")
    words = [non_letters.sub("", word) for word in words]
    words = [word for word in words if word]

    # Remove the stop words (a set gives constant-time membership checks)
    sw = set(stopwords.words('english'))
    words = [word for word in words if word not in sw]

    # Lemmatize Words into root words
    tokens = [lemmatizer.lemmatize(word) for word in words]

    return tokens

NameError: name 'words' is not defined

In [None]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenize one piece of text into cleaned, lemmatized words.

    Works on the `text` argument only (no notebook-level frames are read).
    The text is split into sentences and then words. Every word is
    lowercased and stripped of all characters outside ``a-z`` (punctuation,
    digits, and also non-ASCII letters). Empty results and English stop
    words are skipped, and whatever remains is lemmatized.

    Parameters
    ----------
    text : str
        The raw text of a single article.

    Returns
    -------
    list of str
        The cleaned tokens in their original order. An empty list is
        returned when `text` is not a string (e.g. missing content).
    """
    # Missing or non-text content has nothing to tokenize.
    if not isinstance(text, str):
        return []

    # Built once per call rather than once per word.
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-z]")

    tokens = []
    # sentence tokenize the text, then word tokenize each sentence
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            # Convert the word to lowercase and remove the punctuation
            cleaned = regex.sub('', word.lower())

            # Remove the stop words (and tokens that were only punctuation)
            if not cleaned or cleaned in sw:
                continue

            # Lemmatize Words into root words
            tokens.append(lemmatizer.lemmatize(cleaned))

    return tokens

In [None]:
# Add a `tokens` column holding the tokenizer output for each article's text.
# (`word_tokenized` only ever existed as a local inside `tokenizer`, so it
# cannot be referenced from this cell.)
BTC_df['tokens'] = BTC_df['text'].apply(tokenizer)