# Unit 12 - Tales from the Crypto

---


## 1. Sentiment Analysis

Use the [newsapi](https://newsapi.org/) to pull the latest news articles for Bitcoin and Ethereum and create a DataFrame of sentiment scores for each coin.

Use descriptive statistics to answer the following questions:
1. Which coin had the highest mean positive score?
2. Which coin had the highest negative score?
3. Which coin had the highest positive score?

In [67]:
# Initial imports
import os
import pandas as pd
%matplotlib inline

In [68]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from newsapi.newsapi_client import NewsApiClient
from datetime import date, datetime, timedelta

from nltk.corpus import stopwords, reuters
from nltk.stem import WordNetLemmatizer


In [69]:
# Read your api key environment variable
newsapi = NewsApiClient(api_key=os.getenv('NEWS_API'))

In [70]:
# Create a newsapi client
from newsapi.newsapi_client import NewsApiClient
print(newsapi)

<newsapi.newsapi_client.NewsApiClient object at 0x000000EC20957048>


In [71]:
# Fetch the Bitcoin news articles
bitcoin_headlines = newsapi.get_everything(
    q="Bitcoin",
    language="en",
    page_size=100,
    sort_by="relevancy"
)

# Print total articles
print(f"Total articles about Bitcoin: {bitcoin_headlines['totalResults']}")

# Show sample article
bitcoin_headlines["articles"][0]

TypeError: expected string or bytes-like object

In [None]:
# Create the Bitcoin sentiment scores DataFrame
bitcoin_sentiments = []

for article in bitcoin_headlines["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        bitcoin_sentiments.append({
            "text": text,
            "date": date,
            "compound": compound,
            "positive": pos,
            "negative": neg,
            "neutral": neu
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
bitcoin_df = pd.DataFrame(bitcoin_sentiments)

# Reorder DataFrame columns
cols = ["date", "text", "compound", "positive", "negative", "neutral"]
bitcoin_df = bitcoin_df[cols]

bitcoin_df.tail(5)

In [None]:
# Create the ethereum sentiment scores DataFrame
ethereum_headlines = newsapi.get_everything(
    q="ethereum",
    language="en",
    page_size=100,
    sort_by="relevancy"
)

# Print total articles
print(f"Total articles about ethereum: {ethereum_headlines['totalResults']}")

# Show sample article
ethereum_headlines["articles"][0]

# Create the Bitcoin sentiment scores DataFrame
ethereum_sentiments = []

for article in ethereum_headlines["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        ethereum_sentiments.append({
            "text": text,
            "date": date,
            "compound": compound,
            "positive": pos,
            "negative": neg,
            "neutral": neu
            
        })
        
    except AttributeError:
        pass
    
# Create DataFrame
ethereum_df = pd.DataFrame(ethereum_sentiments)

# Reorder DataFrame columns
cols = ["date", "text", "compound", "positive", "negative", "neutral"]
ethereum_df = ethereum_df[cols]

ethereum_df.head()

In [None]:
# Describe the Bitcoin Sentiment
bitcoin_df.describe()

In [None]:
# Describe the Ethereum Sentiment
eth_sentiment_df.describe()

### Questions:

Q: Which coin had the highest mean positive score?

A: BTC

Q: Which coin had the highest compound score?

A: BTC

Q. Which coin had the highest positive score?

A: BTC

---

## 2. Natural Language Processing
---
###   Tokenizer

In this section, you will use NLTK and Python to tokenize the text for each coin. Be sure to:
1. Lowercase each word.
2. Remove Punctuation.
3. Remove Stopwords.

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [None]:
# Instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

# Create a list of stopwords
# YOUR CODE HERE!

# Expand the default stopwords list if necessary
# YOUR CODE HERE!

In [None]:
# Complete the tokenizer function
def tokenizer(doc):
    # Create a list of the words
    # Convert the words to lowercase
    # Remove the punctuation
    # Remove the stop words
    # Lemmatize Words into root words
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', doc)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return ', '.join(output)
#input_text = tokenizer(doc)
#input_text

In [None]:
# Create a new tokens column for Bitcoin
bitcoin_tokens = []
for i in range (0,100):
    try:
        doc=bitcoin_headlines["articles"][i]
        doc=doc['content']
        if pd.isna(doc):
            print('Empty') 
        else:        
            tokens = tokenizer(doc)
            bitcoin_tokens.append({"token": tokens })
    except AttributeError:
        pass

# Create DataFrame
bitcoin_df_tokens = pd.DataFrame(bitcoin_tokens)                    
bitcoin_df_comb=pd.concat([bitcoin_df, bitcoin_df_tokens],axis="columns", join="inner")

# Reorder DataFrame columns
cols = ["date", "compound", "positive", "negative", "neutral", "text", "token"]
bitcoin_df_comb = bitcoin_df_comb[cols]

bitcoin_df_comb.head()

In [None]:
# Create a new tokens column for Ethereum
ethereum_tokens = []
for i in range (0,100):
    try:
        doc=ethereum_headlines["articles"][i]
        doc=doc['content']
        if pd.isna(doc):
            print('Empty') 
        else:        
            tokens = tokenizer(doc)
            ethereum_tokens.append({"token": tokens })
    except AttributeError:
        pass

# Create DataFrame
ethereum_df_tokens = pd.DataFrame(ethereum_tokens)                    
ethereum_df_comb=pd.concat([ethereum_df, ethereum_df_tokens],axis="columns", join="inner")

# Reorder DataFrame columns
cols = ["date", "compound", "positive", "negative", "neutral", "text", "token"]
ethereum_df_comb = ethereum_df_comb[cols]

ethereum_df_comb.head()

---

### NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [None]:
from collections import Counter
from nltk import ngrams
from nltk import bigrams

import nltk
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


processed_ethereum = [ sub['token'] for sub in ethereum_tokens ] 
processed_bitcoin  = [ sub['token'] for sub in bitcoin_tokens ]

In [None]:
def tokenizer_clean(doc):
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', doc)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return ' '.join(output)

In [None]:
# Generate the Bitcoin N-grams where N=2
processed3 = "bitcoin "
for i in range (0,95):
    tokensbc=tokenizer_clean(processed_bitcoin[i])
    #print(tokens)
    processed3= processed3 + tokensbc
type(processed3)

n = 2
bigramsbc = ngrams(processed3.split(), n)

gramsall=[]
for grams in bigramsbc:
    #print(grams)
    gramsall.append(grams)
    
word_fd = nltk.FreqDist(gramsall)
word_fd

In [72]:
# Generate the Ethereum N-grams where N=2
processed2 = "ethereum "
for i in range (0,95):
    tokens=tokenizer_clean(processed_ethereum[i])
    #print(tokens)
    processed2= processed2 + tokens
type(processed2)

n = 2
bigrams = ngrams(processed2.split(), n)

gramsall_et=[]
for grams in bigrams:
    #print(grams)
    gramsall_et.append(grams)
    
word_fd_eth = nltk.FreqDist(gramsall_et)
word_fd_eth

NameError: name 'processed_ethereum' is not defined

In [73]:
# Function token_count generates the top 10 words for a given coin
def token_count(tokens, N=3):
    """Returns the top N tokens from the frequency count"""
    return Counter(tokens).most_common(N)

In [74]:
# Use token_count to get the top 10 words for Bitcoin
top_10_token_btc = token_count(btc_tokens)
top_10_token_btc

NameError: name 'btc_tokens' is not defined

In [75]:
# Use token_count to get the top 10 words for Ethereum
top_10_token_eth = token_count(eth_tokens)
top_10_token_eth

NameError: name 'eth_tokens' is not defined

---

### Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [76]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

In [77]:
# Generate the Bitcoin word cloud
wc = WordCloud().generate(btc_tokens)
plt.imshow(wc)
plt.title('Bitcoin word cloud')

NameError: name 'btc_tokens' is not defined

In [78]:
# Generate the Ethereum word cloud
wc = WordCloud().generate(eth_tokens)
plt.imshow(wc)
plt.title('Ethereum word cloud')

NameError: name 'eth_tokens' is not defined

---
## 3. Named Entity Recognition

In this section, you will build a named entity recognition model for both Bitcoin and Ethereum, then visualize the tags using SpaCy.

In [79]:
import spacy
from spacy import displacy

In [80]:
# Download the language model for SpaCy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0

2021-09-11 12:48:58.874152: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-09-11 12:48:58.874874: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.



  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.1.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [81]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

---
### Bitcoin NER

In [82]:
# Concatenate all of the Bitcoin text together
bitcoin_text = []
for i in range (0,100):
    try:
        doc=bitcoin_headlines["articles"][i]
        doc=doc['content']
        if pd.isna(doc):
            print('Empty') 
        else:        
            bitcoin_text.append(doc)
            
    except AttributeError:
        pass
    
bitcoin_text= ', '.join(bitcoin_text)
regex = re.compile("[^a-zA-Z ]")
re_clean = regex.sub('', bitcoin_text)

NameError: name 'bitcoin_headlines' is not defined

In [None]:
# Run the NER processor on all of the text
NER_bit = nlp(re_clean)
NER_bit.user_data["title"] = "Bitcoin"

# Add a title to the document
doc.user_data["title"] = "Bitcoin NER"

In [None]:
# Render the visualization
displacy.render(NER_bit, style='ent')

In [None]:
# List all Entities
for ent in doc.ents:
    print('{} {}'.format(ent.text, ent.label_))

---

### Ethereum NER

In [None]:
# Concatenate all of the Ethereum text together
ethereum_text = []
for i in range (0,100):
    try:
        doc=ethereum_headlines["articles"][i]
        doc=doc['content']
        if pd.isna(doc):
            print('Empty') 
        else:        
            ethereum_text.append(doc)
            
    except AttributeError:
        pass
    
ethereum_text= ', '.join(ethereum_text)
regex = re.compile("[^a-zA-Z ]")
re_clean_eth = regex.sub('', ethereum_text)

In [None]:
# Run the NER processor on all of the text
NER_ether = nlp(re_clean_eth)
NER_ether.user_data["title"] = "Ethereum"

# Add a title to the document
doc.user_data["title"] = "Ethereum NER"

In [None]:
# Render the visualization
displacy.render(NER_ether, style='ent')

In [None]:
# List all Entities
print([ent.text for ent in NER_ethe.ents])

---