In [1]:
import os
import json
import requests
import pandas as pd

# Import VADER Dependencies
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon that SentimentIntensityAnalyzer depends on; nltk
# reports the package as "already up-to-date" when a current copy exists locally.
nltk.download('vader_lexicon')

# Create the single VADER analyzer instance used to score headlines
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gallo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Collect Guardian headlines published during 2020 for each search term.
# The terms are stored already URL-encoded because they are interpolated
# straight into the query string below.
searches = ['American%20Politics','US%20economy', 'US%20stock%20market', 'US%20President','Cryptocurrency%20OR%20Bitcoin%20OR%20Ethereum']
size = 200
dictionary = {'Date': [], 'Category': [], 'Title': []}

# The API key is read from the environment so that it never lives in the
# notebook (or in version control).
guardian_api_key = os.environ.get("GUARDIAN_API_KEY")
if not guardian_api_key:
    raise RuntimeError(
        "Set the GUARDIAN_API_KEY environment variable before running this cell."
    )

for s in searches:
    # Only a single page of up to `size` results is requested per search term
    # (no `page` parameter is sent), ordered oldest first.
    guardian_url = (
        f"https://content.guardianapis.com/search?q={s}"
        f"&to-date=2020-12-31&from-date=2020-01-01&order-by=oldest"
        f"&page-size={size}&api-key={guardian_api_key}"
    )
    guardian_response = requests.get(guardian_url, timeout=30)
    # Fail loudly on HTTP errors (bad key, rate limit, ...) instead of hitting
    # a confusing KeyError when the error payload lacks 'response'.
    guardian_response.raise_for_status()
    guardian_response_json = guardian_response.json()
    list1 = guardian_response_json['response']['results']

    # Human-readable category label: the search term with spaces restored.
    category = s.replace("%20"," ")
    for article in list1:
        dictionary["Date"].append(article['webPublicationDate'])
        dictionary["Title"].append(article['webTitle'])
        dictionary["Category"].append(category)

In [3]:
# Turn the collected column lists (Date / Category / Title) into a DataFrame
# and preview the first five rows.
clickbait_df = pd.DataFrame.from_dict(dictionary)
clickbait_df.head(5)

Unnamed: 0,Date,Category,Title
0,2020-01-01T08:30:00Z,American Politics,The final sprint: will any of the Democratic c...
1,2020-01-01T09:01:00Z,American Politics,The Power of Bad and How to Overcome It review...
2,2020-01-01T10:00:02Z,American Politics,'I'm on the hunt for humour and hope': what wi...
3,2020-01-01T11:15:02Z,American Politics,There is no such thing as 'authentic' food. Ig...
4,2020-01-01T16:29:23Z,American Politics,Pete Buttigieg fundraising surges amid attacks...


In [4]:
# Score every headline with VADER and keep only the compound score.
#
# Exactly one value is recorded per title, NaN when scoring fails, so that
# row i of sentiment_df always describes row i of clickbait_df. Skipping a
# failed title would shift every later score onto the wrong headline.
Sentiment = []

for title in clickbait_df["Title"]:
    try:
        compound = analyzer.polarity_scores(title)["compound"]
    except AttributeError:
        # Presumably raised for non-string titles (e.g. None/NaN) - TODO confirm.
        # Keep the slot with a missing value rather than dropping the row.
        compound = float("nan")
    Sentiment.append(compound)

# Naming the column at construction also works when there are no titles, and
# reusing the headline index keeps the two frames aligned.
sentiment_df = pd.DataFrame(Sentiment, columns=['Compound'], index=clickbait_df.index)
sentiment_df.head()

Unnamed: 0,Compound
0,0.4767
1,-0.5423
2,0.4767
3,-0.5719
4,-0.4404


In [5]:
# Attach the compound sentiment scores to the headlines, column-wise.
# An existing 'Compound' column is dropped first so that re-running this cell
# replaces the scores instead of appending a duplicate 'Compound' column.
clickbait_df = pd.concat(
    [clickbait_df.drop(columns=["Compound"], errors="ignore"), sentiment_df],
    axis="columns",
    join="inner",
)
clickbait_df.head()

Unnamed: 0,Date,Category,Title,Compound
0,2020-01-01T08:30:00Z,American Politics,The final sprint: will any of the Democratic c...,0.4767
1,2020-01-01T09:01:00Z,American Politics,The Power of Bad and How to Overcome It review...,-0.5423
2,2020-01-01T10:00:02Z,American Politics,'I'm on the hunt for humour and hope': what wi...,0.4767
3,2020-01-01T11:15:02Z,American Politics,There is no such thing as 'authentic' food. Ig...,-0.5719
4,2020-01-01T16:29:23Z,American Politics,Pete Buttigieg fundraising surges amid attacks...,-0.4404


In [6]:
# Rows whose Category is 'American Politics', followed by a summary of the slice
american_politics_df = clickbait_df.loc[
    clickbait_df["Category"].eq('American Politics')
]
american_politics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      200 non-null    object 
 1   Category  200 non-null    object 
 2   Title     200 non-null    object 
 3   Compound  200 non-null    float64
dtypes: float64(1), object(3)
memory usage: 7.8+ KB


In [7]:
# Rows whose Category is 'US economy', followed by a summary of the slice
us_economy_df = clickbait_df.loc[
    clickbait_df["Category"].eq('US economy')
]
us_economy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 200 to 399
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      200 non-null    object 
 1   Category  200 non-null    object 
 2   Title     200 non-null    object 
 3   Compound  200 non-null    float64
dtypes: float64(1), object(3)
memory usage: 7.8+ KB


In [8]:
# Rows whose Category is 'US stock market', followed by a summary of the slice
us_stockmarket_df = clickbait_df.loc[
    clickbait_df["Category"].eq('US stock market')
]
us_stockmarket_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 400 to 599
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      200 non-null    object 
 1   Category  200 non-null    object 
 2   Title     200 non-null    object 
 3   Compound  200 non-null    float64
dtypes: float64(1), object(3)
memory usage: 7.8+ KB


In [9]:
# Rows whose Category is 'US President', followed by a summary of the slice
us_president_df = clickbait_df.loc[
    clickbait_df["Category"].eq('US President')
]
us_president_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 600 to 799
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      200 non-null    object 
 1   Category  200 non-null    object 
 2   Title     200 non-null    object 
 3   Compound  200 non-null    float64
dtypes: float64(1), object(3)
memory usage: 7.8+ KB


In [10]:
# Rows whose Category is 'Cryptocurrency OR Bitcoin OR Ethereum', followed by
# a summary of the slice
bitcoin_ethereum_df = clickbait_df.loc[
    clickbait_df["Category"].eq('Cryptocurrency OR Bitcoin OR Ethereum')
]
bitcoin_ethereum_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127 entries, 800 to 926
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      127 non-null    object 
 1   Category  127 non-null    object 
 2   Title     127 non-null    object 
 3   Compound  127 non-null    float64
dtypes: float64(1), object(3)
memory usage: 5.0+ KB


In [11]:
# Show the (row count, column count) of the combined headlines frame
clickbait_df.shape

(927, 4)

In [12]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage
clickbait_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 927 entries, 0 to 926
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      927 non-null    object 
 1   Category  927 non-null    object 
 2   Title     927 non-null    object 
 3   Compound  927 non-null    float64
dtypes: float64(1), object(3)
memory usage: 29.1+ KB


In [13]:
# NOTE(review): spaCy is imported here rather than with the other imports in
# the first cell - consider moving it there so dependencies are visible up front.
import spacy
# Load the "en_core_web_lg" pipeline by name; presumably the model package has
# to be installed separately beforehand - TODO confirm in the environment setup.
nlp = spacy.load("en_core_web_lg")

In [14]:
# Run each headline through the spaCy pipeline and print the resulting Doc.
# Gotcha: iterrows() yields (index, row) pairs, so `Title` holds the row's
# index label (not the headline text) and `rows` is a single row as a Series.
# NOTE(review): this prints one line per row of clickbait_df, and `doc` is
# overwritten on every iteration, so only the last parsed headline remains
# available after the loop.
for Title, rows in clickbait_df.iterrows():
    doc = nlp(rows['Title'])
    print(doc)

The final sprint: will any of the Democratic candidates excite voters?
The Power of Bad and How to Overcome It review – professional Pollyannas
'I'm on the hunt for humour and hope': what will authors be reading in 2020?
There is no such thing as 'authentic' food. Ignore the purists | Mai Tran
Pete Buttigieg fundraising surges amid attacks from Warren and Sanders
Americans 'take democracy for granted', supreme court chief warns
Some political leaders find their natural authority in a crisis – not Scott Morrison | Katharine Murphy
Latin American female writers 'get literary place they deserve' in new collection
Palette cleansers: our photography, art and architecture picks for 2020
Star debuts and happy returns: theatre, dance and comedy in 2020
Centrist Democrats need a 2020 reality check before it's too late | Cas Mudde
We Are from Jazz review – zany Russian musical comedy
Brexit jitters weigh on pound, boosting Britain's blue-chip stocks - as it happened
Plan for landfill site near R