# General News Data Collection

## Objective

Collect general news stories, linking to various websites, through the Hacker News API.

1. General news is collected from 1st January 2018 to 27th February 2019 [just over 1 year] for all 10 cryptocurrencies.
2. The VADER sentiment analyzer from the NLTK library is then used to extract a sentiment score from the news titles.

In [4]:
import requests
import json
import pprint
import pandas as pd
from dateutil import parser
import datetime

import sqlite3
import asyncio
import json

import aiohttp

from hn import ClientHN

# SQLite connection and cursor on the project database file; `conn` is the
# handle later passed to pandas for reading and writing tables.
conn = sqlite3.connect("CMPT733.db")
c = conn.cursor()

# Number of items requested from the downloader (passed as `n` to last_n_items).
N = 10000


async def advance_run(loop, n=None, last_id=17226718, out_path="raw_news.json"):
    """Download a batch of Hacker News items and dump them to a JSON file.

    Parameters
    ----------
    loop : asyncio event loop
        Handed to both the aiohttp connector and the ClientHN client.
    n : int, optional
        Number of items to request. Defaults to the module-level ``N``.
    last_id : int, optional
        Forwarded to ``last_n_items`` as ``last_id``.
        NOTE(review): presumably the newest item id to count back from --
        confirm against the hn client.
    out_path : str, optional
        Path of the JSON file the downloaded items are written to.
    """
    if n is None:
        n = N
    # Named `connector` so it does not shadow the module-level SQLite `conn`.
    connector = aiohttp.TCPConnector(limit=1000, loop=loop)
    # Per the original author's note, ClientHN extends aiohttp.ClientSession.
    async with ClientHN(loop=loop, queue_size=1000, connector=connector, progress_bar=True, debug=True) as hn:
        # Request `n` items (the old comment said 1,000,000; N is 10000).
        hn_new_stories = await hn.last_n_items(n=n, last_id=last_id)
        with open(out_path, "w") as f:
            json.dump(hn_new_stories, f)


if __name__ == '__main__':
    # All three statements live under the guard: the run and the report both
    # need `loop`, which only exists when the guard is taken.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(advance_run(loop))
    print('\x1b[1;31m','Completed Downloading ',N,' news Articles! ', '\x1b[0m') # Sample run

Download last N posts (Estimation): 100%|██████████| 10000/10000 [00:10<00:00, 985.44it/s]
Problem with https://hacker-news.firebaseio.com/v0/item/17225904.json, Moving to DLQ


[1;31m Completed Downloading  10000  news Articles!  [0m


In [10]:
# Convert all json downloaded files to pandas df
def unpack_dictionary_columns(df, columns, fillna=None):
    """Expand columns that hold dictionaries into one column per dictionary key.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    columns : iterable
        Labels of the columns whose cells are dictionaries.
    fillna : scalar, optional
        When given, missing values in the expanded columns are filled with it.

    Returns
    -------
    pandas.DataFrame
        ``df`` with every listed column replaced by the columns built from its
        dictionaries. If ``columns`` is empty, ``df`` is returned unchanged.
    """
    ret = df
    for column in columns:
        # Build the expanded frame on df's own index so that concat keeps rows
        # aligned even when earlier filtering left gaps in the index.
        expanded = pd.DataFrame(df[column].tolist(), index=df.index)
        if fillna is not None:
            expanded = expanded.fillna(fillna)
        ret = pd.concat([df, expanded], axis=1)
        del ret[column]
        # Carry the result forward so every listed column gets unpacked,
        # not just the last one.
        df = ret
    return ret

# Load the raw dump. NOTE(review): this path differs from the "raw_news.json"
# file written by the download cell -- confirm which file is intended.
df = pd.read_json('news_files/1_billion_posts.json')
# Keep only rows whose record in column 0 is present.
df = df[~df[0].isnull()]
# Expand the record dictionaries held in column 0 into one column per key.
df = unpack_dictionary_columns(df, [0], fillna=None)
# Replace the 'kids' and 'parts' values with their lengths.
df['kids'] = df['kids'].str.len()
df['parts'] = df['parts'].str.len()
# 'time' is interpreted as seconds since the Unix epoch.
df['time'] = pd.to_datetime(df['time'],unit='s')
# NOTE(review): if_exists='append' means re-running this cell adds the same
# rows to TBRawNews again.
df.to_sql('TBRawNews', con=conn, if_exists='append')
df.head(2)

Unnamed: 0,by,dead,deleted,descendants,id,kids,parent,parts,poll,score,text,time,title,type,url
0,_ph_,,,,19225716.0,1.0,19225496.0,,,,&gt; &gt; Your ability to self-host is not aff...,2019-02-22 14:46:02,,comment,
1,jjoonathan,,,,19225715.0,2.0,19225442.0,,,,Whether silence <i>should</i> mean tacit appro...,2019-02-22 14:45:56,,comment,


In [13]:
# Total row count of the raw table. NOTE(review): the query has no filter on
# `type`, so every stored item is counted, not only stories -- confirm the label.
count = pd.read_sql("SELECT count(*) FROM TBRawNews", conn)
print('\x1b[1;31m Total number of News Articles downloaded \x1b[0m',count.values[0])

[1;31m Total number of News Articles downloaded [0m [6875000]


In [17]:
#Converting Titles to Sentiment Score
import nltk # be sure to have stopwords installed for this using nltk.download_shell()
import pandas as pd 
import string
import sqlite3
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from urllib.parse import urlparse
nltk.download('vader_lexicon')

# Open the project database for this cell (rebinds `conn` and `c`).
conn = sqlite3.connect("CMPT733.db")
c = conn.cursor()


# One VADER analyzer instance, reused by get_score for every row.
sia = SIA()
def extract_url(row):
    """Return the network-location (host) part of ``row.url``.

    ``row`` is any object exposing a ``url`` attribute; the value is handed to
    ``urlparse`` as-is and only the ``netloc`` component is returned.
    NOTE(review): a missing URL (None) is not special-cased here; the later
    filter on ``b''`` suggests it comes back as an empty bytes value -- confirm.
    """
    return urlparse(row.url).netloc

def get_score(row):
    """Return the VADER polarity scores for one row.

    The scored string is the title alone when ``row.text`` is None, otherwise
    the title and the text joined by a single space. The result is whatever
    ``sia.polarity_scores`` returns for that string.
    """
    has_body = row.text is not None
    analyze_text = row.title + ' ' + row.text if has_body else row.title
    return sia.polarity_scores(analyze_text)


# Select rows that have no parent, a non-null id and a non-null title.
df = pd.read_sql("SELECT DISTINCT by, id, kids, parent, score, text,\
                 time, title, type, url FROM TBRawNews where parent is \
                 NULL and id is not NULL and title is not NULL ", conn)
# One polarity-score result per row.
sentiment = df.apply(get_score, axis=1)
# `axis` must be a keyword here: pandas >= 2.0 rejects positional arguments
# to pd.concat other than the list of objects.
df = pd.concat([df, sentiment.apply(pd.Series)], axis=1)
# Replace the full URL with its host part.
df['url'] = df.apply(extract_url, axis=1)
df_processed = df[['kids','score','time','title','type', 'url', 'compound', 'neg', 'neu', 'pos']].fillna(0)
# NOTE(review): if_exists='append' means re-running this cell adds the same
# rows to TBProcessedGeneralNews again.
df_processed.to_sql('TBProcessedGeneralNews', con=conn, if_exists='append')
df_processed.head(20)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/abejju/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,by,id,kids,parent,score,text,time,title,type,url,compound,neg,neu,pos
0,piyushmehta,18226692,,,1.0,,2018-10-16 05:07:41,SAFETY VALVES SUPPLIER DEALER EXPORTER AND MAN...,story,https://pressurereliefvalveatridhiman.blogspot...,0.4215,0.0,0.741,0.259
1,ghosthamlet,18226695,,,1.0,,2018-10-16 05:08:20,Preprocessing for deep learning: from covarian...,story,https://hadrienj.github.io/posts/Preprocessing...,0.0,0.0,1.0,0.0
2,pawelgrzybek,18226696,1.0,,1.0,,2018-10-16 05:08:23,PinBuddy – Google Chrome extension for the Pin...,story,https://pawelgrzybek.com/i-built-a-thing-pinbu...,0.0,0.0,1.0,0.0
3,hkai,18226694,,,1.0,,2018-10-16 05:08:17,From Party of Ideas to Party of Dittoheads,story,https://quillette.com/2018/10/15/from-party-of...,0.6597,0.0,0.526,0.474
4,leoharsha2,18226655,,,1.0,,2018-10-16 05:01:15,XRP: How a Technology Advancement Spiked a Cry...,story,https://medium.com/bexpro/xrp-how-a-technology...,0.0,0.0,1.0,0.0
5,zmzm92,18226625,,,1.0,,2018-10-16 04:53:27,وظائف شاعرة طبيب اسنان بالعين,story,https://www.uouo15.net/2018/10/blog-post_91.html,0.0,0.0,1.0,0.0
6,gscott,18226637,18.0,,126.0,,2018-10-16 04:55:15,"China cracks down on religion, crosses burned ...",story,http://www.abc.net.au/news/2018-09-25/crosses-...,0.0,0.0,1.0,0.0
7,em3rgent0rdr,18226626,1.0,,2.0,,2018-10-16 04:53:31,Jeff Bezos is just fine taking the Pentagon’s ...,story,https://techcrunch.com/2018/10/15/jeff-bezos-i...,0.2023,0.0,0.859,0.141
8,tareqak,18226631,9.0,,30.0,,2018-10-16 04:54:05,Google’s CEO Says Tests of Censored Chinese Se...,story,https://www.wired.com/story/wired-25-sundar-pi...,0.5423,0.102,0.637,0.261
9,piyushmehta,18226690,,,1.0,,2018-10-16 05:07:21,CARBON STEEL ANSI/ASME B 16.47 SERIES a FLANGE...,story,https://buttweldpipefitting.blogspot.com/2018/...,0.0,0.0,1.0,0.0


In [41]:
import pandas as pd
from dateutil import parser
import datetime
import time
import sqlite3
# Open the project database for this cell (rebinds `conn` and `c`).
conn = sqlite3.connect("CMPT733.db")
c = conn.cursor()

df = pd.read_sql("SELECT * FROM TBProcessedGeneralNews", conn)
df['time'] = pd.to_datetime(df['time'],infer_datetime_format=True)

def row_to_list(x,col_name):
    """Return the values of column `col_name` of group frame `x` as a plain list."""
    return list(x[col_name])

# Group once by hour and reuse the grouping for every aggregate below.
# pd.Grouper(freq=...) replaces pd.TimeGrouper, which is no longer in pandas.
hourly = df.set_index('time').groupby(pd.Grouper(freq='H'))

# Hourly means; numeric_only keeps the text columns (title, type, url) out of
# the mean, which would otherwise raise on recent pandas.
ap = hourly.mean(numeric_only=True).reset_index()
# Hourly non-null counts per column.
qp = hourly.count().reset_index()
# Hourly lists of titles and of url hosts.
fp = hourly.apply(lambda x: row_to_list(x,'title')).reset_index(name='title').dropna()
kp = hourly.apply(lambda x: row_to_list(x,'url')).reset_index(name='url').dropna()
ap = ap[['time','score','compound','neg','neu','pos']]
qp = qp[['time','kids']]

# Join the four hourly tables on their shared 'time' key.
result = pd.merge(ap,qp, how='inner',on=['time'])
result = pd.merge(result,fp, how='inner',on=['time'])
result = pd.merge(result,kp, how='inner',on=['time'])

result.to_json('processed_general_news.json',orient='records',date_format='iso')

  if sys.path[0] == '':
  del sys.path[0]


In [42]:
# read the processed data in json
import pandas as pd
from dateutil import parser
import datetime
import time
# Reload the file that `result.to_json` wrote earlier in this notebook.
result = pd.read_json('processed_general_news.json')
# Convert the 'time' column to datetimes.
result['time'] = pd.to_datetime(result['time'],infer_datetime_format=True)
result.head(5)

Unnamed: 0,compound,kids,neg,neu,pos,score,time,title,url
0,0.128304,57,0.03,0.850982,0.119018,6.754386,2017-09-27 20:00:00+00:00,"[Hacktoberfest 2017, 18 things only an Indie d...","[hacktoberfest.digitalocean.com, www.buildbox...."
1,0.060505,58,0.060103,0.852379,0.087517,4.689655,2017-09-27 21:00:00+00:00,[Introducing Akaunting: Free Accounting Softwa...,"[akaunting.com, futurism.com, www.bbc.co.uk, l..."
2,0.103068,47,0.056213,0.826766,0.117021,3.957447,2017-09-27 22:00:00+00:00,[US Senator sees Reddit as potential target fo...,"[thehill.com, www.facebook.com, www.npmjs.com,..."
3,0.051835,26,0.033923,0.870038,0.096038,5.653846,2017-09-27 23:00:00+00:00,[Predatory Malware Rendering Security Investme...,"[blog.vidder.com, www.theverge.com, soundcloud..."
4,-0.018833,30,0.0678,0.867733,0.064467,5.066667,2017-09-28 00:00:00+00:00,[National Parks Struggle with a Mounting Crisi...,"[www.nytimes.com, medium.com, code.visualstudi..."


## Data Distribution of General News Based on News Source

In [67]:
import numpy as np
import hvplot.pandas
# Load every processed row from the database.
analysis = pd.read_sql("SELECT * FROM TBProcessedGeneralNews", conn)
# Count rows per url host (skipping rows whose url equals b'') and keep the
# 20 hosts with the highest counts.
final_analysis = (
    analysis[analysis.url != b'']
    .groupby('url')['index']
    .count()
    .reset_index()
    .sort_values('index', ascending=False)
    .head(20)
)
# Bar chart: one bar per host, height = number of rows for that host.
final_analysis.hvplot.bar(
    'url', 'index',
    rot=45, width=800, height=400,
    xlabel='url', ylabel='no. of articles',
    title='No. of articles per website',
)