In [5]:
# Cache 120 sec = 2 min

import requests
import pandas as pd
import json
import time
import os
import datetime
from pandas.io.json import json_normalize
from cassandra import ConsistencyLevel
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement
from pytz import timezone

In [24]:
class fetchNewsCC:
    def __init__(self,url,api_key='',sort_order='Latest'):
        if api_key:
            url += '&api_key={}'.format(api_key)
        if sort_order:
            url += '&sortOrder={}'.format(sort_order)
        self.api_key = api_key
        self.url = url
        print("URL Created: " + self.url)
        
    def getJSON(self):
        req=requests.get(self.url)
        json_obj = req.json()
        return json_obj
    
    def formatPD(self,newsPD,col_name):
        newsPD = newsPD.rename(index=str, columns={"source_info.img": "source_info_img",\
                                           "source_info.name":"source_info_name","source_info.lang":"source_info_lang"})
        newsPD['tags'] = newsPD['tags'].apply(lambda x: x.split('|'))
        newsPD['categories'] = newsPD['categories'].apply(lambda x: x.split('|'))
        newsPD['published_on'] = newsPD['published_on'].apply(lambda x: self.convertToPstSeq(x))
        newsPD['source_info_name'] = newsPD['source_info_name'].fillna('unknown')
        newsPD['source_info_lang'] = newsPD['source_info_lang'].fillna('unknown')
        newsPD['source_info_img'] = newsPD['source_info_img'].fillna('unknown')
        newsPD['downvotes'] = newsPD['downvotes'].fillna(0)
        newsPD['guid'] = newsPD['guid'].fillna('unknown')
        newsPD['imageurl'] = newsPD['imageurl'].fillna('not_found')
        newsPD['lang'] = newsPD['lang'].fillna('EN')
        newsPD['source'] = newsPD['source'].fillna('unknown')
        newsPD['upvotes'] = newsPD['upvotes'].fillna(0)
        newsPD['url'] = newsPD['url'].fillna('not_found')
        return newsPD
    
    def jsonToPandas(self,json_obj):
        pd = json_normalize(json_obj)
        return pd
    
    def createSessionCassandra(self,key_space):
        self.cluster = Cluster(['127.0.0.1'],load_balancing_policy=None)
        self.session = self.cluster.connect()
        self.session.set_keyspace(key_space)
        
    def convertToPstSeq(self,inp_time):
        return datetime.datetime.fromtimestamp(int(inp_time)).astimezone(timezone('US/Pacific')).strftime('%Y%m%d%H%M%S')
        
    def checkTable(self,table_name):        
        self.session.execute("""CREATE TABLE IF NOT EXISTS """ + table_name + """ (
            body TEXT,
            categories list<text>,
            downvotes TEXT,
            guid TEXT,
            id INT,
            imageurl TEXT,
            lang TEXT,
            published_on DOUBLE,
            source TEXT,
            source_info_img TEXT,
            source_info_lang TEXT,
            source_info_name TEXT,
            tags list<text>,
            title TEXT,
            upvotes INT,
            url TEXT,
            sortmentainer INT,
            PRIMARY KEY(sortmentainer,published_on,id))
            WITH CLUSTERING ORDER BY (published_on DESC);
            """)        
    
    def pdToCassandra(self,table_name,pd,sorter):
        self.checkTable(table_name)
        query = """INSERT INTO """+ table_name +""" (
        sortmentainer,body,categories,downvotes,guid,id,imageurl,lang,
        published_on,source,source_info_img,source_info_lang,source_info_name,
        tags,title,upvotes,url) VALUES
        (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
        prepared = self.session.prepare(query)
        for index, df_iter in pd.iterrows():
            self.session.execute(prepared,\
                             (int(sorter),df_iter['body'], df_iter['categories'], df_iter['downvotes'], df_iter['guid'], int(df_iter['id']),\
                             df_iter['imageurl'], df_iter['lang'], float(df_iter['published_on']), df_iter['source'], df_iter['source_info_img'],\
                             df_iter['source_info_lang'], df_iter['source_info_name'],df_iter['tags'], df_iter['title'], int(df_iter['upvotes']),\
                             df_iter['url']))
        

In [25]:
api_key = '23489088ccc5e95cef763cbedd2d27588a979595edb097f53f40ad7d76239d41'
sortOrder = 'Latest'
categories = 'ALL_NEWS_CATEGORIES'
key_space = 'crypton'

In [26]:
#Fetch latest News
url = 'https://min-api.cryptocompare.com/data/v2/news/?lang=EN'
while (True):
    F2 = fetchNewsCC(url)
    jsonNews = F2.getJSON()
    # print(jsonNews)
    newsPD = F2.jsonToPandas(jsonNews['Data'])
    newsPD = F2.formatPD(newsPD,'published_on')
    F2.createSessionCassandra(key_space)
    F2.pdToCassandra('news_cc',newsPD,1)
    time.sleep(120)

URL Created: https://min-api.cryptocompare.com/data/v2/news/?lang=EN&sortOrder=Latest


AttributeError: 'float' object has no attribute 'encode'

In [None]:
from datetime import timedelta, date

list_datre_range = []
pattern = '%Y-%m-%d %H:%M:%S'

def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2017, 6, 10)
end_date = date(2019, 2, 20)
for single_date in daterange(start_date, end_date):
    date_time = str(single_date.strftime("%Y-%m-%d") + ' 00:00:00')
    list_datre_range.append(int(time.mktime(time.strptime(date_time, pattern))))

In [None]:
# list_datre_range

url = 'https://min-api.cryptocompare.com/data/v2/news/?lang=EN&lTs='
for ts in list_datre_range:
    F2 = fetchNewsCC(url + str(ts))
    jsonNews = F2.getJSON()
    # print(jsonNews)
    newsPD = F2.jsonToPandas(jsonNews['Data'])
    newsPD = F2.formatPD(newsPD,'published_on')
    F2.createSessionCassandra(key_space)
    F2.pdToCassandra('news_cc',newsPD,0)

In [None]:
cluster = Cluster(['127.0.0.1'],load_balancing_policy=None)
session = cluster.connect()
session.set_keyspace(key_space) 

In [15]:
# Cassandra to CSV
def cassandraToCsv():
    query = "SELECT * FROM news_cc;"
    future = session.execute(query)
    
    return future
    
df = pd.DataFrame(list(cassandraToCsv()))
    
df.to_csv('NewsData.csv',index=False)