In [1]:
class YahooDailyReader():
	"""Fetch the daily price history of one symbol from Yahoo Finance.

	The constructor only builds the request URL; the network request and the
	parsing happen in read().
	"""

	# Seconds to wait for the server before giving up, so a single stalled
	# request cannot hang a long download loop.
	REQUEST_TIMEOUT = 30

	def __init__(self, symbol=None, start=None, end=None):
		"""Store the query parameters and build the history-page URL.

		Parameters
		----------
		symbol : str
			Ticker symbol placed into the URL.
		start : datetime.datetime, optional
			First day requested; defaults to 2010-01-01.
		end : datetime.datetime, optional
			Last day requested; defaults to the current date and time.
		"""
		import datetime, time
		self.symbol = symbol

		# fall back to default start/end dates when none are provided
		if end is None:
			end = datetime.datetime.today()
		if start is None:
			start = datetime.datetime(2010,1,1)

		self.start = start
		self.end = end

		# Convert the dates to unix timestamps. time.mktime interprets the
		# values as local time. The end date is pushed to 23:59:59 so the
		# final day is fully included in the requested period.
		unix_start = int(time.mktime(self.start.timetuple()))
		day_end = self.end.replace(hour=23, minute=59, second=59)
		unix_end = int(time.mktime(day_end.timetuple()))

		url = 'https://finance.yahoo.com/quote/{}/history?'
		url += 'period1={}&period2={}'
		url += '&filter=history'
		url += '&interval=1d'
		url += '&frequency=1d'
		self.url = url.format(self.symbol, unix_start, unix_end)

	def read(self):
		"""Download the history page and return its prices as a DataFrame.

		Returns
		-------
		pandas.DataFrame
			Indexed by 'Date' with the columns 'symbolid', 'Open', 'High',
			'Low', 'Close', 'AdjClose' and 'Volume'. Rows without a close
			value are dropped.

		Raises
		------
		ValueError
			If the page does not contain the embedded `root.App.main` JSON
			payload the prices are read from.
		requests.HTTPError
			If the server answers with an error status.
		"""
		import requests, re, json
		import pandas as pd
		r = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
		r.raise_for_status()

		# the prices are embedded in the page as a JSON assignment
		ptrn = r'root\.App\.main = (.*?);\n}\(this\)\);'
		match = re.search(ptrn, r.text, re.DOTALL)
		if match is None:
			# Fail with a clear message instead of an AttributeError on None.
			raise ValueError(
				'no price data found in the Yahoo response for {!r}'.format(self.symbol)
			)
		jsn = json.loads(match.group(1))
		df = pd.DataFrame(
				jsn['context']['dispatcher']['stores']
				['HistoricalPriceStore']['prices']
				)
		df.insert(0, 'symbol', self.symbol)
		df['date'] = pd.to_datetime(df['date'], unit='s').dt.date

		# drop rows that aren't prices
		df = df.dropna(subset=['close'])

		df = df[['date','symbol', 'open', 'high', 'low',  'close', 'adjclose', 'volume']]
		colnames = ['Date', 'symbolid', 'Open', 'High', 'Low', 'Close', 'AdjClose', 'Volume']
		df.columns = colnames
		df = df.set_index('Date')
		return df


In [15]:
import pandas as pd
import datetime as dt
# Load the ticker symbols to download from the 'Symbol' column of a CSV file.
# Point this at your own file to work with a different set of assets.
symbollist = pd.read_csv('nasdaqsymbol.csv').Symbol

In [17]:
# Download the daily history of every symbol and write one CSV file per symbol.
# The date range is the same for all symbols, so it is built once up front.
startdt, enddt = dt.datetime(1990, 1, 1), dt.datetime(2020, 6, 30)
for symbol in symbollist:
    print(symbol)
    try:
        ydr = YahooDailyReader(symbol=symbol, start=startdt, end=enddt)
        data = ydr.read()
        data.to_csv('csvdata/' + symbol + '.csv')
    except Exception as exc:
        # Best effort: skip symbols that fail, but report why. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working,
        # and format() copes with non-string symbols such as NaN.
        print('{} not done: {!r}'.format(symbol, exc))

AAPL
ZS
ZSAN
ZUMZ
ZYNE


In [1]:
import pandas as pd 
import numpy as np 
import random  # used to select random stocks for sampling

In [2]:
# Return of a long position held for a chosen number of rows (buy signal).
# Pessimistic fills are assumed: buy at the High of the entry row and sell at
# the Low of the exit row.
# No transaction cost is deducted here; presumably the 0.5 (percent) threshold
# in label_data stands in for the assumed 50 bps cost -- TODO confirm.
def long_returns(df, numdays):
	"""Add a 'buyret' column with the percentage return of a long trade.

	Row t holds Low[t] / High[t - numdays] - 1, in percent: the position is
	opened numdays rows earlier and closed on row t. Rows without a value
	(for example the first numdays rows) are set to 0.

	Parameters
	----------
	df : pandas.DataFrame
		Must contain 'High' and 'Low' columns. Modified in place.
	numdays : int
		Number of rows between entry and exit.

	Returns
	-------
	pandas.DataFrame
		The same df object, with 'buyret' added.
	"""
	entry_price = df['High'].shift(numdays)
	# Fill before assigning: calling fillna(inplace=True) on df.buyret is
	# chained assignment and does not update df under pandas Copy-on-Write.
	df['buyret'] = ((df['Low'] / entry_price - 1) * 100).fillna(0)
	return df

# Return of a short position held for a chosen number of rows (sell signal).
# Pessimistic fills are assumed: sell at the Low of the entry row and buy back
# at the High of the exit row.
# No transaction cost is deducted here; presumably the 0.5 (percent) threshold
# in label_data stands in for the assumed 50 bps cost -- TODO confirm.
def short_returns(df, numdays):
	"""Add a 'sellret' column with the percentage return of a short trade.

	Row t holds Low[t - numdays] / High[t] - 1, in percent: the position is
	opened numdays rows earlier and closed on row t. Rows without a value
	(for example the first numdays rows) are set to 0.

	Parameters
	----------
	df : pandas.DataFrame
		Must contain 'High' and 'Low' columns. Modified in place.
	numdays : int
		Number of rows between entry and exit.

	Returns
	-------
	pandas.DataFrame
		The same df object, with 'sellret' added.
	"""
	entry_price = df['Low'].shift(numdays)
	# Fill before assigning: calling fillna(inplace=True) on df.sellret is
	# chained assignment and does not update df under pandas Copy-on-Write.
	df['sellret'] = ((entry_price / df['High'] - 1) * 100).fillna(0)
	return df

def label_data(df, threshold=0.5):
	"""Label every row as 'Buy', 'Sell' or 'None' from its returns.

	Parameters
	----------
	df : pandas.DataFrame
		Must contain 'buyret' and 'sellret' columns. Modified in place.
	threshold : float, optional
		A return must be strictly greater than this value to trigger a
		label. Defaults to 0.5, the value that was previously hard-coded.

	Returns
	-------
	pandas.DataFrame
		The same df object, with an 'Action' column added.
	"""
	df['Action'] = 'None'
	# 'Sell' is assigned last, so it wins if both returns exceed the threshold.
	for column, label in (('buyret', 'Buy'), ('sellret', 'Sell')):
		df.loc[df[column] > threshold, 'Action'] = label
	return df

In [3]:
# Flexible helper that computes any number of moving averages of AdjClose and
# normalises them by the average with the longest window.
def moving_avg_data(df, mavnames, mavdays):
	"""Add moving-average columns, each divided by the longest-window average.

	Parameters
	----------
	df : pandas.DataFrame
		Must contain an 'AdjClose' column. Modified in place.
	mavnames : list of str
		Column name for each moving average. Not modified.
	mavdays : list of int
		Rolling window length for each name, in the same order.

	Returns
	-------
	pandas.DataFrame or None
		The same df object. The longest-window column is set to 1 and the
		first max(mavdays) rows are dropped. Returns None, after printing a
		message, when the two lists differ in length.
	"""
	if len(mavnames) != len(mavdays):
		print('Variable Names and Number of days must match')
		return

	for name, days in zip(mavnames, mavdays):
		df[name] = df.AdjClose.rolling(window=days).mean()

	longest = max(mavdays)
	maxmovavg = mavnames[mavdays.index(longest)]

	# Normalise every other average by the longest one. The caller's list is
	# only read, never altered, so it can be reused for later calls.
	for name in mavnames:
		if name != maxmovavg:
			df[name] = df[name] / df[maxmovavg]

	df.loc[:,maxmovavg] = 1
	# Drop the leading rows, where the longest window is not yet filled.
	df.drop(df.index[:longest], inplace=True)
	return df

In [4]:
def create_datasets(csvfilename, sample_size):
	"""Build a labelled dataset from randomly chosen symbols and save it.

	Picks sample_size distinct symbols from the module-level `symlist` frame
	(its 'Symbol' column), reads each symbol's CSV from 'csvdata/', adds the
	return columns, the Buy/Sell label and the normalised moving averages,
	and writes the combined rows to 'sampledata/<csvfilename>'.

	Parameters
	----------
	csvfilename : str
		Name of the output file inside 'sampledata/'.
	sample_size : int
		Number of symbols to sample without replacement.
	"""
	# moving averages to compute, and the window length of each
	mavnames = ['mav5', 'mav10','mav20','mav30','mav50','mav100','mav200']
	mavdays = [5,10,20,30,50,100,200]
	# number of rows between entry and exit used for the return columns
	fwdret = 30
	price_columns = ['Date', 'symbolid', 'Open', 'High', 'Low', 'Close', 'AdjClose', 'Volume']
	# mav200 is left out: it is the normalising average, constant at 1
	output_columns = ['Date','symbolid','buyret','sellret','Action','mav5', 'mav10','mav20','mav30','mav50','mav100']

	# Choose sample_size distinct row positions. range(len(symlist)) covers
	# every symbol, including the last one.
	test_num = random.sample(range(len(symlist)), sample_size)

	# read each file, label the data as buy or sell and add the moving averages
	frames = []
	for position in test_num:
		filename = 'csvdata/' + symlist.Symbol.iloc[position] + '.csv'
		temp = pd.read_csv(filename)
		# Work on a real copy so adding columns does not write into a slice.
		temp = temp[price_columns].copy()

		temp = long_returns(temp, fwdret)
		temp = short_returns(temp, fwdret)
		temp = label_data(temp)
		# Pass fresh lists so the shared definitions above can never be
		# altered by the callee between iterations.
		temp = moving_avg_data(temp, list(mavnames), list(mavdays))
		frames.append(temp[output_columns])

	# Concatenate once at the end: DataFrame.append no longer exists in
	# pandas 2.x, and growing a frame inside the loop is quadratic.
	data = pd.concat(frames) if frames else pd.DataFrame()
	data.to_csv('sampledata/'+csvfilename)
	print(csvfilename + ' written to disk')


In [5]:
symlist = pd.read_csv('csvdata/descdata.csv')

# Seed the sampler so a fresh run reproduces the same train/test files.
random.seed(42)

# (output file, number of symbols sampled); every file name states its sample
# size. 'test_150.csv' was previously built from only 100 symbols.
# NOTE(review): each file is sampled independently from the same symbol list,
# so a symbol can appear in both a train and a test file -- confirm that this
# overlap is acceptable.
dataset_specs = [
	('train_50.csv', 50),
	('test_50.csv', 50),
	('train_250.csv', 250),
	('test_100.csv', 100),
	('train_500.csv', 500),
	('test_150.csv', 150),
	('train_750.csv', 750),
	('test_250.csv', 250),
	('train_1000.csv', 1000),
	('test_500.csv', 500),
]

for csvfilename, sample_size in dataset_specs:
	create_datasets(csvfilename, sample_size)

train_50.csv written to disk
test_50.csv written to disk
train_250.csv written to disk
test_100.csv written to disk
train_500.csv written to disk
test_150.csv written to disk
train_750.csv written to disk
test_250.csv written to disk
train_1000.csv written to disk
test_500.csv written to disk
