In [1]:
import numpy as np
import pandas as pd
import math
import datetime
import os
from scipy import stats
import sys
#import statsmodels.formula.api as sm

In [2]:
result = {}
tickerFile = "./sp1500data/sp1500tickerWoSlash.csv"
with open(tickerFile) as csvfile:
    tickers = csvfile.read().split("\n")
i = 0
for ticker in tickers:
    pricepFile = "./sp1500data/sp1500data/sp1500adjustedPrice/daily_adjusted_"+ticker+".csv"
    sentimentFile = "./sp1500data/sp1500data/sp1500sentiment/"+ticker+"Sentiment.csv"
    priceData = pd.read_csv(pricepFile, sep=",",low_memory=False)
    sentimentData = pd.read_csv(sentimentFile, sep=",", skiprows=1,
                         names = ["UTCTime","Ticker","Sentiment","Date"], low_memory=False)
    if not priceData.empty:
        priceData.timestamp = priceData.timestamp.apply(lambda x: pd.to_datetime(x,format='%Y-%m-%d').date())
        priceData = priceData.set_index("timestamp").sort_index()
        priceData["Return"] = priceData.adjusted_close/priceData.adjusted_close.shift(1) - 1
        sentimentData = sentimentData.drop("UTCTime", axis=1)
        sentimentData.Date = sentimentData.Date.apply(lambda x: pd.to_datetime(x,format='%Y-%m-%d').date())
        sentimentData = sentimentData.set_index("Date")
        Score = sentimentData.groupby("Date")["Sentiment"].apply(lambda x: np.log((list(x).count(1)+1)/(list(x).count(-1)+1)))
        result[ticker] = pd.merge(priceData, Score.to_frame(), how = "left", left_index = True, right_index = True)
    i += 1
    sys.stdout.write( '\r Processing Data: %.2f%%-%s' % (((i * 100 / (len(tickers)))), ticker))
    sys.stdout.flush()

 Processing Data: 100.00%-SAIC

In [30]:
def moving_average(data, n): # data is a one-dimensional list, n is days of moving-average
    if n<len(data):
        data = np.array(data)
        residual = (data[n:]-data[:-n])/n
        reslist=np.append(np.mean(data[:n]),residual)
        firstndaysMA = [np.mean(data[:i+1]) for i in range(n-1)]
        return np.append(firstndaysMA,np.cumsum(reslist))
    else:
        return [np.mean(data[:i+1]) for i in range(len(data))]

In [31]:
i=0
nshort=5
nlong=26
for ticker in tickers:
    result[ticker]["shortMA"]=moving_average(result[ticker]["adjusted_close"],nshort)
    result[ticker]["longMA"]=moving_average(result[ticker]["adjusted_close"],nlong)
    result[ticker]["shortMA/longMA"]=result[ticker]["shortMA"]/result[ticker]["longMA"]
    i+=1
    sys.stdout.write( '\r Processing Data: %.2f%%-%s' % (((i * 100 / (len(tickers)))), ticker))
    sys.stdout.flush()

 Processing Data: 100.00%-SAIC

In [33]:
####initial code to trade top 20% of POS sentiment score the stocks
tickersUse = [k for k in result.keys()]
nclass = 5
quantileReturn = {i:[] for i in range(nclass)}
Dates = result["AAPL"].index
dailyTotalTicker=[]
returnEachDay={}
for i in range(len(Dates)-1):
    Date = Dates[i]
    scoreList = []
    nextdayReturnList = []
    dailyTotalTicker=[]
    maRatio=[]
    fileName="./sp1500data/combinedMAData/combined_"+str(Date)+".csv"
    for ticker in tickersUse:
        if Date in result[ticker].index and Dates[i+1] in result[ticker].index:
            if not np.isnan(result[ticker].loc[Date]["Sentiment"]) and not np.isnan(result[ticker].loc[Dates[i+1]]["Return"]):
                scoreList.append(result[ticker].loc[Date]["Sentiment"])
                nextdayReturnList.append(result[ticker].loc[Dates[i+1]]["Return"]) 
                dailyTotalTicker.append(ticker)
                maRatio.append(result[ticker].loc[Date]["shortMA/longMA"])
    returnEachDay[Date]=pd.DataFrame()
    returnEachDay[Date]["ticker"]=np.array(dailyTotalTicker)
    returnEachDay[Date]["SentimentScore"]=np.array(scoreList)
    returnEachDay[Date]["nextDayReturn"]=np.array(nextdayReturnList)
    returnEachDay[Date]["maRatio"]=np.array(np.array(maRatio))
    returnEachDay[Date].to_csv(fileName,sep=",")
    sys.stdout.write( '\r Processing Data: %.2f%%-%s' % (((i * 100 / (len(Dates)))), str(Date)))
    sys.stdout.flush()

 Processing Data: 99.96%-2017-11-20

In [41]:
for i in range(len(Dates)-1):
    Date = Dates[i]
    returnEachDay[Date]["price/high"]=0

In [33]:
from TrendStratV3 import TrendStrat

In [34]:
a=TrendStrat(tickerList="sp1500ticker.csv",stratPeriod="12d")

In [36]:
pricedata=TrendStrat.readData(a)

In [37]:
finalData=TrendStrat.generateAllTickers(a,pricedata)

 Generating Data: 7.69%-BRKX

  dailyReturn.append(rawData["adjusted_close"][i]/rawData["adjusted_close"][i-1]-1)


 Generating Data: 100.00%-SPYK

In [50]:
for i in range(len(Dates)-1):
    Date = Dates[i]
    fileName="./sp1500data/combinedData/combined_"+str(Date)+".csv"
    for ticker in returnEachDay[Date]["ticker"]:
        sys.stdout.write( '\r Processing Data: %.2f%%-%s' % (((i * 100 / (len(Dates)))), str(Date)))
        sys.stdout.flush()
        if finalData[ticker]['price/high'][str(Date)] and isinstance(finalData[ticker]['price/high'][str(Date)], (int, float, complex)):
            returnEachDay[Date]["price/high"][returnEachDay[Date]["ticker"]==ticker]=finalData[ticker]['price/high'][str(Date)]
        else:
            returnEachDay[Date]["price/high"][returnEachDay[Date]["ticker"]==ticker]=0
    returnEachDay[Date].to_csv(fileName,sep=",")

 Processing Data: 0.00%-2000-01-03

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


 Processing Data: 99.16%-2017-09-29

In [109]:
sum1=1
for i in range(len(Dates)-1):
    if i%400 ==0: print(i)
    Date = Dates[i]
    if (topSentimentList[Date]['nextDayReturn'].sum()/(len(topSentimentList[Date])+1))!=float('Inf'):
        sum1*=(1+topSentimentList[Date]['nextDayReturn'].sum()/(len(topSentimentList[Date])+1))
        count+=1

0
400
800
1200
1600
2000
2400
2800
3200
3600
4000
4400


In [4]:
topSentimentList={}
posSentimentList={}
for i in range(len(Dates)-1):
    if i%400 ==0: print(i)
    Date = Dates[i]
    posSentimentList[Date]=returnEachDay[Date][returnEachDay[Date]["SentimentScore"]>0]
    n = len(posSentimentList)//5
    topSentimentList[Date]= posSentimentList[Date].nlargest(n,'SentimentScore')
    sys.stdout.write( '\r Processing Data: %.2f%%-%s' % (((i * 100 / (len(Dates)))), str(Date)))
    sys.stdout.flush()

0
400
800
1200
1600
2000
2400
2800
3200
3600
4000
4400
