In [40]:
from DataProcessor import Data_Processor
from DataProcessor import clean
import numpy as np
import json

## Predict sentiment

In [32]:
import time
import pandas as pd
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import pickle

import os

# Directory holding the trained artifacts. The default is the original
# machine-specific location; set SENTIMENT_MODEL_DIR to run the notebook
# elsewhere without editing this cell.
MODEL_DIR = os.environ.get(
    'SENTIMENT_MODEL_DIR',
    '/Users/yunzehui/Desktop/Morgan Stanley_Zehui/Sentiment_Prediction_Model-LSTM',
)

# Word2Vec model saved with gensim and the Keras model used by predict().
w2v_model = Word2Vec.load(os.path.join(MODEL_DIR, 'model.w2v'))
model = load_model(os.path.join(MODEL_DIR, 'model.h5'))

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load artifacts that come from a trusted source.
with open(os.path.join(MODEL_DIR, 'tokenizer.pkl'), 'rb') as handle:
    tokenizer = pickle.load(handle)
with open(os.path.join(MODEL_DIR, 'encoder.pkl'), 'rb') as handle:
    encoder = pickle.load(handle)

# Length every token sequence is padded/truncated to before prediction
# (passed as maxlen to pad_sequences in predict()).
SEQUENCE_LENGTH = 300

Using TensorFlow backend.


In [46]:
# Sentiment labels returned by decode_sentiment.
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
POSITIVE = "POSITIVE"
# (highest score still labelled NEGATIVE, lowest score already labelled POSITIVE);
# anything strictly in between is NEUTRAL.
SENTIMENT_THRESHOLDS = (0.4, 0.7)


def decode_sentiment(score, include_neutral=True):
    """Translate a model score into a sentiment label.

    Args:
        score: Numeric score to classify.
        include_neutral: When True, use the two SENTIMENT_THRESHOLDS cut-offs
            and allow a NEUTRAL result; when False, split at 0.5 into
            NEGATIVE / POSITIVE only.

    Returns:
        One of the NEGATIVE, NEUTRAL or POSITIVE label strings.
    """
    if not include_neutral:
        # Binary mode: strictly below 0.5 is negative, everything else positive.
        if score < 0.5:
            return NEGATIVE
        return POSITIVE

    negative_max, positive_min = SENTIMENT_THRESHOLDS
    if score <= negative_max:
        return NEGATIVE
    if score >= positive_min:
        return POSITIVE
    return NEUTRAL
    
def predict(text_list_processed, include_neutral=True):
    """Score every text with the loaded model and collect the results.

    Each text is turned into a padded index sequence of length
    SEQUENCE_LENGTH with the module-level ``tokenizer``, scored by the
    module-level ``model`` and labelled with ``decode_sentiment``.

    Args:
        text_list_processed: Iterable of texts accepted by
            ``tokenizer.texts_to_sequences`` (presumably already-cleaned
            strings -- confirm against Data_Processor.textdata()).
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        pandas.DataFrame with the columns ``text``, ``label``, ``score`` and
        ``elapsed_time``, one row per input text in input order.
        ``elapsed_time`` is the number of seconds since this call started,
        taken right after the row's prediction. Empty input yields an empty
        frame with the same four columns.
    """
    columns = ['text', 'label', 'score', 'elapsed_time']
    start_at = time.time()
    # Rows are gathered in a list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every iteration.
    records = []
    for text in text_list_processed:
        # Tokenize text
        x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
        # Predict: take the first (only) row of the batch and unwrap its single
        # value. .item() raises if the row holds more than one value, so a
        # multi-output model is still rejected rather than silently truncated.
        score = model.predict([x_test])[0].item()
        # Decode sentiment
        label = decode_sentiment(score, include_neutral=include_neutral)
        records.append({
            'text': text,
            'label': label,
            'score': score,
            'elapsed_time': time.time() - start_at,
        })
    return pd.DataFrame(records, columns=columns)

def datelist(start_month, end_month):
    """Return every month label from start_month through end_month, inclusive.

    Args:
        start_month: First month as a 'YYYY-MM' string (the year is read from
            the first four characters, the month from the last two).
        end_month: Last month, same format.

    Returns:
        List of 'YYYY-MM' strings in chronological order. The list is empty
        whenever end_month precedes start_month, including across years.

    Raises:
        ValueError: If a year or month part is not an integer, or a month is
            outside 1..12.
    """
    first_year, first_month = int(start_month[:4]), int(start_month[-2:])
    last_year, last_month = int(end_month[:4]), int(end_month[-2:])
    for month in (first_month, last_month):
        if not 1 <= month <= 12:
            raise ValueError("month must be between 01 and 12, got {}".format(month))

    # Number the months linearly (year * 12 + zero-based month) so that a single
    # range covers same-year and multi-year spans, and a reversed span is
    # simply an empty range.
    first_index = first_year * 12 + (first_month - 1)
    last_index = last_year * 12 + (last_month - 1)

    date_list = []
    for index in range(first_index, last_index + 1):
        year, month_zero_based = divmod(index, 12)
        date_list.append("{year}-{month:02d}".format(year=year, month=month_zero_based + 1))
    return date_list

# Discount Brokerage

In [38]:
# Month range ('YYYY-MM') handed to Data_Processor and, in later cells, to datelist.
start_month, end_month = '2017-06', '2020-05'

# Path templates for this group (presumably one per broker -- confirm in Data_Processor).
DP = Data_Processor(
    start_month,
    end_month,
    template=[
        "Data/Discount Brokerage/Charles Schwab/CharlesSchwab",
        "Data/Discount Brokerage/TD Ameritrade/TDAmeritrade",
    ],
)
DP.readdata()
print(DP.datanums())  # counts right after reading
# Remaining preprocessing stages, executed one after another in this order.
for stage_name in ("specifylang", "removenoise", "clean", "tokenizetext"):
    getattr(DP, stage_name)()
print(DP.datanums())  # counts after preprocessing

([5905, 4595, 5955, 4689, 6305, 5638, 7616, 6629, 6886, 6730, 7852, 6228, 7225, 5338, 5014, 5777, 8095, 5153, 5648, 5986, 5346, 5183, 7338, 8259, 7059, 6153, 4725, 5025, 9961, 8858, 5964, 6177, 6385, 9195, 8665, 8516], 236073)
([2573, 1914, 2732, 2013, 2766, 2355, 3286, 2926, 3850, 3784, 4184, 2982, 3863, 2330, 2436, 3307, 3892, 2590, 2957, 2673, 2539, 2588, 3610, 4146, 4296, 2829, 2404, 2411, 4871, 4125, 2875, 3392, 3677, 6142, 5324, 5329], 119971)


In [57]:
# Number of entries in DP.textdata(); later cells index it by 'YYYY-MM' label,
# so this is presumably the number of months loaded -- TODO confirm.
len(DP.textdata())

36

In [66]:
# Month labels for the configured range and one output JSON path per label.
dl = datelist(start_month, end_month)
filepath = ['result/Discount Brokerage/' + month_label + '.json' for month_label in dl]

In [68]:
# run 1 time
# Score each month's texts and write the month's predictions to its own file.
# NOTE: result.to_json() already returns a JSON string, so json.dump stores it
# as one JSON-encoded string; readers have to decode the file twice.
for i, month_label in enumerate(dl):
    result = predict(DP.textdata()[month_label])
    with open(filepath[i], "w") as f:
        json.dump(result.to_json(), f)
        print(month_label, " completed")

2017-06  completed
2017-07  completed
2017-08  completed
2017-09  completed
2017-10  completed
2017-11  completed
2017-12  completed
2018-01  completed
2018-02  completed
2018-03  completed
2018-04  completed
2018-05  completed
2018-06  completed
2018-07  completed
2018-08  completed
2018-09  completed
2018-10  completed
2018-11  completed
2018-12  completed
2019-01  completed
2019-02  completed
2019-03  completed
2019-04  completed
2019-05  completed
2019-06  completed
2019-07  completed
2019-08  completed
2019-09  completed
2019-10  completed
2019-11  completed
2019-12  completed
2020-01  completed
2020-02  completed
2020-03  completed
2020-04  completed
2020-05  completed


# Robo Advisor

In [90]:
# Month range ('YYYY-MM') handed to Data_Processor and, in later cells, to datelist.
start_month, end_month = '2017-06', '2019-04'
# end_month = '2020-05'  (alternative end month, left disabled)

# Path templates for this group (presumably one per data source -- confirm in Data_Processor).
DP_robo = Data_Processor(
    start_month,
    end_month,
    template=[
        "Data/Robo/Betterment/Betterment",
        "Data/Robo/Robo Advisor/RoboAdvisor",
        "Data/Robo/wealthfront/Wealthfront",
    ],
)
DP_robo.readdata()
print(DP_robo.datanums())  # counts right after reading
# Remaining preprocessing stages, executed one after another in this order.
for stage_name in ("specifylang", "removenoise", "clean", "tokenizetext"):
    getattr(DP_robo, stage_name)()
print(DP_robo.datanums())  # counts after preprocessing

([6253, 6252, 6243, 6807, 6834, 7011, 6094, 7044, 7384, 6534, 5531, 4156, 3715, 3603, 3364, 2667, 3516, 2997, 5211, 3913, 3327, 4114, 3736], 116306)
([1701, 1812, 1712, 1714, 1887, 1599, 1433, 1726, 2204, 1801, 1239, 1241, 1036, 1101, 991, 820, 1259, 900, 1264, 964, 849, 917, 750], 30920)


In [91]:
# Month labels for the configured range and one output JSON path per label.
dl = datelist(start_month, end_month)
filepath = ['result/Robo/' + month_label + '.json' for month_label in dl]

In [92]:
# run 1 time
for i in range(len(dl)):
    result = predict(DP_robo.textdata()[dl[i]])
    with open(filepath[i],"w") as f:
        json.dump(result.to_json(),f)
        print(dl[i], " completed")

2017-06  completed
2017-07  completed
2017-08  completed
2017-09  completed
2017-10  completed
2017-11  completed
2017-12  completed
2018-01  completed
2018-02  completed
2018-03  completed
2018-04  completed
2018-05  completed
2018-06  completed
2018-07  completed
2018-08  completed
2018-09  completed
2018-10  completed
2018-11  completed
2018-12  completed
2019-01  completed
2019-02  completed
2019-03  completed
2019-04  completed


# Bank Brokerage

In [94]:
# Month range ('YYYY-MM') handed to Data_Processor and, in later cells, to datelist.
start_month, end_month = '2017-06', '2020-05'

# Path templates for this group (presumably one per bank -- confirm in Data_Processor).
DP_bank = Data_Processor(
    start_month,
    end_month,
    template=[
        "Data/Bank Brokerage/JPMorgan/JPMorgan",
        "Data/Bank Brokerage/Wells Fargo/WellsFargo",
    ],
)
DP_bank.readdata()
print(DP_bank.datanums())  # counts right after reading
# Remaining preprocessing stages, executed one after another in this order.
for stage_name in ("specifylang", "removenoise", "clean", "tokenizetext"):
    getattr(DP_bank, stage_name)()
print(DP_bank.datanums())  # counts after preprocessing

([20009, 20097, 19703, 19975, 20279, 20266, 20179, 20251, 20389, 20072, 19927, 19586, 19379, 19780, 20081, 20371, 20156, 20222, 19774, 19947, 20409, 20379, 20188, 20448, 19625, 20097, 19771, 19770, 19962, 19550, 18159, 19838, 19513, 20402, 20326, 20333], 719213)
([6307, 5950, 6118, 6900, 6554, 7237, 6919, 7737, 7275, 7332, 7392, 7067, 7892, 7912, 7503, 8228, 8442, 8109, 8331, 8556, 8928, 9044, 9871, 9840, 10535, 10530, 9920, 9680, 10326, 10783, 9700, 9711, 9695, 10573, 11293, 10237], 308427)


In [95]:
# Month labels for the configured range and one output JSON path per label.
dl = datelist(start_month, end_month)
filepath = ['result/Bank Brokerage/' + month_label + '.json' for month_label in dl]

In [96]:
# run 1 time
for i in range(len(dl)):
    result = predict(DP_bank.textdata()[dl[i]])
    with open(filepath[i],"w") as f:
        json.dump(result.to_json(),f)
        print(dl[i], " completed")

2017-06  completed
2017-07  completed
2017-08  completed
2017-09  completed
2017-10  completed
2017-11  completed
2017-12  completed
2018-01  completed
2018-02  completed
2018-03  completed
2018-04  completed
2018-05  completed
2018-06  completed
2018-07  completed
2018-08  completed
2018-09  completed
2018-10  completed
2018-11  completed
2018-12  completed
2019-01  completed
2019-02  completed
2019-03  completed
2019-04  completed
2019-05  completed
2019-06  completed
2019-07  completed
2019-08  completed
2019-09  completed
2019-10  completed
2019-11  completed
2019-12  completed
2020-01  completed
2020-02  completed
2020-03  completed
2020-04  completed
2020-05  completed


# Full Brokerage

In [97]:
# Month range ('YYYY-MM') handed to Data_Processor and, in later cells, to datelist.
start_month, end_month = '2017-06', '2020-05'

# Goldman Sachs is not included in this group; only the two templates below are used.
DP_full = Data_Processor(
    start_month,
    end_month,
    template=[
        "Data/Full Brokerage/Merrill Lynch/MerillLynch",
        "Data/Full Brokerage/Morgan_Stanley/Morgan_Stanley",
    ],
)
DP_full.readdata()
print(DP_full.datanums())  # counts right after reading
# Remaining preprocessing stages, executed one after another in this order.
for stage_name in ("specifylang", "removenoise", "clean", "tokenizetext"):
    getattr(DP_full, stage_name)()
print(DP_full.datanums())  # counts after preprocessing

([11860, 11500, 11927, 11787, 11803, 12001, 11953, 13008, 11282, 10505, 10880, 11868, 12213, 11008, 12065, 11880, 11241, 11136, 11461, 10797, 11461, 11374, 10769, 11536, 11178, 10602, 11151, 10920, 11474, 10564, 10199, 10514, 10625, 11361, 10676, 11321], 407900)
([2538, 2805, 3250, 2888, 2799, 2892, 2915, 3276, 2602, 2400, 2496, 3404, 3163, 2475, 2697, 3379, 2725, 2333, 2825, 2296, 2639, 3213, 2627, 3287, 3290, 1965, 2543, 3008, 2548, 2763, 2591, 2683, 2670, 2970, 2923, 3250], 101128)


In [98]:
# Month labels for the configured range and one output JSON path per label.
dl = datelist(start_month, end_month)
filepath = ['result/Full Brokerage/' + month_label + '.json' for month_label in dl]

In [99]:
# run 1 time
for i in range(len(dl)):
    result = predict(DP_full.textdata()[dl[i]])
    with open(filepath[i],"w") as f:
        json.dump(result.to_json(),f)
        print(dl[i], " completed")

2017-06  completed
2017-07  completed
2017-08  completed
2017-09  completed
2017-10  completed
2017-11  completed
2017-12  completed
2018-01  completed
2018-02  completed
2018-03  completed
2018-04  completed
2018-05  completed
2018-06  completed
2018-07  completed
2018-08  completed
2018-09  completed
2018-10  completed
2018-11  completed
2018-12  completed
2019-01  completed
2019-02  completed
2019-03  completed
2019-04  completed
2019-05  completed
2019-06  completed
2019-07  completed
2019-08  completed
2019-09  completed
2019-10  completed
2019-11  completed
2019-12  completed
2020-01  completed
2020-02  completed
2020-03  completed
2020-04  completed
2020-05  completed
