In [72]:
! pip install sentence-transformers



In [73]:
# handle imports
import tkinter as tk
from tkinter import messagebox
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import data_jaccard
import data_minhash
import sentence_bert
import tdidf_vectorizer
from tkinter import ttk

In [74]:
def load_data(path='sentiment_news_data2.csv', n_top=25):
    """Flatten the wide news CSV into a list of [headline, sentiment, date] rows.

    The CSV is expected to contain a 'Date' column plus the column pairs
    'Top1'/'Sen_Top1' ... 'Top{n_top}'/'Sen_Top{n_top}'.

    Parameters
    ----------
    path : str
        CSV file to read. Defaults to the file the notebook has always used.
    n_top : int
        Number of 'Top{i}' / 'Sen_Top{i}' column pairs to read (default 25).

    Returns
    -------
    list[list]
        One [headline, sentiment, date] entry per (column pair, row), ordered
        column by column: every row of Top1 first, then every row of Top2, ...
    """
    news = pd.read_csv(path)

    # missing cells (NaN) become empty strings
    news = news.fillna('')

    # the date column is the same for every Top{i} pair, so fetch it once
    dates = news['Date'].to_numpy()

    all_entries = []
    for rank in range(1, n_top + 1):
        headlines = news[f'Top{rank}'].to_numpy()
        sentiments = news[f'Sen_Top{rank}'].to_numpy()
        # iterate the underlying arrays instead of doing one Series lookup per cell
        all_entries.extend(
            [text, sentiment, day]
            for text, sentiment, day in zip(headlines, sentiments, dates)
        )
    return all_entries

# print(all_entries[0:5])


In [75]:
# load stock data from file
def load_stock_data(path):
    """Read the stock CSV at `path` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

# setup function to get top stocks delta(s) provided a date, dataset, and day range
def get_top_stocks(date, stock_data, days):
    """Return the largest positive and negative close-price moves in a date window.

    The window is [date, date + days], inclusive on both ends. For every
    ticker with rows in the window, the delta is the chronologically last
    'Close' minus the chronologically first 'Close'.

    Parameters
    ----------
    date : str or datetime-like
        Start of the window; anything `pd.to_datetime` accepts.
    stock_data : pandas.DataFrame
        Must contain 'Date', 'Stocks' and 'Close' columns. It is not modified.
    days : int
        Length of the window in days.

    Returns
    -------
    (pos_ret, neg_ret) : tuple of pandas.DataFrame
        Each has columns ['Stocks', 'Close_Delta'] and at most ten rows.
        pos_ret holds strictly positive deltas, largest first; neg_ret holds
        strictly negative deltas, most negative first. Both are empty when no
        row falls inside the window.
    """
    # parse into a local Series so the caller's frame is left untouched
    dates = pd.to_datetime(stock_data['Date'])

    start = pd.to_datetime(date)
    end = start + pd.Timedelta(days=days)

    in_window = ((dates >= start) & (dates <= end)).to_numpy()
    window = stock_data.loc[in_window, ['Stocks', 'Close']].copy()
    window['Date'] = dates.to_numpy()[in_window]

    # nothing to aggregate: return well-formed empty frames instead of failing
    if window.empty:
        empty = pd.DataFrame({
            'Stocks': pd.Series(dtype='object'),
            'Close_Delta': pd.Series(dtype='float64'),
        })
        return empty, empty.copy()

    # "first" and "last" must mean earliest/latest date, not file order;
    # a stable sort keeps the original order for rows sharing a date
    window = window.sort_values('Date', kind='stable')

    stock_delta = (
        window.groupby('Stocks')['Close']
        .agg(lambda closes: closes.iloc[-1] - closes.iloc[0])
        .rename('Close_Delta')
        .reset_index()
    )

    # top ten movers in each direction
    neg_delta = stock_delta[stock_delta['Close_Delta'] < 0]
    neg_ret = neg_delta.sort_values(by='Close_Delta', ascending=True).head(10)

    pos_delta = stock_delta[stock_delta['Close_Delta'] > 0]
    pos_ret = pos_delta.sort_values(by='Close_Delta', ascending=False).head(10)

    return pos_ret, neg_ret

In [76]:
# read the stock prices once for the cells below
stock_data = load_stock_data('updated_stock_data.csv')

# flip to True to run a quick manual check of get_top_stocks
test = False

if test:
    # one-week window starting at a fixed sample date
    test_date = '2018-11-11'
    top_pos, top_neg = get_top_stocks(test_date, stock_data, 7)
    # gainers, a blank separator, then losers
    for chunk in (top_pos, "\n", top_neg):
        print(chunk)

In [77]:
# jaccard sim
def jac_sim(headline, *args):
    """Search the news corpus for `headline` using data_jaccard.jac_sim.

    `headline` is converted with str() and passed as a one-element query list.
    Extra positional arguments are accepted and ignored.

    Returns a list of ten (date, headline, score, sentiment) tuples, where the
    score is element 1 of each backend result and element 0 is the index of the
    matching corpus entry.
    """
    corpus = load_data()
    # the trailing 5 and 10 are forwarded unchanged
    # NOTE(review): presumably shingle size and result count - confirm in data_jaccard
    matches = data_jaccard.jac_sim(corpus, [str(headline)], 5, 10)

    top_ten = []
    for rank in range(10):
        hit = matches[rank]
        # corpus entries are laid out as [headline, sentiment, date]
        entry = corpus[hit[0]]
        top_ten.append((entry[2], entry[0], hit[1], entry[1]))
    return top_ten


# min hash
def min_hash(headline, *args):
    """Search the news corpus for `headline` using data_minhash.min_hash.

    `headline` is converted with str() and passed as a one-element query list.
    Extra positional arguments are accepted and ignored.

    Returns a list of ten (date, headline, score, sentiment) tuples. Unlike the
    Jaccard backend, each result carries the score at position 0 and the
    corpus index at position 1.
    """
    corpus = load_data()
    # the trailing 5, 10, 10 are forwarded unchanged
    # NOTE(review): meaning of these parameters is defined in data_minhash - confirm
    matches = data_minhash.min_hash(corpus, [str(headline)], 5, 10, 10)

    top_ten = []
    for rank in range(10):
        hit = matches[rank]
        # corpus entries are laid out as [headline, sentiment, date]
        entry = corpus[hit[1]]
        top_ten.append((entry[2], entry[0], hit[0], entry[1]))
    return top_ten


# tdidf vectorizer
def td_idf(headline, *args):
    """Search the news corpus for `headline` using a TF-IDF vector search.

    A TfidfVectorizer (L2-normalised) is fitted on every headline in the
    corpus, then tdidf_vectorizer.vector_search is asked for the k=10 best
    matches for str(headline). Extra positional arguments are accepted and
    ignored.

    Returns a list of ten 4-tuples built from elements 1..4 of each backend
    result (elements 2..4 are converted with str()).
    NOTE(review): presumably these are (date, headline, score, sentiment) like
    the other search backends - confirm against tdidf_vectorizer.vector_search.
    """
    # reuse the shared loader instead of re-flattening the CSV here;
    # load_data() yields [headline, sentiment, date] entries
    entries = load_data()

    database = pd.DataFrame(data={
        'Date'          : [entry[2] for entry in entries],
        'News_Headlines': [entry[0] for entry in entries],
        'Sentiment'     : [entry[1] for entry in entries],
    })

    # fit the vectorizer on the whole corpus and keep the document-term matrix
    vectorizer = TfidfVectorizer(norm='l2')
    mtx = vectorizer.fit_transform(database['News_Headlines'].values)
    td_ret = tdidf_vectorizer.vector_search(str(headline), vectorizer, mtx, database, k=10)

    # top ten results
    return [
        (td_ret[i][1], str(td_ret[i][2]), str(td_ret[i][3]), str(td_ret[i][4]))
        for i in range(10)
    ]


# sentence bert
def sen_bert(headline, *args):
    """Search the news corpus for `headline` using sentence_bert.sent_bert.

    `headline` is converted with str() and passed as a one-element query list.
    Extra positional arguments are accepted and ignored.

    Returns a list of ten (date, headline, score, sentiment) tuples, where the
    corpus index is element 0 and the score element 2 of each backend result.
    """
    corpus = load_data()
    # 10 and 'all_embed.npy' are forwarded unchanged
    # NOTE(review): presumably result count and a cached-embeddings file - confirm in sentence_bert
    matches = sentence_bert.sent_bert(corpus, [str(headline)], 10, 'all_embed.npy')

    top_ten = []
    for rank in range(10):
        hit = matches[rank]
        # corpus entries are laid out as [headline, sentiment, date]
        entry = corpus[hit[0]]
        top_ten.append((entry[2], entry[0], hit[2], entry[1]))
    return top_ten

In [None]:
# search headline query
def process_headline(headline, method):
    """Run the search backend selected by `method` on `headline`.

    Parameters
    ----------
    headline : the query, forwarded unchanged to the backend function.
    method : str
        One of "Jaccard", "MinHash", "TF-IDF" or "BERT".

    Returns
    -------
    Whatever the selected backend returns (a list of result tuples), or the
    string "Error" when `method` is not a known backend name.
    """
    methods = {
                "Jaccard" : jac_sim,
                "MinHash" : min_hash,
                "TF-IDF"  : td_idf,
                "BERT"    : sen_bert,
              }

    method_func = methods.get(method)
    if method_func is None:
        return "Error"

    # the backend's result list is returned as-is; no string summary is built
    # because nothing consumed it and building it could raise on non-str fields
    return method_func(headline)


# click function to update the list box
def click():
    """Handle the Search button: run the selected method and show the results.

    Reads the chosen method from `method_var` and the query from `entry`.
    An empty (or whitespace-only) query shows a warning dialog instead of
    running a search.
    """
    method   = method_var.get()
    headline = entry.get().strip()

    if not headline:
        messagebox.showwarning("Warning", "No headline entered.")
        return

    # pass the plain string: every backend applies str() to its argument, so a
    # wrapping list would turn the query into the literal text "['...']"
    result = process_headline(headline, method)

    # process_headline signals an unknown method with the string "Error"
    if isinstance(result, str):
        messagebox.showerror("Error", "Unknown processing method.")
        return

    display_res(result)


# function to open a new window
def display_new(date, stock_data, days):
    """Open a "Stock Results" window listing the top stock moves after `date`.

    The 'Date' column of `stock_data` is converted to datetimes in place, then
    get_top_stocks(date, stock_data, days) supplies the positive and negative
    movers, each shown in its own two-column table.
    """
    new_window = tk.Toplevel(root)
    new_window.title("Stock Results")

    # make sure the date column holds datetimes before querying
    stock_data['Date'] = pd.to_datetime(stock_data['Date'])
    top_pos, top_neg = get_top_stocks(date, stock_data, days)

    # header showing which date was searched
    tk.Label(new_window, text=f"Search Date of {date}").pack(pady=10)

    # one labelled frame per direction, positive above negative
    frames = []
    for caption in ("Top Positive Stock Deltas", "Top Negative Stock Deltas"):
        frame = tk.LabelFrame(new_window, text=caption, padx=10, pady=10)
        frame.pack(padx=10, pady=10)
        frames.append(frame)

    # a two-column table inside each frame
    trees = []
    for frame in frames:
        tree = ttk.Treeview(frame, columns=("Stock", "Delta"), show='headings')
        tree.heading("Stock", text="Stock")
        tree.heading("Delta", text="Close Delta Over Range")
        tree.pack(fill="both", expand=True)
        trees.append(tree)

    # fill the tables: gainers first, then losers
    for tree, deltas in zip(trees, (top_pos, top_neg)):
        for _, row in deltas.iterrows():
            tree.insert("", "end", values=(row['Stocks'], row['Close_Delta']))


# function to select from date
def select_date(event):
    """Handle a selection change in the results table.

    Opens the stock window for the date (first column) of the first selected
    row, using the day range currently chosen in `range_var`.

    `event` is supplied by Tk's bind mechanism and is not used.
    """
    selection = result_treeview.selection()
    # the selection-changed event also fires when the selection becomes empty
    # (e.g. a row is deselected); there is nothing to open in that case
    if not selection:
        return

    selected_item = result_treeview.item(selection[0])['values']
    days = int(range_var.get())
    # open new window
    display_new(selected_item[0], stock_data, days)


# function to output or display results
def display_res(results):
    """Replace the rows of the results table with `results`.

    Each item of `results` must be indexable at positions 0..3; those four
    values fill the Date, Headline, Score and Sentiment columns in that order.
    """
    # clear whatever the previous search left behind
    stale_rows = result_treeview.get_children()
    result_treeview.delete(*stale_rows)

    # append one table row per result
    for hit in results:
        row_values = (hit[0], hit[1], hit[2], hit[3])
        result_treeview.insert("", "end", values=row_values)


# Build and run the Tkinter GUI.
# The module-level names created below (root, entry, method_var, range_var,
# result_treeview, stock_data) are read by the callback functions defined
# above, and widgets are packed top-to-bottom in the order they appear here.

# load in the stock data used when a result row is selected
filepath = 'updated_stock_data.csv'
stock_data = load_stock_data(filepath)

# setup tk root
root = tk.Tk()
root.title("Headline Query for Stock Analysis")
# setup headline query
entry_label = tk.Label(root, text="Enter Headline Query:")
entry_label.pack(pady=10)

# setup query input
entry = tk.Entry(root, width=200)
entry.pack(pady=10)

# dropdown menu for the search method; the option labels must match the keys
# that process_headline looks up
method_label = tk.Label(root, text="Select Processing Method:")
method_label.pack(pady=10)
method_var = tk.StringVar()
method_var.set("BERT") # default value
methods_dropdown = tk.OptionMenu(root, method_var, "Jaccard", "MinHash", "TF-IDF", "BERT")
methods_dropdown.pack(pady=10)

# setup the search button; click() runs the search and fills the table
process_button = tk.Button(root, text="Search", command=click)
process_button.pack(pady=10)

# Dropdown menu for range (1 to 30 days, stored as strings)
range_label = tk.Label(root, text="Select Range (Days):")
range_label.pack(pady=2)
# default value for range
range_var = tk.StringVar()
range_var.set("7")
range_dropdown = tk.OptionMenu(root, range_var, *[str(i) for i in range(1, 31)])
range_dropdown.pack(pady=2)

# instructions for selecting headline(s) results
info_label = tk.Label(root, text="Click on the headline(s) results to display stock information")
info_label.pack(pady=2)

# search results label; created with empty text and not updated in this cell
res_label = tk.Label(root, text="")
res_label.pack(pady=2)

# create treeview for displaying results; the column order matches the
# 4-value rows inserted by display_res
result_treeview = ttk.Treeview(root, columns=("Date", "Headline", "Score", "Sentiment"), show='headings')
result_treeview.heading("Date", text="Date")
result_treeview.column("Date", width=100)
result_treeview.heading("Headline", text="Headline")
result_treeview.column("Headline", width=900)
result_treeview.heading("Score", text="Score")
result_treeview.column("Score", width=100)
result_treeview.heading("Sentiment", text="Sentiment")
result_treeview.column("Sentiment", width=100)
result_treeview.pack(pady=2, fill="both")
# selecting a row triggers select_date, which opens the stock window
result_treeview.bind('<<TreeviewSelect>>', select_date)

# main loop (blocks until the window is closed)
root.mainloop()