In [65]:
# handle imports
import tkinter as tk
from tkinter import messagebox
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import data_jaccard
import data_minhash
import sentence_bert
import tdidf_vectorizer
# from data_jaccard import jac_sim

In [None]:
def load_data():
    # load data
    news_data2 = pd.read_csv('sentiment_news_data.csv')

    # for empty strings replace with NaN
    news_data2 = news_data2.fillna('')

    all_entries = []

    # loop and add entry/entries to the list
    #for col in [f'Top{i}' for i in range(1, 26)]:
    for i in range(1, 26):
        # get top 1 to 25
        col  = f'Top{i}'
        col2 = f'Sen_Top{i}'
        col_data = news_data2[col]
        col_data2 = news_data2[col2]
        #for item in col_data:
        for j in range(len(col_data)):
            #all_entries.append([item]) 
            #all_entries.append([item, col_data2])
            all_entries.append([col_data[j], col_data2[j]])
    
    return all_entries

# print(all_entries[0:5])


In [None]:
# setup accessor functions to other python files
def jac_sim(headline, *args):
    # Replace with actual implementation

    all_entries = load_data()

    jac_ret = data_jaccard.jac_sim(all_entries, [str(headline)], 5, 10)

    ret_val = []
    for i in range(10):
        ret_val.append(all_entries[jac_ret[i][0]][0])

    return ret_val


def min_hash(headline, *args):
    all_entries = load_data()

    min_ret = data_minhash.min_hash(all_entries, [str(headline)], 5, 10, 10)

    ret_val = []
    for i in range(10):
        ret_val.append(all_entries[min_ret[i][1]][0])

    return ret_val


def td_idf(headline, *args):
    news_data2 = pd.read_csv('sentiment_news_data.csv')
    news_data2 = news_data2.fillna('')
    all_entries = []
    for i in range(1, 26):  # Columns: Top1 to Top25
        col = f'Top{i}'
        col2 = f'Sen_Top{i}'
        for j in range(len(news_data2[col])):
            all_entries.append([news_data2['Date'][j], news_data2[col][j], news_data2[col2][j]])

    database = pd.DataFrame(data={
        'Date': [row[0] for row in all_entries],
        'News_Headlines': [row[1] for row in all_entries],
        'Sentiment': [row[2] for row in all_entries]
    })

    vectorizer = TfidfVectorizer(norm='l2')
    newsValues = database['News_Headlines'].values
    mtx = vectorizer.fit_transform(newsValues)
    mtx.shape

    td_ret = tdidf_vectorizer.vector_search(str(headline), vectorizer, mtx, database, k=10)

    ret_val = []
    for i in range(10):
        ret_val.append(str(td_ret[i][2]))

    return ret_val


def sen_bert(headline, *args):

    all_entries = load_data()
    bert_ret = sentence_bert.sent_bert(all_entries, [str(headline)], 10, 'all_embed.npy')
    ret_val = []
    for i in range(10):
        ret_val.append(all_entries[bert_ret[i][0]][0])

    return ret_val

In [68]:
# search headline query
def process_headline(headline, method):
    methods = {
                "Jaccard" : jac_sim,
                "MinHash" : min_hash,
                "TF-IDF"  : td_idf,
                "BERT"    : sen_bert,
              }

    # check if valid
    if method in methods:
        method_func = methods[method]
        result = method_func(headline, 5, 10)

        ret_val = ''

        for i in range(10):
            ret_val = ret_val + result[i] + str("\n")

        return ret_val
    else:
        return "Error"

# Define the function to handle the button click event
def on_button_click():
    headline = entry.get()
    method = method_var.get()
    
    if headline and method:
        result = process_headline([headline], method)  # Pass headline and method
        res_label.config(text=result)
    else:
        messagebox.showwarning("Input Error", "Please enter a headline and select a method.")

# apologies for the draft layout, I am not a UX designer

# setup tk root
root = tk.Tk()
root.title("Headline Query for Stock Analysis")
# setup headline query
entry_label = tk.Label(root, text="Enter Headline Query:")
entry_label.pack(pady=10)
# dropdown menu
method_label = tk.Label(root, text="Select Processing Method:")
method_label.pack(pady=10)
method_var = tk.StringVar()
method_var.set("BERT") # default value
methods_dropdown = tk.OptionMenu(root, method_var, "Jaccard", "MinHash", "TF-IDF", "BERT")
methods_dropdown.pack(pady=10)
# setup query input
entry = tk.Entry(root, width=100)
entry.pack(pady=10)
# setup the search button
process_button = tk.Button(root, text="Search", command=on_button_click)
process_button.pack(pady=10)
# search results
res_label = tk.Label(root, text="")
res_label.pack(pady=10)

# main loop
root.mainloop()


one_shin: ["['pep", "'pepp", 'peppa', 'eppa ', 'ppa p', 'pa pi', 'a pig', ' pig ', 'pig i', 'ig in', 'g inv', ' inva', 'invad', 'nvade', 'vades', 'ades ', 'des p', 'es po', 's por', ' pork', 'porkl', 'orkla', 'rklan', 'kland', "land'", "and']"]
m2           : <datasketch.minhash.MinHash object at 0x00000239FDB33E00>
