In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import umap
import plotly.express as px
import sqlite3

In [None]:
# Load the combined table: later cells read its profile columns (country, industry,
# sector, ipoDate, Symbol, companyName) and its embedding columns '0' ... '1023'.
df = pd.read_csv(filepath_or_buffer='./embeddings_combined.csv')

In [None]:
# Load one profile row per company name (funds and ETFs excluded) from the SQLite database.
# `with sqlite3.connect(...)` only commits/rolls back the transaction on exit and leaves
# the connection open, so the connection is closed explicitly once the query is read.
# NOTE(review): the database path is absolute and machine-specific — consider a
# configurable, relative location.
# NOTE(review): GROUP BY companyName with non-aggregated columns keeps a single row per
# company name; confirm that dropping additional listings of the same name is intended.
conn = sqlite3.connect('C:/Users/Siwoo/PycharmProjects/Valuator2.0/database/valuator.db')
try:
    description_df = pd.read_sql("""
    SELECT Symbol, companyName, description, industry, sector, country, IPOdate
    FROM profile_v2
    WHERE
        isFund = 0
        AND isEtf = 0
    GROUP BY companyName
    ORDER BY symbol ASC
    """, conn)
finally:
    conn.close()

In [None]:
# One bar chart per categorical column, showing its 15 most frequent values.
for column in ('country', 'industry', 'sector'):
    top_counts = df[column].value_counts().head(15)
    fig, ax = plt.subplots(figsize=(10, 4))
    top_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Top 15 {column} Categories')
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# 3. IPO Date Analysis
# Parse the IPO dates once, then derive the year and month columns from the parsed series.
ipo_dates = pd.to_datetime(df['ipoDate'])
df['ipoDate'] = ipo_dates
df['ipoYear'] = ipo_dates.dt.year
df['ipoMonth'] = ipo_dates.dt.month

# Number of IPOs per calendar year, in chronological order.
ipo_year_counts = df['ipoYear'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(12, 6))
ipo_year_counts.plot(kind='bar', ax=ax)
ax.set_title('IPO Year Distribution')
ax.set_ylabel('Count')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Labels of the 1024 embedding columns: '0', '1', ..., '1023'.
embedding_cols = list(map(str, range(1024)))

# Standardise every embedding dimension and write the result back into df.
# NOTE: the scaled values overwrite the raw ones, so running this cell a second time
# refits `scaler` on already-scaled data, while semantic_search still applies `scaler`
# to freshly encoded queries. Reload df before re-running this cell.
scaler = StandardScaler().fit(df[embedding_cols].values)
df[embedding_cols] = scaler.transform(df[embedding_cols].values)

In [None]:
# Dimensionality Reduction using PCA
from sklearn.decomposition import PCA

# Project the standardised embeddings onto their first 512 principal components.
# PCA's automatic solver choice can select a randomized SVD for a matrix of this size;
# a fixed random_state keeps the components identical across kernel restarts.
PCA_reducer = PCA(n_components=512, random_state=42)
pca_embeddings = PCA_reducer.fit_transform(df[embedding_cols])

In [None]:
# Share of the total variance captured by each principal component.
var_ratio = PCA_reducer.explained_variance_ratio_

# Per-component explained variance.
fig, ax = plt.subplots()
ax.plot(var_ratio)
ax.set_title("explained variance ratio w/ PCA")
plt.show()

# Running total of explained variance as components are added.
fig, ax = plt.subplots()
ax.plot(np.cumsum(var_ratio))
ax.set_title("cumulative sum of explained variance ratio w/ PCA")
plt.show()

# Companies plotted on the first two principal components.
fig, ax = plt.subplots()
ax.scatter(pca_embeddings[:, 0], pca_embeddings[:, 1], alpha=0.5)
ax.set_title('PCA Reduced Embeddings')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
plt.show()

In [None]:
# Dimensionality Reduction using UMAP
# UMAP is stochastic; a fixed seed keeps the 3-D layout reproducible between runs.
# NOTE(review): umap-learn may fall back to single-threaded execution when a seed is
# set — confirm the extra fitting time is acceptable.
UMAP_reducer = umap.UMAP(n_components=3, random_state=42)
UMAP_reducer.fit(df[embedding_cols])

In [None]:
# Project a 30% sample of the companies with the fitted 3-component UMAP reducer.
# The sample is seeded so the same companies are plotted on every run.
df_sample = df.sample(frac=0.3, random_state=42)
reduced_embeddings = UMAP_reducer.transform(df_sample[embedding_cols])

# Build the frame on df_sample's index. With a default 0..n-1 index the two column
# assignments below would align on row labels that do not correspond to the sampled
# rows, leaving most points with NaN labels and the rest with another company's labels.
reduced_df = pd.DataFrame(reduced_embeddings,
                          columns=['UMAP1', 'UMAP2', 'UMAP3'],
                          index=df_sample.index)
reduced_df['companyName'] = df_sample['companyName']
reduced_df['sector'] = df_sample['sector']

# Interactive 3-D scatter, coloured by sector, with the company name shown on hover.
fig = px.scatter_3d(reduced_df, x='UMAP1', y='UMAP2', z='UMAP3',
                    color='sector', hover_name='companyName',
                    title='3D Scatter Plot of Company Embeddings with UMAP')
fig.show()

In [None]:
from angle_emb import AnglE

# Load the pretrained 'WhereIsAI/UAE-Large-V1' AnglE model with CLS pooling,
# then move it to the GPU.
model = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls')
model = model.cuda()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def semantic_search(string, country_filter=None, top=10, dim_reducer=None):
    """Print the companies whose embeddings are most similar to ``string``.

    The query is encoded with the global ``model``, standardised with the global
    ``scaler`` and compared by cosine similarity against the ``embedding_cols``
    columns of the global ``df``.

    Args:
        string: Free-text query to embed.
        country_filter: If given, only rows whose ``country`` equals this value
            are searched.
        top: Maximum number of matches to print.
        dim_reducer: Optional fitted transformer exposing ``transform``; when
            given, both the query and the candidate embeddings are projected
            with it before the comparison.

    Returns:
        None. One line per match (score, symbol, company name) is printed,
        best match first.

    Raises:
        ValueError: If there are no rows to search, e.g. ``country_filter``
            matches nothing.
    """
    if country_filter is not None:
        filtered_df = df[df['country'] == country_filter]
    else:
        filtered_df = df

    # Fail fast with a clear message, before the query is encoded, instead of
    # letting the similarity computation reject an empty candidate matrix.
    if filtered_df.empty:
        raise ValueError(f"No companies to search (country_filter={country_filter!r})")

    my_idea = scaler.transform(model.encode(string))

    if dim_reducer is not None:
        # Pass the query with the same column labels as the candidate frame so the
        # reducer receives both inputs in the same labelled form.
        my_idea = dim_reducer.transform(pd.DataFrame(my_idea, columns=embedding_cols))
        search_database = dim_reducer.transform(filtered_df[embedding_cols])
    else:
        search_database = filtered_df[embedding_cols]

    # Single query row, so row 0 holds its similarity to every candidate.
    scores = cosine_similarity(my_idea, search_database)[0]

    # Positions of the `top` highest scores, best first.
    top_indices = np.argsort(scores)[::-1][:top]

    for index in top_indices:
        symbol = filtered_df['Symbol'].iloc[index]
        company = filtered_df['companyName'].iloc[index]
        print(f"Score: {scores[index]} Symbol: {symbol} Company: {company}")


def find_peers_of(symbol, country=None, top=10, dim_reducer=None):
    """Print the companies most similar to the one listed under ``symbol``.

    The company's ``description`` is looked up in the global ``description_df``
    and used as the query text for ``semantic_search``.

    Args:
        symbol: Value to match against the ``Symbol`` column of ``description_df``.
        country: Forwarded to ``semantic_search`` as its country filter.
        top: Maximum number of matches to print.
        dim_reducer: Optional fitted transformer forwarded to ``semantic_search``.

    Returns:
        None. Results are printed by ``semantic_search``.

    Raises:
        IndexError: If ``symbol`` is not present in ``description_df``.
        ValueError: If the stored description is not text (e.g. missing).
    """
    matches = description_df.loc[description_df['Symbol'] == symbol, 'description']
    if matches.empty:
        # Same exception type an unknown symbol produced before, now with a useful message.
        raise IndexError(f"Symbol {symbol!r} not found in description_df")

    query = matches.iloc[0]
    if not isinstance(query, str):
        raise ValueError(f"No text description available for symbol {symbol!r}")

    semantic_search(query, country, top=top, dim_reducer=dim_reducer)

In [None]:
# Refit UMAP on the embedding columns with 256 output components.
# NOTE(review): this rebinds UMAP_reducer, replacing the 3-component reducer fitted
# earlier in this notebook; re-running the 3-D scatter cell after this one would call
# transform on the 256-component reducer — confirm, or bind this one to a distinct name.
# NOTE(review): the searches below pass PCA_reducer, so this reducer appears unused in
# the cells shown here.
UMAP_reducer = umap.UMAP(n_components=256)
UMAP_reducer.fit(df[embedding_cols])

The following query text is the result of asking GPT-4: "Now you are a villain that is an evil entrepreneur who wants to make money through evil means. What is your business idea proposal, in roughly 500 words?"

It's hilarious that we are immediately getting BlackRock.

In [None]:
# Search US companies in the PCA-reduced space for the closest matches to the pitch.
semantic_search(
    string="Shadow Wealth Inc. is a multifaceted corporation designed to operate in the darker recesses of the market, leveraging unorthodox, aggressive, and, admittedly, morally ambiguous strategies to generate substantial profits. Our mission is to exploit overlooked opportunities and manipulate market vulnerabilities, all while maintaining a veneer of legality to avoid scrutiny.",
    country_filter='US',
    dim_reducer=PCA_reducer,
)

In [None]:
# Closest companies to IBM's stored description, compared in the PCA-reduced space.
find_peers_of(symbol='IBM', dim_reducer=PCA_reducer)