## Data Ingestion | Preprocessing | Vectorization Pipeline 

In [56]:
import os

from openai import api_key

ROOT_DIR = 'C:\Development\EquitySenseAI'
os.chdir(ROOT_DIR)

In [57]:
from dotenv import load_dotenv, find_dotenv

DOT_ENV_PATH = find_dotenv()

load_dotenv(DOT_ENV_PATH)

True

In [58]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
TICKER_ID = "SBUX"
COMPANY_NAME = "STARBUCKS Corp"
PINECONE_INDEX_NAME = "equity-sense-vdb"
SOURCE_FILE = 'financials/{}_income_statement.csv'.format(TICKER_ID)

### Preprocess Income Statement CSV Data

In [59]:
import pandas as pd

# Change Column Name for Better Readability
income_data = pd.read_csv(SOURCE_FILE)
income_data.rename({'Unnamed: 0':'Feature'}, axis=1, inplace=True)

In [60]:
features = income_data['Feature'].unique()

income_per_feature = {}
for feature in features:
    year = income_data[income_data['Feature'] == feature]
    income_per_feature[feature] = year

income_per_feature['Tax Effect Of Unusual Items']

Unnamed: 0,Feature,2023-09-30,2022-09-30,2021-09-30,2020-09-30
0,Tax Effect Of Unusual Items,16402000.0,-10304000.0,149925600.0,-57412200.0


In [61]:
features = list(income_per_feature.keys())
features

['Tax Effect Of Unusual Items',
 'Tax Rate For Calcs',
 'Normalized EBITDA',
 'Total Unusual Items',
 'Total Unusual Items Excluding Goodwill',
 'Net Income From Continuing Operation Net Minority Interest',
 'Reconciled Depreciation',
 'Reconciled Cost Of Revenue',
 'EBITDA',
 'EBIT',
 'Net Interest Income',
 'Interest Expense',
 'Interest Income',
 'Normalized Income',
 'Net Income From Continuing And Discontinued Operation',
 'Total Expenses',
 'Rent Expense Supplemental',
 'Total Operating Income As Reported',
 'Diluted Average Shares',
 'Basic Average Shares',
 'Diluted EPS',
 'Basic EPS',
 'Diluted NI Availto Com Stockholders',
 'Net Income Common Stockholders',
 'Net Income',
 'Minority Interests',
 'Net Income Including Noncontrolling Interests',
 'Net Income Continuous Operations',
 'Tax Provision',
 'Pretax Income',
 'Other Income Expense',
 'Special Income Charges',
 'Gain On Sale Of Ppe',
 'Gain On Sale Of Business',
 'Restructuring And Mergern Acquisition',
 'Earnings From 

In [62]:
years = list(income_per_feature['Total Revenue'].columns[1:])
years

['2023-09-30', '2022-09-30', '2021-09-30', '2020-09-30']

### Data Transformation into Descriptive Rows with Financial Information for each feature. 

In [63]:
financial_data_list = []
for feature in features:
    content = "The {0} {1} ".format(COMPANY_NAME,feature)
    filtered = income_per_feature[feature]
    for year in years:
        content += "For the date {0}".format(year)
        row_value = filtered[year].reset_index(drop=True)[0] / 1000000000
        row = " was {0} Billions USD. ".format(row_value)
        content += row
    financial_data_list.append(content)
financial_data_list[:5]

['The STARBUCKS Corp Tax Effect Of Unusual Items For the date 2023-09-30 was 0.016402 Billions USD. For the date 2022-09-30 was -0.010304 Billions USD. For the date 2021-09-30 was 0.1499256 Billions USD. For the date 2020-09-30 was -0.0574122 Billions USD. ',
 'The STARBUCKS Corp Tax Rate For Calcs For the date 2023-09-30 was 2.36e-10 Billions USD. For the date 2022-09-30 was 2.24e-10 Billions USD. For the date 2021-09-30 was 2.16e-10 Billions USD. For the date 2020-09-30 was 2.0599999999999999e-10 Billions USD. ',
 'The STARBUCKS Corp Normalized EBITDA For the date 2023-09-30 was 7.3328 Billions USD. For the date 2022-09-30 was 6.2902 Billions USD. For the date 2021-09-30 was 6.6567 Billions USD. For the date 2020-09-30 was 3.3833 Billions USD. ',
 'The STARBUCKS Corp Total Unusual Items For the date 2023-09-30 was 0.0695 Billions USD. For the date 2022-09-30 was -0.046 Billions USD. For the date 2021-09-30 was 0.6941 Billions USD. For the date 2020-09-30 was -0.2787 Billions USD. ',


This are the first 5 rows for our new financial data python List.

### Convert list into LangChain Document Schema prior to Vectorization

In [64]:
from langchain.schema import Document

ticker_metadata = {"source": SOURCE_FILE, "ticker_id": TICKER_ID}

# Convert list of strings into LangChain Documents
financial_documents = [Document(page_content=row,metadata=ticker_metadata) for row in financial_data_list]
financial_documents

[Document(metadata={'source': 'financials/SBUX_income_statement.csv', 'ticker_id': 'SBUX'}, page_content='The STARBUCKS Corp Tax Effect Of Unusual Items For the date 2023-09-30 was 0.016402 Billions USD. For the date 2022-09-30 was -0.010304 Billions USD. For the date 2021-09-30 was 0.1499256 Billions USD. For the date 2020-09-30 was -0.0574122 Billions USD. '),
 Document(metadata={'source': 'financials/SBUX_income_statement.csv', 'ticker_id': 'SBUX'}, page_content='The STARBUCKS Corp Tax Rate For Calcs For the date 2023-09-30 was 2.36e-10 Billions USD. For the date 2022-09-30 was 2.24e-10 Billions USD. For the date 2021-09-30 was 2.16e-10 Billions USD. For the date 2020-09-30 was 2.0599999999999999e-10 Billions USD. '),
 Document(metadata={'source': 'financials/SBUX_income_statement.csv', 'ticker_id': 'SBUX'}, page_content='The STARBUCKS Corp Normalized EBITDA For the date 2023-09-30 was 7.3328 Billions USD. For the date 2022-09-30 was 6.2902 Billions USD. For the date 2021-09-30 was 

### Load Embedding Model for Vectorization

In [65]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model='text-embedding-ada-002')

### Loading Data into PineCone Vector Database

In [66]:
from tqdm.autonotebook import tqdm
from langchain_pinecone import PineconeVectorStore

index_name = PINECONE_INDEX_NAME

pinecone = PineconeVectorStore.from_documents(
    financial_documents, embeddings, index_name=index_name
)