## Data Ingestion | Preprocessing | Vectorization Pipeline 

In [12]:
import os

ROOT_DIR = 'C:\Development\EquitySenseAI'
os.chdir(ROOT_DIR)

In [13]:
from dotenv import load_dotenv, find_dotenv

DOT_ENV_PATH = find_dotenv()

load_dotenv(DOT_ENV_PATH)

True

In [14]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
TICKER_ID = "META"
COMPANY_NAME = "Meta Inc"
PINECONE_INDEX_NAME = "equity-sense-vdb"
SOURCE_FILE = 'financials/{}_income_statement.csv'.format(TICKER_ID)

### Preprocess Income Statement CSV Data

In [15]:
import pandas as pd

# Change Column Name for Better Readability
income_data = pd.read_csv(SOURCE_FILE)
income_data.rename({'Unnamed: 0':'Feature'}, axis=1, inplace=True)

In [16]:
features = income_data['Feature'].unique()

income_per_feature = {}
for feature in features:
    year = income_data[income_data['Feature'] == feature]
    income_per_feature[feature] = year

income_per_feature['Tax Effect Of Unusual Items']

Unnamed: 0,Feature,2023-12-31,2022-12-31,2021-12-31,2020-12-31,2019-12-31
0,Tax Effect Of Unusual Items,-64416000.0,-15795000.0,-23380000.0,-15738000.0,


In [17]:
features = list(income_per_feature.keys())
features

['Tax Effect Of Unusual Items',
 'Tax Rate For Calcs',
 'Normalized EBITDA',
 'Total Unusual Items',
 'Total Unusual Items Excluding Goodwill',
 'Net Income From Continuing Operation Net Minority Interest',
 'Reconciled Depreciation',
 'Reconciled Cost Of Revenue',
 'EBITDA',
 'EBIT',
 'Net Interest Income',
 'Interest Expense',
 'Interest Income',
 'Normalized Income',
 'Net Income From Continuing And Discontinued Operation',
 'Total Expenses',
 'Total Operating Income As Reported',
 'Diluted Average Shares',
 'Basic Average Shares',
 'Diluted EPS',
 'Basic EPS',
 'Diluted NI Availto Com Stockholders',
 'Average Dilution Earnings',
 'Net Income Common Stockholders',
 'Otherunder Preferred Stock Dividend',
 'Net Income',
 'Net Income Including Noncontrolling Interests',
 'Net Income Continuous Operations',
 'Tax Provision',
 'Pretax Income',
 'Other Income Expense',
 'Other Non Operating Income Expenses',
 'Gain On Sale Of Security',
 'Net Non Operating Interest Income Expense',
 'Tota

In [18]:
years = list(income_per_feature['Total Revenue'].columns[1:])
years

['2023-12-31', '2022-12-31', '2021-12-31', '2020-12-31', '2019-12-31']

### Data Transformation into Descriptive Rows with Financial Information for each feature. 

In [19]:
financial_data_list = []
for feature in features:
    content = "The {0} {1} ".format(COMPANY_NAME,feature)
    filtered = income_per_feature[feature]
    for year in years:
        content += "For the date {0}".format(year)
        row_value = filtered[year].reset_index(drop=True)[0] / 1000000000
        row = " was {0} Billions USD. ".format(row_value)
        content += row
    financial_data_list.append(content)
financial_data_list[:5]

['The Meta Inc Tax Effect Of Unusual Items For the date 2023-12-31 was -0.064416 Billions USD. For the date 2022-12-31 was -0.015795 Billions USD. For the date 2021-12-31 was -0.02338 Billions USD. For the date 2020-12-31 was -0.015738 Billions USD. For the date 2019-12-31 was nan Billions USD. ',
 'The Meta Inc Tax Rate For Calcs For the date 2023-12-31 was 1.7599999999999999e-10 Billions USD. For the date 2022-12-31 was 1.95e-10 Billions USD. For the date 2021-12-31 was 1.6700000000000002e-10 Billions USD. For the date 2020-12-31 was 1.22e-10 Billions USD. For the date 2019-12-31 was nan Billions USD. ',
 'The Meta Inc Normalized EBITDA For the date 2023-12-31 was 59.418 Billions USD. For the date 2022-12-31 was 37.771 Billions USD. For the date 2021-12-31 was 55.414 Billions USD. For the date 2020-12-31 was 39.662 Billions USD. For the date 2019-12-31 was nan Billions USD. ',
 'The Meta Inc Total Unusual Items For the date 2023-12-31 was -0.366 Billions USD. For the date 2022-12-31 

This are the first 5 rows for our new financial data python List.

### Convert list into LangChain Document Schema prior to Vectorization

In [20]:
from langchain.schema import Document

ticker_metadata = {"source": SOURCE_FILE, "ticker_id": TICKER_ID}

# Convert list of strings into LangChain Documents
financial_documents = [Document(page_content=row,metadata=ticker_metadata) for row in financial_data_list]
financial_documents

[Document(metadata={'source': 'financials/META_income_statement.csv', 'ticker_id': 'META'}, page_content='The Meta Inc Tax Effect Of Unusual Items For the date 2023-12-31 was -0.064416 Billions USD. For the date 2022-12-31 was -0.015795 Billions USD. For the date 2021-12-31 was -0.02338 Billions USD. For the date 2020-12-31 was -0.015738 Billions USD. For the date 2019-12-31 was nan Billions USD. '),
 Document(metadata={'source': 'financials/META_income_statement.csv', 'ticker_id': 'META'}, page_content='The Meta Inc Tax Rate For Calcs For the date 2023-12-31 was 1.7599999999999999e-10 Billions USD. For the date 2022-12-31 was 1.95e-10 Billions USD. For the date 2021-12-31 was 1.6700000000000002e-10 Billions USD. For the date 2020-12-31 was 1.22e-10 Billions USD. For the date 2019-12-31 was nan Billions USD. '),
 Document(metadata={'source': 'financials/META_income_statement.csv', 'ticker_id': 'META'}, page_content='The Meta Inc Normalized EBITDA For the date 2023-12-31 was 59.418 Bill

### Load Embedding Model for Vectorization

In [21]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model='text-embedding-ada-002')

### Loading Data into PineCone Vector Database

In [22]:
from tqdm.autonotebook import tqdm
from langchain_pinecone import PineconeVectorStore

index_name = PINECONE_INDEX_NAME

pinecone = PineconeVectorStore.from_documents(
    financial_documents, embeddings, index_name=index_name
)