In [None]:
# Install spaCy if not already installed
!pip install -U spacy

# Download spaCy English model
!python -m spacy download en_core_web_sm


Collecting spacy
  Downloading spacy-3.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.2
    Uninstalling spacy-3.7.2:
      Successfully uninstalled spacy-3.7.2
Successfully installed spacy-3.7.3
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 15.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load al

In [1]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
import re

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This cell relies on the google.colab package, which is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Path to the Excel workbook (.xlsx) inside the mounted Drive; update it to match your own Drive layout
file_path = '/content/drive/MyDrive/AAPL Financial Report.xlsx'

In [4]:
# Load the financial dataset from Google Drive.
# index_col=0 turns the first sheet column (the metric names) into the row index,
# leaving one column per year.
financial_data = pd.read_excel(file_path, index_col=0)
# Bare last expression so the notebook renders the frame as a table
financial_data

FT / Year,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009
Market Cap(in B USD),2066.94,2913.28,2255.97,1304.76,748.54,868.87,617.59,586.86,647.36,504.79,500.61,376.4,296.89,189.8
Revenue,394328.0,365817.0,274515.0,260174.0,265595.0,229234.0,215639.0,233715.0,182795.0,170910.0,156508.0,108249.0,65225.0,42905.0
Gross Profit,170782.0,152836.0,104956.0,98392.0,101839.0,88186.0,84263.0,93626.0,70537.0,64304.0,68662.0,43818.0,25684.0,17222.0
Net Income,99803.0,94680.0,57411.0,55256.0,59531.0,48351.0,45687.0,53394.0,39510.0,37037.0,41733.0,25922.0,14013.0,8235.0
Earning Per Share,6.11,5.61,3.28,2.97,2.98,2.3025,2.0775,2.305,1.6125,1.42,1.5775,0.9886,0.5411,0.3243
EBITDA,130541.0,120233.0,77344.0,76477.0,81801.0,71501.0,70529.0,82487.0,60449.0,55756.0,58518.0,35604.0,19412.0,12474.0
Share Holder Equity,50672.0,63090.0,65339.0,90488.0,107147.0,134047.0,128249.0,119355.0,111547.0,123549.0,118210.0,76615.0,47791.0,31640.0
Cash Flow from Operating,122151.0,104038.0,80674.0,69391.0,77434.0,64225.0,66231.0,81266.0,59713.0,53666.0,50856.0,37529.0,18595.0,10159.0
Cash Flow from Investing,-22354.0,-14545.0,-4289.0,45896.0,16066.0,-46446.0,-45977.0,-56274.0,-22579.0,-33774.0,-48227.0,-40419.0,-13854.0,-17434.0
Cash Flow from Financial Activities,-110749.0,-93353.0,-86820.0,-90976.0,-87876.0,-17974.0,-20890.0,-17716.0,-37549.0,-16379.0,-1698.0,1444.0,1257.0,663.0


In [None]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")
# Initialize the matcher. With its default attribute (ORTH) phrases are compared
# on exact token text, so matching is case-sensitive.
matcher = PhraseMatcher(nlp.vocab)
# Define phrases for each row in the dataset (financial terms)
terms = list(financial_data.index)
# ORTH matching only needs tokenization, so make_doc is enough and avoids
# running the rest of the pipeline on every pattern.
term_patterns = [nlp.make_doc(term) for term in terms]

# Add row patterns to the matcher. spaCy v3 expects the patterns as one list;
# add(key, None, *docs) is the legacy v2 call form.
matcher.add("FinancialTerms", term_patterns)

# Function to extract information from the dataset based on user question
def extract_information(question):
    """Answer a question about one metric stored in ``financial_data``.

    The question is run through the module-level ``nlp`` pipeline and phrase
    ``matcher`` to find a row label of ``financial_data``. The value reported
    is taken from the first column whose label is mentioned in the question;
    when no column is mentioned, the first column of the frame is used.

    Args:
        question: Free-text question typed by the user.

    Returns:
        str: ``"The <term> in <column> is <value>."`` for the first matched
        term that is an exact row label, otherwise a fixed "couldn't find"
        message. The function never returns ``None``.
    """
    not_found = "I couldn't find any relevant information in the dataset."
    doc = nlp(question)

    for _match_id, start, end in matcher(doc):
        matched_text = doc[start:end].text
        if matched_text not in terms:
            # The matched span is not spelled exactly like a row label; try the
            # next match instead of falling through and returning None.
            continue

        # Extract relevant information from the dataset
        for col in financial_data.columns:
            # Digit lookarounds keep a label such as "2019" from matching
            # inside a longer number such as "120190".
            if re.search(rf"(?<!\d){re.escape(str(col))}(?!\d)", question):
                value = financial_data.loc[matched_text, col]
                return f"The {matched_text} in {col} is {value}."

        # If no specific year mentioned, default to latest available year
        latest_year = financial_data.columns[0]  # Latest year is the first column
        latest_value = financial_data.loc[matched_text, latest_year]
        return f"The {matched_text} in {latest_year} is {latest_value}."

    return not_found

# Function to handle the conversation
def financial_qa_system():
    """Run the interactive Investo chat loop on standard input/output.

    Each line typed by the user is handled as one of:
      * an exit word ("exit", "quit", "bye") - say goodbye and stop;
      * a greeting ("hello", "hi", "hey", "hola", "greetings") - greet back;
      * anything else - forwarded to ``extract_information`` and the answer
        is printed.

    Commands are compared after trimming surrounding whitespace and
    lower-casing. The loop also ends cleanly when input is exhausted or the
    prompt is interrupted.

    Returns:
        None
    """
    exit_words = {"exit", "quit", "bye"}
    greetings = {"hello", "hi", "hey", "hola", "greetings"}

    print("Hello! I am Investo, Your Financial Question Answering Chatbot. How can I assist you today?")

    while True:
        try:
            user_input = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Closed input stream or interrupted prompt: end the chat instead of
            # surfacing a traceback. Leading newline moves off the prompt line.
            print("\nInvesto: Goodbye! Have a great day.")
            break

        # Normalise once so inputs like " Bye " or "HELLO" are recognised
        cleaned = user_input.strip()
        command = cleaned.lower()

        if command in exit_words:
            print("Investo: Goodbye! Have a great day.")
            break
        elif command in greetings:
            print("Investo: Hello! How can I help you today?")
        else:
            response = extract_information(cleaned)
            print("Investo:", response)

# Start the chat when this cell is executed (in a notebook kernel __name__ is "__main__");
# the guard keeps the loop from starting if this code is imported as a module.
if __name__ == "__main__":
    financial_qa_system()

Hello! I am Investo, Your Financial Question Answering Chatbot. How can I assist you today?
You: hello
Investo: Hello! How can I help you today?
You: Revenue in 2019
Investo: The Revenue in 2019 is 260174.0.
You: bye
Investo: Goodbye! Have a great day.
