In [None]:
import os
import openai
import pandas as pd
import pandasql as psql
from fpdf import FPDF
import matplotlib.pyplot as plt
import sys

# -------------------------------
# 1. Securely Load OpenAI API Key
# -------------------------------

# Read the key from the environment instead of embedding it in the source:
# a literal key ends up in version control / shared notebooks, and a
# placeholder literal is always truthy, so the check below could never fire.
openai_api_key = os.environ.get("OPENAI_API_KEY", "").strip()

if not openai_api_key:
    raise ValueError("OpenAI API key not found. Please set the 'OPENAI_API_KEY' environment variable.")

openai.api_key = openai_api_key
# -------------------------------

# -------------------------------
# 2. Load CSV Files into DataFrames
# -------------------------------

def load_data():
    """Load the five Zacks CSV extracts from the current working directory.

    Prints the shape of every table and the dtypes of ``t_zacks_mktv`` as a
    quick sanity check, then normalises ``t_zacks_mktv``: column names are
    stripped of surrounding whitespace and, if ``per_end_date`` has a
    datetime dtype, it is rendered as ``YYYY-MM-DD`` text.

    Returns:
        dict: Table name -> DataFrame for ``t_zacks_fc``, ``t_zacks_fr``,
        ``t_zacks_mktv``, ``t_zacks_shrs`` and ``t_zacks_sectors``. An empty
        dict is returned (after printing the error) if anything fails.
    """
    table_names = (
        "t_zacks_fc",
        "t_zacks_fr",
        "t_zacks_mktv",
        "t_zacks_shrs",
        "t_zacks_sectors",
    )
    try:
        tables = {name: pd.read_csv(f"{name}.csv") for name in table_names}
        print("CSV files loaded successfully.")
        print("\nDataFrame Shapes:")
        for name, frame in tables.items():
            print(f"{name}: {frame.shape}")

        t_zacks_mktv = tables["t_zacks_mktv"]
        print("\nDataFrame dtypes:")
        print(f"t_zacks_mktv:\n{t_zacks_mktv.dtypes}")

        # Strip the column names BEFORE looking up 'per_end_date': with a
        # padded header (e.g. " per_end_date ") the lookup below would raise
        # KeyError and the whole load would be reported as failed.
        t_zacks_mktv.columns = t_zacks_mktv.columns.str.strip()

        # Convert 'per_end_date' to string format if it's datetime type
        if pd.api.types.is_datetime64_any_dtype(t_zacks_mktv['per_end_date']):
            t_zacks_mktv['per_end_date'] = t_zacks_mktv['per_end_date'].dt.strftime('%Y-%m-%d')

        return tables
    except Exception as e:
        # Best-effort loader: callers treat an empty dict as "could not load".
        print(f"Error loading CSV files: {e}")
        return {}

# -------------------------------
# 3. Generate SQL Query from Natural Language
# -------------------------------

def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    inner = text[3:]
    if inner.endswith("```"):
        inner = inner[:-3]
    words = inner.split(None, 1)
    # The opening fence may carry a language tag ("```sql"); no SQL statement
    # starts with that word, so it is safe to drop it.
    if words and words[0].lower() in ("sql", "sqlite"):
        inner = words[1] if len(words) > 1 else ""
    return inner.strip()


def generate_sql_query(user_question, dataframes):
    """
    Uses OpenAI API to generate a SQL query based on the user's natural language question.

    The schema description and the worked examples sent to the model are
    hard-coded in the prompt below. The model's reply is returned with any
    surrounding Markdown code fence removed, so it can be executed directly.

    Args:
        user_question (str): The user's input question in natural language.
        dataframes (dict): Dictionary of DataFrames with table names as keys.
            Currently not consulted (the prompt carries the schema); kept so
            existing callers keep working.

    Returns:
        str: The generated SQL query, or an empty string if the API call failed.
    """
    # Initialize sql_query to ensure it's always defined
    sql_query = ""

    prompt = f"""
Pretend you are an expert at converting natural language questions into accurate SQL queries. Please generate an accurate SQL query based on the following natural language question and database schema provided below. Think sequentially and refer to the sample natural language questions with correct and incorrect outputs as well.


Database Schema:
Table 1: t_zacks_fc (This table contains fundamental indicators for companies)
Columns: 'ticker' = Unique zacks Identifier for each company/stock, ticker or trading symbol, 'comp_name' = Company name, 'exchange' = Exchange traded, 'per_end_date' = Period end date which represents quarterly data, 'per_type' = Period type (eg. Q for quarterly data), 'filing_date' = Filing date, 'filing_type' = Filing type: 10-K, 10-Q, PRELIM, 'zacks_sector_code' = Zacks sector code (Numeric Value eg. 11 = Aerospace), 'eps_diluted_net_basic’ = Earnings per share (EPS) net (Company's net earnings or losses attributable to common shareholders per basic share basis), 'lterm_debt_net_tot' = Net long-term debt (The net amount of long term debt issued and repaid. This field is either calculated as the sum of the long term debt fields or used if a company does not report debt issued and repaid separately).
Keys: ticker, per_end_date, per_type
Table 2: t_zacks_fr (This table contains fundamental ratios for companies)
Columns: 'ticker' = Unique zacks Identifier for each company/stock, ticker or trading symbol, 'per_end_date' = Period end date which represents quarterly data, 'per_type' = Period type (eg. Q for quarterly data), ‘ret_invst’ = Return on investments (An indicator of how profitable a company is relative to its assets invested by shareholders and long-term bond holders. Calculated by dividing a company's operating earnings by its long-term debt and shareholders equity), ‘tot_debt_tot_equity’ = Total debt / total equity (A measure of a company's financial leverage calculated by dividing its long-term debt by stockholders' equity).
Keys: ticker, per_end_date, per_type.
Table 3: t_zacks_mktv (This table contains market value data for companies)
Columns: 'ticker' = Unique zacks Identifier for each company/stock, ticker or trading symbol, 'per_end_date' = Period end date which represents quarterly data, 'per_type' = Period type (eg. Q for quarterly data), ‘mkt_val’ = Market Cap of Company (shares out x last monthly price per share - unit is in Millions).
Keys: ticker, per_end_date, per_type.
Table 4: t_zacks_shrs (This table contains shares outstanding data for companies)
Columns: 'ticker' = Unique zacks Identifier for each company/stock, ticker or trading symbol, 'per_end_date' = Period end date which represents quarterly data, 'per_type' = Period type (eg. Q for quarterly data), ‘shares_out’ = Number of Common Shares Outstanding from the front page of 10K/Q.
Keys: ticker, per_end_date, per_type.
Table 5: t_zacks_sectors (This table contains the zacks sector codes and their corresponding sectors)
Columns: 'zacks_sector_codes' = Unique identifier for each zacks sector, 'sector': the sector descriptions that correspond to the sector code 
Keys: zacks_sector_code 

Sample natural language questions with correct and incorrect outputs: 
Sample prompt 1: Output ticker with the largest market value recorded on any given period end date. 
Correct output for prompt 1: SELECT ticker, per_end_date, MAX(mkt_val) AS max_market_value FROM t_zacks_mktv GROUP BY per_end_date ORDER BY max_market_value DESC LIMIT 1;
Incorrect output for prompt 1: SELECT MAX(mkt_val) , ticker FROM t_zacks_mktv GROUP BY ticker

Sample prompt 2: What is the company name with the lowest market cap?
Correct output for prompt 2: SELECT fc.comp_name, mktv.ticker, mktv.mkt_val FROM t_zacks_mktv AS mktv JOIN t_zacks_fc AS fc ON mktv.ticker = fc.ticker WHERE mktv.mkt_val = (SELECT MIN(mkt_val) FROM t_zacks_mktv);
Incorrect output for prompt 2:  SELECT T1.comp_name FROM t_zacks_fc AS T1 INNER JOIN t_zacks_mktv AS T2 ON T1.ticker = T2.ticker AND T1.per_end_date = T2.per_end_date AND T1.per_type = T2.per_type ORDER BY T2.mkt_val LIMIT 1

Sample prompt 3: Filter t_zacks_fc to only show companies with a total debt-to-equity ratio greater than 1.
Correct output for prompt 3: SELECT * FROM t_zacks_fr WHERE tot_debt_tot_equity > 1;
Incorrect output for prompt 3: SELECT * FROM t_zacks_fr WHERE t_zacks_mktv > 1;

Sample prompt 4: Filter t_zacks_shrs to include companies with more than 500 million shares outstanding as of the most recent quarter.
Correct output for prompt 4: SELECT *
FROM t_zacks_shrs
WHERE shares_out > 5000
ORDER BY per_end_date DESC;
Incorrect output for prompt 4: SELECT * FROM t_zacks_shrs WHERE shares_out > 500000000

Sample prompt 5: Combine t_zacks_mktv and t_zacks_shrs to show tickers with market cap and shares outstanding in the latest period end date.
Correct output for prompt 5: SELECT mktv.ticker, mktv.per_end_date, mktv.mkt_val, shrs.shares_out
FROM t_zacks_mktv mktv
JOIN t_zacks_shrs shrs ON mktv.ticker = shrs.ticker AND mktv.per_end_date = shrs.per_end_date
ORDER BY mktv.per_end_date DESC;
Incorrect output for prompt 5: SELECT ticker, mkt_val, shares_out FROM t_zacks_mktv INNER JOIN t_zacks_shrs ON t_zacks_mktv.ticker = t_zacks_shrs.ticker AND t_zacks_mktv.per_end_date = t_zacks_shrs.per_end_date ORDER BY per_end_date DESC LIMIT 1

Sample prompt 6: Join t_zacks_fc and t_zacks_fr to show tickers with total debt-to-equity ratios and EPS from NASDAQ as of Q2 2024.
Correct output for prompt 6: SELECT fc.ticker, fc.eps_diluted_net_basic, fr.tot_debt_tot_equity
FROM t_zacks_fc fc
JOIN t_zacks_fr fr ON fc.ticker = fr.ticker AND fc.per_end_date = fr.per_end_date
WHERE fc.exchange = 'NASDAQ' AND fc.per_type = 'Q' AND fc.per_end_date BETWEEN '2024-04-01' AND '2024-06-30';
Incorrect output for prompt 6: SELECT T1.ticker, T1.eps_diluted_net_basic, T2.ret_invst, T2.tot_debt_tot_equity FROM t_zacks_fc AS T1 INNER JOIN t_zacks_fr AS T2 ON T1.ticker = T2.ticker AND T1.per_end_date = T2.per_end_date WHERE T1.exchange = 'NASDAQ' AND T1.per_type = 'Q2';

User's Question:
{user_question}

Please do this task step by step and provide only the SQL query without any explanations.
"""

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            temperature=0.7,
            n=1,
            stop=None
        )
        # Chat models frequently wrap code in ```sql ... ``` even when asked
        # not to; the fence is not valid SQL, so remove it before returning.
        sql_query = _strip_code_fence(response.choices[0].message.content)
        print("\nGenerated SQL Query:")
        print(sql_query)
    except openai.OpenAIError as e:
        print(f"An error occurred while generating SQL query: {e}")
    return sql_query


# -------------------------------
# 4. Execute SQL Query on DataFrames
# -------------------------------

def run_query(query, dataframes):
    """Execute a SQL string against the loaded tables through pandasql.

    Args:
        query (str): SQL text to run.
        dataframes (dict): Table name -> DataFrame mapping handed to
            ``psql.sqldf`` as the environment in which table names resolve.

    Returns:
        pd.DataFrame: The query result, or an empty DataFrame if execution
        failed for any reason (the error is printed, not raised).
    """
    try:
        rows = psql.sqldf(query, dataframes)
        print("\nSQL query executed successfully.")
        row_count = rows.shape[0]
        print(f"Number of records retrieved: {row_count}")
        return rows
    except Exception as exc:
        # Any failure is reported and mapped to "no rows".
        print(f"Error executing SQL query: {exc}")
        return pd.DataFrame()

# -------------------------------
# 5. Generate Analysis with OpenAI API
# -------------------------------

def generate_analysis_from_openai(dataframe, user_question, max_rows=50):
    """Ask the OpenAI chat model for an analyst-style commentary on a query result.

    Args:
        dataframe (pd.DataFrame): Result of the executed SQL query.
        user_question (str): The natural-language question that produced it.
        max_rows (int | None): Upper bound on the number of rows embedded in
            the prompt. Unbounded result sets (thousands of rows) would exceed
            the model's context window, so only the first ``max_rows`` rows are
            sent and the prompt says so. Pass ``None`` to send everything.

    Returns:
        str: The generated analysis, a fixed notice when ``dataframe`` is
        empty, or an error message if the API call failed.
    """
    if dataframe.empty:
        return "No data available for analysis."

    total_rows = len(dataframe)
    sample = dataframe if max_rows is None else dataframe.head(max_rows)
    try:
        table_md = sample.to_markdown(index=False)
    except ImportError:
        # DataFrame.to_markdown relies on the optional 'tabulate' package;
        # fall back to plain-text rendering rather than crashing the session.
        table_md = sample.to_string(index=False)
    if len(sample) < total_rows:
        table_md += f"\n\n(Showing the first {len(sample)} of {total_rows} rows.)"

    prompt = f"""
I have executed a SQL query based on the following user question and obtained the data below.

User's Question:
{user_question}

Data Table:
{table_md}

Pretend you are an experienced equity analyst working in the banking industry. Please analyze this data in the style of an expert equity analyst, highlighting trends, comparing companies, analyzing significance of metrics, and noting any interesting insights regarding this data.
"""

    try:
        # Make the API call to OpenAI
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            temperature=0.7,
            n=1,
            stop=None
        )
        # Same access style as generate_sql_query for consistency.
        analysis = response.choices[0].message.content.strip()
        print("\nGenerated Analysis:")
        print(analysis)
        return analysis
    except openai.OpenAIError as e:
        return f"An error occurred while generating analysis: {e}"

# -------------------------------
# 6. Define PDF Generation Class
# -------------------------------

class PDF(FPDF):
    """FPDF subclass that lays out the equity analyst report.

    All text is drawn with the built-in "Arial" core font, which can only
    encode Latin-1. Text coming from the language model or from query results
    is therefore passed through ``_latin1`` before it is written, so a stray
    curly quote or dash cannot abort report generation.
    """

    @staticmethod
    def _latin1(text):
        """Return ``str(text)`` with characters outside Latin-1 replaced by '?'."""
        return str(text).encode("latin-1", "replace").decode("latin-1")

    def _fit_text(self, text, width):
        """Return Latin-1 safe *text*, shortened with '...' to fit a cell of *width*.

        Uses the currently selected font for measuring.
        """
        text = self._latin1(text)
        usable = width - 2 * self.c_margin
        if self.get_string_width(text) <= usable:
            return text
        while text and self.get_string_width(text + "...") > usable:
            text = text[:-1]
        return text + "..."

    def header(self):
        """Draw the report title at the top of every page."""
        self.set_font("Arial", "B", 16)
        self.cell(0, 10, "Equity Analyst Report", align="C", ln=True)
        self.ln(10)

    def chapter_title(self, title):
        """Write a bold, left-aligned section heading."""
        self.set_font("Arial", "B", 12)
        self.cell(0, 10, self._latin1(title), 0, 1, "L")
        self.ln(4)

    def chapter_body(self, body):
        """Write *body* as wrapped paragraphs, one per input line."""
        self.set_font("Arial", "", 12)
        for line in self._latin1(body).split('\n'):
            self.multi_cell(0, 10, line)
        self.ln()

    def table(self, data):
        """Render *data* (a DataFrame) as a bordered grid.

        Column widths come from ``calculate_col_widths`` and are scaled down
        proportionally when their sum exceeds the printable page width, so
        wide result sets no longer run off the right edge. Cell values longer
        than 15 characters are truncated, and every label is additionally
        shortened to fit its cell.
        """
        if data.empty:
            self.set_font("Arial", "I", 12)
            self.cell(0, 10, "No data available to display.", 0, 1, 'C')
            self.ln()
            return

        self.set_font("Arial", "B", 10)
        col_widths = self.calculate_col_widths(data)
        columns = list(data.columns)

        # Shrink all columns by the same factor if the row would not fit
        # between the page margins.
        printable = self.w - self.l_margin - self.r_margin
        total = sum(col_widths[col] for col in columns)
        if total > printable:
            scale = printable / total
            col_widths = {col: width * scale for col, width in col_widths.items()}

        # Add table headers
        for header in columns:
            width = col_widths[header]
            self.cell(width, 10, self._fit_text(header, width), 1, 0, 'C')
        self.ln()

        # Add table rows. Iterating positionally (instead of row[header])
        # keeps duplicate column names -- common after SQL joins -- from
        # yielding a Series per cell, and keeps each value's own type.
        self.set_font("Arial", "", 10)
        for values in data.itertuples(index=False, name=None):
            for header, value in zip(columns, values):
                cell_text = str(value) if pd.notnull(value) else ""
                # Truncate text if it's too long
                if len(cell_text) > 15:
                    cell_text = cell_text[:12] + '...'
                width = col_widths[header]
                self.cell(width, 10, self._fit_text(cell_text, width), 1, 0, 'C')
            self.ln()

        self.ln()

    def calculate_col_widths(self, data):
        """Return ``{column name: width}`` for *data*.

        Each width is twice the longest text length in the column (header
        included), clamped to the range 30..60. Columns sharing a name get
        the widest of their individual widths.
        """
        col_widths = {}
        for position, col in enumerate(data.columns):
            values = data.iloc[:, position]
            max_length = values.astype(str).map(len).max() if len(values) else 0
            header_length = len(str(col))
            col_width = max(max_length, header_length) * 2
            col_width = min(max(col_width, 30), 60)
            col_widths[col] = max(col_width, col_widths.get(col, 0))
        return col_widths

    def add_image(self, image_path, title, width=100):
        """
        Adds an image to the PDF with an optional title.

        Nothing is added (a message is printed) when the file does not exist.

        Args:
            image_path (str): Path to the image file.
            title (str): Title for the image.
            width (int): Width of the image in the PDF.
        """
        if not os.path.exists(image_path):
            print(f"Image file {image_path} does not exist.")
            return

        self.chapter_title(title)
        self.image(image_path, w=width)
        self.ln(10)

# -------------------------------
# 7. Generate Charts
# -------------------------------

def _render_chart(path, figsize, draw):
    """Create a figure, let ``draw(ax)`` populate it, save it to *path*, always close it."""
    fig, ax = plt.subplots(figsize=figsize)
    try:
        draw(ax)
        fig.savefig(path)
    finally:
        # Close even when drawing/saving fails so figures do not accumulate
        # across questions in a long chat session.
        plt.close(fig)
    return path


def generate_charts(dataframe, output_dir="charts"):
    """
    Generates and saves pie and bar charts based on the provided DataFrame.

    Charts are only produced when the data has both a 'ticker' and a
    'mkt_val' column; market values are summed per ticker and the five
    largest are plotted.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing market value data.
        output_dir (str): Directory where the charts will be saved. Created if
            missing (unless ``dataframe`` is empty).

    Returns:
        list: List of paths to the generated chart images (possibly empty).
    """
    if dataframe.empty:
        print("No data available to generate charts.")
        return []

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    chart_paths = []

    if 'mkt_val' not in dataframe.columns or 'ticker' not in dataframe.columns:
        print("Required columns for chart generation ('mkt_val', 'ticker') are not present in the data.")
        return chart_paths

    try:
        # Aggregate data by 'ticker' to ensure unique companies
        aggregated_data = dataframe.groupby('ticker', as_index=False)['mkt_val'].sum()

        # Select top 5 companies by market value
        top_companies = aggregated_data.nlargest(5, 'mkt_val')

        print("\nTop 5 Companies by Market Value:")
        print(top_companies)

        def draw_pie(ax):
            ax.pie(top_companies['mkt_val'], labels=top_companies['ticker'], autopct='%1.1f%%', startangle=140)
            ax.set_title('Market Value Distribution Among Top 5 Companies')

        def draw_bar(ax):
            ax.bar(top_companies['ticker'], top_companies['mkt_val'], color='skyblue')
            ax.set_xlabel('Ticker')
            ax.set_ylabel('Market Value')
            ax.set_title('Market Value of Top 5 Companies')

        # --- Pie Chart ---
        pie_chart_path = os.path.join(output_dir, 'market_value_pie_chart.png')
        chart_paths.append(_render_chart(pie_chart_path, (6, 6), draw_pie))

        # --- Bar Chart ---
        bar_chart_path = os.path.join(output_dir, 'market_value_bar_chart.png')
        chart_paths.append(_render_chart(bar_chart_path, (8, 6), draw_bar))

        print(f"\nCharts generated and saved in the '{output_dir}' directory.")
    except Exception as e:
        # Charts are optional; report the problem and return what was saved.
        print(f"Error generating charts: {e}")

    return chart_paths

# -------------------------------
# 8. Interactive Chat with OpenAI API
# -------------------------------

def interactive_chat(dataframes):
    """
    Initiates an interactive conversation, allowing natural language queries.

    For every question the loop generates SQL, runs it, asks for an analysis,
    renders charts and writes ``equity_analyst_report.pdf`` (overwritten each
    time). 'exit'/'quit' or end-of-input ends the loop; 'help'/'h' prints
    example questions; blank lines are ignored.

    Args:
        dataframes (dict): Dictionary of DataFrames with table names as keys.
    """
    print("\nStart chatting with the assistant. You can ask questions about the data.")
    print("Type 'exit' or 'quit' to stop.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except EOFError:
            # stdin was closed (piped input, Ctrl-D): treat it like 'exit'
            # instead of crashing with a traceback.
            print("\nEnding the chat. Goodbye!")
            break

        if not user_input:
            # Do not spend an API call on an empty question.
            continue

        command = user_input.lower()
        if command in ["exit", "quit"]:
            print("Ending the chat. Goodbye!")
            break
        elif command in ["help", "h"]:
            print("\nYou can ask questions related to the data, such as:")
            print("- What are the top 5 companies by market value?")
            print("- Show me the financial metrics for company X.")
            print("- Compare the market values of companies in the technology sector.\n")
            continue

        # Generate SQL query from user input
        sql_query = generate_sql_query(user_input, dataframes)
        if not sql_query:
            print("Failed to generate SQL query. Please try a different question.")
            continue

        # Execute SQL query
        query_result = run_query(sql_query, dataframes)
        if query_result.empty:
            print("No data returned from the SQL query.")
            continue

        print("\nQuery Result:")
        print(query_result)

        # Generate analysis
        analysis_text = generate_analysis_from_openai(query_result, user_input)

        # Generate charts
        chart_paths = generate_charts(query_result)

        # Initialize PDF
        pdf_filename = "equity_analyst_report.pdf"
        pdf = PDF()
        pdf.add_page()

        # Generate PDF Report with Charts. generate_pdf_report prints its own
        # success or failure message, so nothing is printed here: repeating
        # "saved" after a failed save would be misleading.
        generate_pdf_report(pdf, analysis_text, query_result, chart_paths, filename=pdf_filename)

# -------------------------------
# 9. Generate PDF Report
# -------------------------------

def generate_pdf_report(pdf, analysis_text, data_table, chart_paths, filename="equity_analyst_report.pdf"):
    """
    Assemble the report sections on ``pdf`` and write it to disk.

    Sections are emitted in this order: a fixed overview paragraph, the
    analysis text, the charts (only if any were supplied) and the data table.
    A failure while saving is printed rather than raised.

    Args:
        pdf (PDF): An instance of the PDF class.
        analysis_text (str): The analysis text generated by OpenAI.
        data_table (pd.DataFrame): The DataFrame containing query results.
        chart_paths (list): List of paths to chart images.
        filename (str): The filename for the generated PDF.
    """
    # Overview
    pdf.chapter_title("Overview of Selected Companies")
    pdf.chapter_body(
        "This report provides an analysis of selected companies based on the user's query, including data on revenue, net income, and market capitalization."
    )

    # Analysis
    pdf.chapter_title("Analysis")
    pdf.chapter_body(analysis_text)

    # Charts: the caption is picked from the first marker found in the
    # lower-cased path, falling back to a generic one.
    if chart_paths:
        pdf.chapter_title("Visualizations")
        captions = (
            ('pie_chart', "Market Value Distribution Pie Chart"),
            ('bar_chart', "Market Value Comparison Bar Chart"),
        )
        for image_path in chart_paths:
            lowered = image_path.lower()
            caption = next(
                (text for marker, text in captions if marker in lowered),
                "Chart",
            )
            pdf.add_image(image_path, caption, width=100)  # Adjust width as needed

    # Data table
    pdf.chapter_title("Company Financial Data")
    pdf.table(data_table)

    # Save PDF
    try:
        pdf.output(filename)
        print(f"Report generated and saved as {filename}")
    except Exception as save_error:
        print(f"Error saving PDF: {save_error}")

# -------------------------------
# 10. Main Function to Integrate All Functionalities
# -------------------------------

def main():
    """Load the CSV tables and, if that succeeded, start the interactive chat."""
    dataframes = load_data()
    if dataframes:
        # Tables are available: hand over to the chat loop.
        interactive_chat(dataframes)
        return

    # load_data() signals failure with an empty dict.
    print("Failed to load data. Exiting.")

# -------------------------------
# 11. Entry Point
# -------------------------------

# Run the application when executed as a script; Ctrl-C ends it with a short
# message and a normal exit instead of a traceback.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nProgram interrupted by user. Exiting.")
        raise SystemExit


CSV files loaded successfully.

DataFrame Shapes:
t_zacks_fc: (7504, 10)
t_zacks_fr: (7223, 5)
t_zacks_mktv: (9548, 4)
t_zacks_shrs: (9596, 4)
t_zacks_sectors: (18, 2)

DataFrame dtypes:
t_zacks_mktv:
ticker           object
per_end_date     object
per_type         object
mkt_val         float64
dtype: object

Start chatting with the assistant. You can ask questions about the data.
Type 'exit' or 'quit' to stop.

An error occurred while generating SQL query: Incorrect API key provided: your API*here. You can find your API key at https://platform.openai.com/account/api-keys.
Failed to generate SQL query. Please try a different question.
