In [79]:
import json
import os
import random
import re
from datetime import datetime, date, timedelta, timezone

import numpy as np
import pandas as pd
import requests
import torch
import yfinance as yf
from pandas.tseries.offsets import BDay
from transformers import AutoTokenizer, AutoModelForCausalLM,pipeline

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

In [8]:
# Load the chat model and its tokenizer once; the analyst classes in this
# notebook read `tokenizer` and `model` as module-level globals.
model_id = "Qwen/Qwen2.5-14B-Instruct"
torch.cuda.set_device(0)  # Sets default to GPU 0
# NOTE(review): `device` is not referenced anywhere else in the visible cells.
device=torch.device("cuda:0")
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    
    model_id,
    device_map={"": 0},             # the "" key maps the whole model onto device 0 (single GPU, no sharding)
    torch_dtype="auto",            # dtype is chosen by the library from the checkpoint rather than hard-coded here
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [80]:
class Technical_Analyst:
    """Compute per-ticker technical indicators and turn them into an LLM prompt.

    Construction normalises the date window, downloads prices with
    ``yf.download``, computes SMA/EMA/RSI/OBV per ticker and builds
    ``self.prompt``.  ``generate_response`` sends that prompt to the
    module-level ``tokenizer`` / ``model`` pair.

    Attributes set by ``__init__``:
        tickers: list of ticker symbols.
        start_date, end_date: analysis window as ``YYYY-MM-DD`` strings.
        fetch_start: download start (analysis start minus a warm-up buffer).
        data: the frame returned by ``yf.download``.
        indicators_processed: ``{ticker: DataFrame}`` of indicator columns.
        prompt: the user prompt built from the latest indicator row.
    """

    def __init__(self, tickers, start_date, end_date, min_lookback_bdays=50, ind_max_window=50):
        """Normalise the window, then download data and build indicators and prompt.

        Args:
            tickers: a single ticker string or a list of tickers.
            start_date, end_date: ``YYYY-MM-DD`` strings, ``date``/``datetime``
                objects, or anything ``pd.Timestamp`` accepts.  They are
                swapped if given in the wrong order.
            min_lookback_bdays: minimum number of business days the analysis
                window must span; the start is moved earlier if necessary.
            ind_max_window: largest rolling window used by the indicators;
                that many business days (plus 5) of extra history are fetched.
        """
        self.tickers = [tickers] if isinstance(tickers, str) else tickers
        start_dt = self.to_date(start_date)
        end_dt = self.to_date(end_date)

        # Ensure start <= end
        if end_dt < start_dt:
            start_dt, end_dt = end_dt, start_dt

        # Ensure at least N *business* days in the analysis window.  Comparing
        # against the business-day-shifted date avoids mixing calendar days
        # with a business-day count.
        min_start_dt = (pd.Timestamp(end_dt) - BDay(min_lookback_bdays)).date()
        if start_dt > min_start_dt:
            start_dt = min_start_dt

        # Keep analysis window (as strings, handy for APIs)
        self.start_date = start_dt.strftime("%Y-%m-%d")
        self.end_date = end_dt.strftime("%Y-%m-%d")

        # Fetch window: pull extra history so indicators with large windows are valid
        buffer_bdays = ind_max_window + 5  # small cushion
        fetch_start_dt = (pd.Timestamp(start_dt) - BDay(buffer_bdays)).date()
        self.fetch_start = fetch_start_dt.strftime("%Y-%m-%d")

        self.generate_df()
        self.generate_indicators()
        self.generate_technical_prompt()

    def generate_df(self):
        """Download price data for all tickers into ``self.data``.

        The download starts at ``self.fetch_start`` (not ``self.start_date``)
        so the rolling indicators have warm-up history; the resulting frames
        therefore also contain rows that precede the analysis window.
        """
        self.data = yf.download(self.tickers, start=self.fetch_start, end=self.end_date)

    def to_date(self, d):
        """Coerce ``d`` to a plain ``datetime.date``."""
        # ``datetime`` (and ``pd.Timestamp``) subclass ``date``, so they must be
        # handled first; otherwise they would be returned unchanged and later
        # compared against plain dates.
        if isinstance(d, datetime):
            return d.date()
        if isinstance(d, date):
            return d
        if isinstance(d, str):
            return datetime.strptime(d, "%Y-%m-%d").date()
        # numpy datetime64 etc.
        return pd.Timestamp(d).date()

    def compute_rsi(self, close, period=14):
        """Return the RSI of ``close`` using simple rolling means of gains and losses."""
        delta = close.diff()

        gain = delta.clip(lower=0)
        loss = -delta.clip(upper=0)

        avg_gain = gain.rolling(window=period).mean()
        avg_loss = loss.rolling(window=period).mean()

        rs = avg_gain / avg_loss
        rsi = 100 - (100 / (1 + rs))
        return rsi

    def compute_obv(self, close, volume):
        """Return on-balance volume: cumulative volume signed by the close-to-close move."""
        # Vectorised sign of the price change; the leading NaN (no previous
        # close) counts as "no move", matching a direction of 0.
        direction = np.sign(close.diff()).fillna(0).astype(int)
        obv = (volume * direction).fillna(0).cumsum()
        return obv

    def generate_indicators(self):
        """Populate ``self.indicators_processed`` with one indicator frame per ticker."""
        self.indicators_processed = {}
        for ticker in self.tickers:
            close = self.data['Close'][ticker]
            volume = self.data['Volume'][ticker]

            temp = pd.DataFrame()
            for window in (5, 15, 50):
                temp[f"SMA_{window}"] = close.rolling(window).mean()
            for span in (5, 10, 50):
                temp[f"EMA_{span}"] = close.ewm(span=span).mean()
            temp["Date"] = close.index
            temp["RSI"] = self.compute_rsi(close)
            temp["OBV"] = self.compute_obv(close, volume)
            self.indicators_processed[ticker] = temp

    def generate_technical_prompt(self):
        """Build, store and return the user prompt from each ticker's latest indicator row."""
        prompt = f""" Choose a recommendation for each stock. Respond in a vector of floats between [-1,1], -1 being Short and 1 being Strong Buy. 
Make these decisions based solely on the technical indicators given below for each stock. Return only the vector, no explanation is needed.
 You MUST return the vector at the end in this format [company1,company2,...companyn]: [Value1,Value2,Valuen]\n\n"""
        for ticker in self.tickers:
            latest = self.indicators_processed[ticker].iloc[-1]
            # Each block ends with real newlines (the original emitted a
            # literal "/n/n" into the prompt).
            partial_stats = f"""Technical Indicators for {ticker}:
            SMA 5: {latest['SMA_5']:.2f}
            SMA 15: {latest['SMA_15']:.2f}
            SMA 50: {latest['SMA_50']:.2f}

            EMA 5: {latest['EMA_5']:.2f}
            EMA 10: {latest['EMA_10']:.2f}
            EMA 50: {latest['EMA_50']:.2f}

            RSI: {latest['RSI']:.2f}
            OBV: {latest['OBV']:,.0f}\n\n"""
            prompt += partial_stats
        self.prompt = prompt
        return self.prompt

    def generate_response(self):
        """Send ``self.prompt`` to the global chat model; store and return the decoded reply."""
        prompt = self.prompt
        messages = [
            {"role": "system", "content": "You are an expert technical analyst. Analyze stocks to the best of your ability."},
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        # Drop the echoed prompt tokens so only the newly generated part is decoded.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        self.response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return self.response


In [10]:
class Fundamental_Analyst:
    """Collect yearly fundamentals per ticker from Finnhub and build an LLM prompt.

    Construction fetches the data for every ticker, extracts the figures for
    each calendar year between ``start_date`` and ``end_date`` (inclusive) and
    builds ``self.prompt``.  ``generate_response`` sends that prompt to the
    module-level ``tokenizer`` / ``model`` pair.
    """

    # Output key -> reported-financials concept (or concepts, tried in order).
    # NOTE(review): the extra revenue concepts are fallbacks for filers that do
    # not report the first one (Revenue came back as None for NVDA and LMT);
    # confirm the exact concept names against the API payload.
    _GAAP_CONCEPTS = {
        "EPS": "us-gaap_EarningsPerShareDiluted",
        "Net_Income": "us-gaap_NetIncomeLoss",
        "Gross_Profit": "us-gaap_GrossProfit",
        "Revenue": (
            "us-gaap_RevenueFromContractWithCustomerExcludingAssessedTax",
            "us-gaap_RevenueFromContractWithCustomerIncludingAssessedTax",
            "us-gaap_Revenues",
            "us-gaap_SalesRevenueNet",
        ),
        "Total_Assets": "us-gaap_Assets",
        "Total_Liabilities": "us-gaap_Liabilities",
        "Shareholders_Equity": "us-gaap_StockholdersEquity",
        "Operating_Cash_Flow": "us-gaap_NetCashProvidedByUsedInOperatingActivities",
        "Investing_Cash_Flow": "us-gaap_NetCashProvidedByUsedInInvestingActivities",
        "Financing_Cash_Flow": "us-gaap_NetCashProvidedByUsedInFinancingActivities",
    }
    # Output key -> name of the annual series in the metric payload.
    _METRIC_SERIES = {"P_E": "pe", "ROA": "roa", "ROE": "roe"}

    def __init__(self, tickers, start_date, end_date):
        """Fetch and process fundamentals for ``tickers`` and build the prompt.

        Args:
            tickers: a single ticker string or a list of tickers.
            start_date, end_date: ``YYYY-MM-DD`` strings; only their years are used.
        """
        # Wrap a lone ticker in a list; ``list("AAPL")`` would split it into characters.
        self.tickers = [tickers] if isinstance(tickers, str) else tickers
        self.start_date = start_date
        self.end_date = end_date

        start_year = datetime.strptime(start_date, "%Y-%m-%d").year
        end_year = datetime.strptime(end_date, "%Y-%m-%d").year
        self.years = list(range(start_year, end_year + 1))

        data_processed = {}
        for ticker in self.tickers:
            metric_data, gaap_data = self.get_data(ticker)
            data_processed[ticker] = self.get_financial_info(metric_data, gaap_data)
        self.data_processed = data_processed
        self.generate_prompt()

    def get_data(self, ticker):
        """Return ``(metric_data, gaap_data)`` JSON payloads for ``ticker`` from Finnhub."""
        # SECURITY: prefer the FINNHUB_API_KEY environment variable.  The literal
        # fallback keeps existing runs working, but that key is committed in the
        # notebook and should be rotated and then removed from the source.
        api_key = os.environ.get("FINNHUB_API_KEY", "d1l719pr01qt4thec1pgd1l719pr01qt4thec1q0")
        query = {"symbol": ticker, "token": api_key}
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get("https://finnhub.io/api/v1/stock/metric", params=query, timeout=10)
        metric_data = response.json()
        response = requests.get("https://finnhub.io/api/v1/stock/financials-reported", params=query, timeout=10)
        gaap_data = response.json()
        return metric_data, gaap_data

    def find_us_gaap_entry(self, gaap_data, parameter, year):
        """Return the reported value for ``parameter`` in ``year``, or ``None``.

        Args:
            gaap_data: payload whose ``'data'`` list holds one dict per filing,
                each with ``'year'`` and a ``'report'`` dict of sections.
            parameter: a concept name, or a list/tuple of names tried in order
                within each matching filing.
            year: the filing year to match.
        """
        candidates = [parameter] if isinstance(parameter, str) else list(parameter)
        # ``.get`` tolerates payloads without a 'data' key (e.g. an error body).
        for filing in (gaap_data.get('data') or []):
            if filing.get('year') != year:
                continue
            report = filing.get('report') or {}
            for concept in candidates:
                for section in ('ic', 'bs', 'cf'):  # income statement, balance sheet, cash flow
                    for entry in (report.get(section) or []):
                        if concept in entry.values():
                            return entry.get('value')
        return None

    def find_metric_by_year(self, metric_data, parameter, year):
        """Return the annual-series value of ``parameter`` whose period starts with ``year``."""
        series = metric_data.get('series', {}).get('annual', {}).get(parameter, [])
        for item in series:
            if item.get('period', '').startswith(str(year)):
                return item.get('v')  # assuming value is under 'v'
        return None

    def get_financial_info(self, metric_data, gaap_data):
        """Return ``{metric_name: [value per year in self.years]}``; missing values are ``None``."""
        financials = {name: [] for name in (*self._GAAP_CONCEPTS, *self._METRIC_SERIES)}
        for year in self.years:
            for name, concept in self._GAAP_CONCEPTS.items():
                financials[name].append(self.find_us_gaap_entry(gaap_data, concept, year))
            for name, series_name in self._METRIC_SERIES.items():
                financials[name].append(self.find_metric_by_year(metric_data, series_name, year))
        return financials

    def fmt(self, value, fmt_str):
        """Format ``value`` with ``fmt_str``; ``None`` becomes ``"N/A"``.

        Values that cannot take a numeric format spec (e.g. a string in the
        payload) are returned as ``str(value)`` instead of raising.
        """
        if value is None:
            return "N/A"
        try:
            return format(value, fmt_str)
        except (TypeError, ValueError):
            return str(value)

    def generate_prompt(self):
        """Build, store and return the user prompt from ``self.data_processed``."""
        prompt = f""" Choose a recommendation for each stock. Respond in a vector of floats between [-1,1], -1 being Short and 1 being Strong Buy. 
Make these decisions based solely on the fundamental indicators given below for each stock. Return only the vector, no explanation is needed.
 You MUST return the vector at the end in this format [company1,company2,...companyn]: [Value1,Value2,Valuen]\n\n"""
        for ticker in self.tickers:
            financials = self.data_processed[ticker]
            partial_stats = f"""Financials for {ticker}:\n"""
            for i, year in enumerate(self.years):
                partial_stats += f"""
                Year: {year}
                Income Statement:
                Revenue: ${self.fmt(financials["Revenue"][i], ",.0f")}
                Gross Profit: ${self.fmt(financials["Gross_Profit"][i], ",.0f")}
                Net Income: ${self.fmt(financials["Net_Income"][i], ",.0f")}
                EPS (Diluted): {self.fmt(financials["EPS"][i], ".2f")}

                Balance Sheet:
                Total Assets: ${self.fmt(financials["Total_Assets"][i], ",.0f")}
                Total Liabilities: ${self.fmt(financials["Total_Liabilities"][i], ",.0f")}
                Shareholders' Equity: ${self.fmt(financials["Shareholders_Equity"][i], ",.0f")}

                Cash Flow:
                Operating Cash Flow: ${self.fmt(financials["Operating_Cash_Flow"][i], ",.0f")}
                Investing Cash Flow: ${self.fmt(financials["Investing_Cash_Flow"][i], ",.0f")}
                Financing Cash Flow: ${self.fmt(financials["Financing_Cash_Flow"][i], ",.0f")}

                Valuation and Ratios:
                P/E Ratio: {self.fmt(financials["P_E"][i], ".2f")}
                ROA: {self.fmt(financials["ROA"][i], ".2%")}
                ROE: {self.fmt(financials["ROE"][i], ".2%")}
                """
            prompt += partial_stats + "\n\n"
        prompt += "\nBased on this, what is your investment recommendation? Pick one action candidate."
        self.prompt = prompt
        return prompt

    def generate_response(self):
        """Send ``self.prompt`` to the global chat model; store and return the decoded reply."""
        prompt = self.prompt
        messages = [
            {"role": "system", "content": "You are an expert fundamental analyst. Analyze stocks to the best of your ability."},
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        # Drop the echoed prompt tokens so only the newly generated part is decoded.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        self.response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return self.response

In [69]:
class News_Analyst:
    """Collect recent headlines per ticker and turn them into an LLM prompt.

    Headlines come from a local ``<NEWS_DIR>/<ticker>.jsonl`` file and are
    topped up from Finnhub company news when fewer than ten match.
    ``generate_response`` sends the prompt to the module-level
    ``tokenizer`` / ``model`` pair.
    """

    # Directory holding one ``<ticker>.jsonl`` file per ticker.  Kept as a class
    # attribute so it can be overridden without editing the method body.
    NEWS_DIR = "/home/f20222001/test-venv/Portfolio/sp500_news/sp500_news"

    def __init__ (self,tickers, companies, start_date, end_date):
        """Gather headlines for every ticker and build the prompt.

        Args:
            tickers: a single ticker string or a list of tickers.
            companies: company name(s) aligned with ``tickers``; a headline is
                kept only if it contains the company name (case-insensitive).
            start_date, end_date: inclusive window as ``YYYY-MM-DD`` strings.
        """
        self.tickers = [tickers] if isinstance(tickers, str) else tickers
        self.companies = [companies] if isinstance(companies, str) else companies
        self.start_date = start_date
        self.end_date = end_date
        news_collection = {}
        for i, ticker in enumerate(self.tickers):
            # Index the normalised list: indexing the raw argument would yield
            # a single character when a plain string was passed.
            news_collection[ticker] = self.get_news_articles(ticker, self.companies[i])
        self.news_collection = news_collection
        self.generate_news_prompt()

    def get_news_articles(self,ticker,company):
        """Return up to ten unique headlines mentioning ``company`` inside the window.

        Reads the local JSONL file first (it is expected to exist), then asks
        Finnhub for more if fewer than ten were found.  If the local file alone
        yields more than ten, a random sample of ten is returned.
        """
        file_path = os.path.join(self.NEWS_DIR, f"{ticker}.jsonl")
        start_dt = datetime.strptime(self.start_date, "%Y-%m-%d")
        end_dt = datetime.strptime(self.end_date, "%Y-%m-%d")
        needle = company.lower()

        want = 10
        titles, seen = [], set()

        # 1) Local JSONL (assumed to exist)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines instead of failing in json.loads
                obj = json.loads(line)
                title = (obj.get("Article_title") or "").strip()
                if not title:
                    continue
                article_dt = datetime.strptime(obj["Date"], "%Y-%m-%d")
                if start_dt <= article_dt <= end_dt and needle in title.lower():
                    if title not in seen:
                        seen.add(title)
                        titles.append(title)

        # 2) If fewer than 10, top up with Finnhub for the same window
        if len(titles) < want:
            # SECURITY: prefer the FINNHUB_API_KEY environment variable.  The
            # literal fallback keeps existing runs working, but that key is
            # committed in the notebook and should be rotated and removed.
            token = os.environ.get("FINNHUB_API_KEY", "d1l719pr01qt4thec1pgd1l719pr01qt4thec1q0")
            r = requests.get(
                "https://finnhub.io/api/v1/company-news",
                params={"symbol": ticker, "from": self.start_date, "to": self.end_date, "token": token},
                timeout=10,
            )
            if r.ok:
                for item in (r.json() or []):
                    title = (item.get("headline") or "").strip()
                    if not title or title in seen:
                        continue
                    ts = item.get("datetime")
                    if isinstance(ts, (int, float)):
                        # The timestamp is interpreted as epoch seconds in UTC.
                        art_date = datetime.fromtimestamp(ts, tz=timezone.utc).date()
                        if start_dt.date() <= art_date <= end_dt.date() and needle in title.lower():
                            seen.add(title)
                            titles.append(title)
                            if len(titles) >= want:
                                break

        # 3) Cap at 10 (randomize if overshoot)
        if len(titles) > want:
            titles = random.sample(titles, want)

        return titles

    def generate_news_prompt(self):
        """Build, store and return the user prompt listing each ticker's headlines."""
        prompt = f""" Choose a recommendation for each stock. Respond in a vector of floats between [-1,1], -1 being Short and 1 being Strong Buy. 
Make these decisions based solely on the news headlines and sentiment given below for each stock. Return only the vector, no explanation is needed.
You MUST return the vector at the end in this format [company1,company2,...companyn].

Headlines:
"""
        for ticker in self.tickers:
            prompt += f""" News for {ticker}\n"""
            for headline in self.news_collection[ticker]:
                prompt += headline
                prompt += '\n'
        prompt += "Based on this, what is your investment recommendation? Pick one action candidate."
        self.prompt = prompt
        return self.prompt

    def generate_response(self):
        """Send ``self.prompt`` to the global chat model; store and return the decoded reply."""
        prompt = self.prompt
        messages = [
            {"role": "system", "content": "You are an expert news and sentiment analyst. Analyze stocks to the best of your ability."},
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        # Drop the echoed prompt tokens so only the newly generated part is decoded.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        self.response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return self.response

In [70]:
def generate_row(ticker_list, company_list, start_date, end_date):
    """Run the technical, fundamental and news analysts for one date window.

    Args:
        ticker_list: ticker symbols to analyse.
        company_list: company names aligned with ``ticker_list`` (used to
            filter headlines).
        start_date, end_date: analysis window as ``YYYY-MM-DD`` strings.

    Returns:
        A dict with each analyst's processed data and model response, keyed
        ``technical_data``, ``technical_response``, ``fundamental_data``,
        ``fundamental_response``, ``news_data`` and ``news_response``.
    """
    technical_instance = Technical_Analyst(tickers=ticker_list, start_date=start_date, end_date=end_date)
    fundamental_instance = Fundamental_Analyst(tickers=ticker_list, start_date=start_date, end_date=end_date)
    news_instance = News_Analyst(tickers=ticker_list, companies=company_list, start_date=start_date, end_date=end_date)

    technical_data = technical_instance.indicators_processed
    technical_response = technical_instance.generate_response()
    fundamental_data = fundamental_instance.data_processed
    fundamental_response = fundamental_instance.generate_response()
    # News_Analyst has no ``generate_prompt`` method; the model reply comes
    # from ``generate_response`` like the other two analysts.
    news_response = news_instance.generate_response()

    return {
        "technical_data": technical_data,
        "technical_response": technical_response,
        "fundamental_data": fundamental_data,
        "fundamental_response": fundamental_response,
        "news_data": news_instance.news_collection,
        "news_response": news_response,
    }

In [81]:
# Instruments under analysis and the name used to match each one in headlines.
ticker_list = ["AAPL", "NVDA", "LMT", "LLY", "GLD", "USO", "TLT"]
start_date = "2024-08-01"
end_date = "2024-09-01"
company_list = ["Apple", "Nvidia", "Lockheed", "Eli Lilly", "Gold", "Oil", "Bonds"]

# Build the three analysts; each constructor gathers its data and its prompt.
technical_instance = Technical_Analyst(
    tickers=ticker_list,
    start_date=start_date,
    end_date=end_date,
)
fundamental_instance = Fundamental_Analyst(
    tickers=ticker_list,
    start_date=start_date,
    end_date=end_date,
)
news_instance = News_Analyst(
    tickers=ticker_list,
    companies=company_list,
    start_date=start_date,
    end_date=end_date,
)

# Keep the processed inputs next to the model's reply for later inspection.
technical_data = technical_instance.indicators_processed
technical_response = technical_instance.generate_response()
fundamental_data = fundamental_instance.data_processed
fundamental_response = fundamental_instance.generate_response()
news_response = news_instance.generate_response()

  self.data = yf.download(self.tickers, start=self.start_date, end=self.end_date)
[*********************100%***********************]  7 of 7 completed


In [82]:
news_instance.news_collection

{'AAPL': [], 'NVDA': [], 'LMT': [], 'LLY': [], 'GLD': [], 'USO': [], 'TLT': []}

In [73]:
technical_data['USO']

Unnamed: 0_level_0,SMA_5,SMA_15,SMA_50,EMA_5,EMA_10,EMA_50,Date,RSI,OBV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-08-01,,,,75.129997,75.129997,75.129997,2022-08-01,,0
2022-08-02,,,,75.609999,75.569999,75.537999,2022-08-02,,2737300
2022-08-03,,,,74.624736,74.749932,74.841717,2022-08-03,,-1296400
2022-08-04,,,,73.33923,73.689108,73.963463,2022-08-04,,-6511900
2022-08-05,73.531999,,,72.64853,73.072158,73.439236,2022-08-05,,-3372700
2022-08-08,73.214,,,72.974286,73.193673,73.457753,2022-08-08,,1056800
2022-08-09,72.717999,,,73.142714,73.255437,73.456508,2022-08-09,,-1435400
2022-08-10,72.875999,,,73.551076,73.49763,73.580147,2022-08-10,,3186900
2022-08-11,73.825999,,,74.485011,74.102976,73.930317,2022-08-11,,7874200
2022-08-12,74.514,,,74.65292,74.287201,74.055165,2022-08-12,,4924600


In [74]:
technical_response

'[aapl,nvda,lmt,lly,gld,uso,tlt]: [0.2,-0.1,0.7,0.4,-0.3,0.5,-0.2]'

In [75]:
fundamental_data

{'AAPL': {'EPS': [6.11],
  'Net_Income': [99803000000.0],
  'Gross_Profit': [170782000000],
  'Revenue': [394328000000.0],
  'Total_Assets': [352755000000],
  'Total_Liabilities': [302083000000],
  'Shareholders_Equity': [50672000000.0],
  'Operating_Cash_Flow': [122151000000],
  'Investing_Cash_Flow': [-22354000000],
  'Financing_Cash_Flow': [-110749000000],
  'P_E': [24.0854],
  'ROA': [0.2829],
  'ROE': [1.9696]},
 'NVDA': {'EPS': [3.85],
  'Net_Income': [9752000000.0],
  'Gross_Profit': [17475000000.0],
  'Revenue': [None],
  'Total_Assets': [44187000000.0],
  'Total_Liabilities': [17575000000.0],
  'Shareholders_Equity': [26612000000.0],
  'Operating_Cash_Flow': [9108000000.0],
  'Investing_Cash_Flow': [-9830000000.0],
  'Financing_Cash_Flow': [1865000000.0],
  'P_E': [62.7717],
  'ROA': [0.2207],
  'ROE': [0.3665]},
 'LMT': {'EPS': [21],
  'Net_Income': [5732000000.0],
  'Gross_Profit': [8287000000],
  'Revenue': [None],
  'Total_Assets': [52880000000.0],
  'Total_Liabilities': [

In [76]:
fundamental_response

'[aapl,nvda,lmt,lly,gld,uso,tlt]: [0.8,0.5,0.7,0.6,-1,-1,-1]'

In [77]:
news_response

'[0.6,-0.3,0,0,-0.1,0,0]'

In [78]:
news_instance.news_collection

{'AAPL': ['Why Apple Stock Jumped 18.9% in July',
  'One Put, One Call Option To Know About for Apple',
  "Apple supplier Foxconn's Q2 profit up nearly 12%",
  'EXCLUSIVE-Tinder-owner Match ups antitrust pressure on Apple in India with new case',
  'U.S. Justice Department in early stages of drafting possible antitrust suit against Apple -Politico',
  '1 Metric That Apple Investors Should Stop Worrying About',
  'Q2 Earnings Scorecard and Research Reports for Apple, Chevron & Toyota',
  'Apple suppliers to make Apple Watch and MacBook in Vietnam - Nikkei',
  'Is Apple Stock a Buy After Its Latest Earnings?',
  'Is Most-Watched Stock Apple Inc. (AAPL) Worth Betting on Now?'],
 'NVDA': ['Wall St trades mixed on Fed tightening fears, Nvidia weighs',
  'Intel and AMD Earnings Share Great Insight for Nvidia Investors',
  'Nvidia expects second-quarter revenue to drop on gaming weakness',
  'This Is Why AMD and Nvidia Are Down Today',
  'US STOCKS-Wall Street ends sharply up, fueled by Nvidi