In [4]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/Asset Allocation Algorithm.csv', header=[0, 1])

# Simplify multi-level headers by combining them
data.columns = ['_'.join(col).strip() if 'Unnamed' not in col[1] else col[0] for col in data.columns.values]

# Clean column names by removing unnecessary text and trimming spaces
data.columns = [col.replace('Unnamed: ', '').replace('level_0_', '').replace(' ', '_') 
                if 'Unnamed' in col else col for col in data.columns]

# Define age groups and their corresponding column indices based on the observed structure
age_group_info = {
    '18-25': [1, 2, 3, 4, 5, 6],
    '25-35': [7, 8, 9, 10, 11, 12],
    '35-45': [13, 14, 15, 16, 17, 18],
    '45-55': [19, 20, 21, 22, 23, 24],
    '55-65': [25, 26, 27, 28, 29, 30],
    '>65': [31, 32, 33, 34, 35, 36]
}

# Risk types corresponding to indices offsets from the age group's first column
risk_types = ['Liquid', 'Short Term', 'Income', 'Balanced', 'Equity – Mod', 'Equity – Agg']

# Initialize an empty DataFrame to store processed data
processed_data = pd.DataFrame(columns=['Age Group', 'Investment Time Horizon', 'Risk Type', 'Allocation'])

# Iterate over each age group and risk type to populate the DataFrame
for age_group, indices in age_group_info.items():
    for i, risk_type in enumerate(risk_types):
        col_name = data.columns[indices[i]]  # Get the column name for the risk type
        temp_df = pd.DataFrame({
            'Age Group': age_group,
            'Investment Time Horizon': data['Aggressive_Investment Time Horizon'],
            'Risk Type': risk_type,
            'Allocation': data[col_name]
        })
        processed_data = pd.concat([processed_data, temp_df], ignore_index=True)

# Save the processed data to a CSV file
processed_data.to_csv('processed_data.csv', index=False)

print("Processed data saved successfully.")


Processed data saved successfully.


  processed_data = pd.concat([processed_data, temp_df], ignore_index=True)


In [30]:
import requests

def fetch_financial_news(api_key):
    url = 'https://newsapi.org/v2/everything'
    parameters = {
        'q': 'NASDAQ OR "Dow Jones" OR "S&P 500"',  # Using OR to combine queries
        'domains': 'bloomberg.com,ft.com,cnbc.com',  # Focusing on specific financial news domains
        'language': 'en',
        'sortBy': 'publishedAt',
        'apiKey': api_key
    }

    response = requests.get(url, params=parameters)
    data = response.json()

    if response.status_code == 200:
        return data['articles']
    else:
        return "Failed to fetch news: " + data.get('message', 'No error message provided')

# Usage
api_key = 'c27c7825d1f34093804ceab847772901'
news_articles = fetch_financial_news(api_key)
for article in news_articles:
    print(article['title'], article['url'])


CNBC Daily Open: Muted CPI gives markets best day since November https://www.cnbc.com/2025/01/16/cnbc-daily-open-muted-cpi-gives-markets-best-day-since-november.html
Goldman Sachs CEO Solomon says IPO market is 'going to pick up' along with dealmaking https://www.cnbc.com/2025/01/15/goldman-sachs-ceo-solomon-says-ipo-market-is-going-to-pick-up.html
CNBC Daily Open: Markets celebrate cool CPI, and bank earnings https://www.cnbc.com/2025/01/16/cnbc-daily-open-markets-celebrate-cool-cpi-and-bank-earnings.html
Jim Cramer says to stay focused on earnings reports after Wednesday's stock rally https://www.cnbc.com/2025/01/15/jim-cramer-earnings-reports-fed-rate.html
10-year Treasury yield pulls back aggressively after core inflation is light in December https://www.cnbc.com/2025/01/15/treasury-yields-consumer-price-index.html
Stocks surge after encouraging inflation report... https://www.cnbc.com/2025/01/14/stock-market-today-live-updates.html
Core inflation rate slows to 3.2% in December, le

In [27]:
import pandas as pd

# Load the company data from CSV
df_companies = pd.read_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/nasdaq_stocks.csv')

# Ensure the DataFrame contains the expected 'Name' and 'Symbol' columns.
# Make sure to replace 'Name' and 'Symbol' with the actual column names in your CSV if they differ.

# Convert all entries to strings and remove any NaN values. 
# This step is critical because NaN values cannot be converted to strings and will cause errors during operations like API calls.
companies = df_companies['Name'].dropna().unique().tolist()
symbols = df_companies['Symbol'].dropna().unique().tolist()

# Optionally, convert to string explicitly if there's any concern about non-string data.
companies = [str(company) for company in companies]
symbols = [str(symbol) for symbol in symbols]

# Create a dictionary mapping company names to symbols if needed for reference.
company_dict = dict(zip(companies, symbols))

print(company_dict.keys())


dict_keys(['Agilent Technologies Inc.', 'Alcoa Corporation Common', 'ATA Creativity Global', 'Ares Acquisition Corporation', 'Aadi Bioscience Inc.', 'American Airlines Group', 'AA Mission Acquisition', 'Atlantic American Corporation', 'Acadian Asset Management', 'Applied Optoelectronics Inc.', 'AAON Inc. Common', 'Advance Auto Parts', 'Apple Inc. Common', 'American Assets Trust', 'AllianceBernstein Holding L.P.', 'American Battery Technology', 'AbbVie Inc. Common', 'Ameris Bancorp Common', 'AbCellera Biologics Inc.', 'Abeona Therapeutics Inc.', 'Ambev S.A. American', 'Asbury Automotive Group', 'Abacus Life Inc.', 'Able View Global', 'ABM Industries Incorporated', 'Airbnb Inc. Class', 'Acumen Pharmaceuticals Inc.', 'Abpro Holdings Inc', 'Abpro Holdings Inc.', 'Arbor Realty Trust', 'Absci Corporation Common', 'Abbott Laboratories Common', 'Abits Group Inc', 'Arbutus Biopharma Corporation', 'ABVC BioPharma Inc.', 'Above Food Ingredients', 'Abivax SA American', 'Associated Capital Group', 

## Sentiment Logic

## Using logic of small mid and large cap stocks

In [102]:
import pandas as pd

# Load data
df = pd.read_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/nasdaq_stocks_2.csv')

# Function to convert market cap from string to float


# Apply conversion to the MarketCap column
df['Market Cap'] = df['Market Cap']

# Define cap categories based on market cap
def categorize_cap(market_cap):
    if market_cap < 50000000:
        return 'Nano-cap'
    elif market_cap < 300000000:
        return 'Micro-cap'
    elif market_cap < 2000000000:
        return 'Small-cap'
    elif market_cap < 10000000000:
        return 'Mid-cap'
    elif market_cap < 200000000000:
        return 'Large-cap'
    else:
        return 'Mega-cap'

# Apply categorization
df['CapCategory'] = df['Market Cap'].apply(categorize_cap)

# Define investment recommendations
def recommend(cap_category):
    if cap_category == 'Nano-cap' or cap_category == 'Micro-cap':
        return 'Very High Risk, Potential for High Returns'
    elif cap_category == 'Small-cap':
        return 'High Risk, High Potential Return'
    elif cap_category == 'Mid-cap':
        return 'Moderate Risk, Good Growth Potential'
    elif cap_category == 'Large-cap':
        return 'Lower Risk, Stable Returns'
    else:  # Mega-cap
        return 'Lowest Risk, Strong Stable Returns'

# Apply recommendation logic
df['InvestmentRecommendation'] = df['CapCategory'].apply(recommend)
df.to_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/nasdaq_stocks_2.csv', index=False)

# Display the modified DataFrame
print(df[['Symbol', 'Name', 'Market Cap', 'CapCategory', 'InvestmentRecommendation']].head())


  Symbol                  Name    Market Cap CapCategory  \
0      A  Agilent Technologies  4.133135e+10   Large-cap   
1     AA     Alcoa Corporation  1.005259e+10   Large-cap   
2   AACG        ATA Creativity  3.168013e+07    Nano-cap   
3   AACT      Ares Acquisition  0.000000e+00    Nano-cap   
4   AADI       Aadi Bioscience  7.714634e+07   Micro-cap   

                     InvestmentRecommendation  
0                  Lower Risk, Stable Returns  
1                  Lower Risk, Stable Returns  
2  Very High Risk, Potential for High Returns  
3  Very High Risk, Potential for High Returns  
4  Very High Risk, Potential for High Returns  


In [104]:
def assign_allocation(cap_category):
    if cap_category in ['Nano-cap', 'Micro-cap']:
        return 'Equity – Agg'  # Aggressive
    elif cap_category == 'Small-cap':
        return 'Equity – Agg'  # Moderate
    elif cap_category == 'Mid-cap':
        return 'Equity – Mod'  # Balanced
    elif cap_category in ['Large-cap', 'Mega-cap']:
        return 'Equity – Mod'  # Could also be conservative if that's an option in your strategies

# Applying the function to create a new column
df['AssetAllocation'] = df['CapCategory'].apply(assign_allocation)

# Save the DataFrame back to the CSV if needed
df.to_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/nasdaq_stocks_2.csv', index=False)

# Show the updated DataFrame
print(df[['Symbol', 'Name', 'Market Cap', 'CapCategory', 'AssetAllocation']].head())


  Symbol                  Name    Market Cap CapCategory AssetAllocation
0      A  Agilent Technologies  4.133135e+10   Large-cap    Equity – Mod
1     AA     Alcoa Corporation  1.005259e+10   Large-cap    Equity – Mod
2   AACG        ATA Creativity  3.168013e+07    Nano-cap    Equity – Agg
3   AACT      Ares Acquisition  0.000000e+00    Nano-cap    Equity – Agg
4   AADI       Aadi Bioscience  7.714634e+07   Micro-cap    Equity – Agg


In [98]:
import pandas as pd

# Load the CSV file
file_path = '/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/nasdaq_stocks_2.csv'  # Update this path to your file location
df = pd.read_csv(file_path)

# Check the top rows to understand the structure, especially the company name column
print(df.head())

# Assuming the column with company names is called 'CompanyName'
def truncate_name(name):
    # Split the name by spaces and join the first three words
    words = name.split()
    return ' '.join(words[:2])  # Take only the first three words

def remove_common(name):
    words = name.replace("Common","")
    words = words.replace("Inc.","")
    return words
# Apply the function to the 'CompanyName' column
df['Name'] = df['Name'].apply(truncate_name)
df['Name']  =df ['Name'].apply(remove_common)

# Save the modified DataFrame back to CSV
df.to_csv(file_path, index=False)  # Set index=False to avoid adding an unwanted index column in your CSV

# Print the updated DataFrame to confirm changes
print(df.head())


  Symbol                                               Name Last Sale  \
0      A             Agilent Technologies Inc. Common Stock   $144.72   
1     AA                    Alcoa Corporation Common Stock     $38.91   
2   AACG   ATA Creativity Global American Depositary Shares     $0.99   
3   AACT  Ares Acquisition Corporation II Class A Ordina...    $10.98   
4   AADI                  Aadi Bioscience Inc. Common Stock     $3.13   

   Net Change % Change    Market Cap        Country  IPO Year   Volume  \
0        2.49   1.751%  4.133135e+10  United States    1999.0  1661424   
1        0.72   1.885%  1.005259e+10  United States    2016.0  4881004   
2       -0.04  -3.883%  3.168013e+07          China    2008.0    11111   
3       -0.02  -0.182%  0.000000e+00            NaN    2023.0   295800   
4       -0.11  -3.395%  7.714634e+07  United States       NaN   182073   

        Sector                                          Industry  
0  Industrials  Biotechnology: Laboratory Analyti

In [108]:
import pandas as pd

# Load the company data from CSV
df_companies = pd.read_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/nasdaq_stocks_2.csv')

# Ensure the DataFrame contains the expected 'Name' and 'Symbol' columns.
# Make sure to replace 'Name' and 'Symbol' with the actual column names in your CSV if they differ.

# Convert all entries to strings and remove any NaN values. 
# This step is critical because NaN values cannot be converted to strings and will cause errors during operations like API calls.
companies = df_companies['Name'].dropna().unique().tolist()
symbols = df_companies['Symbol'].dropna().unique().tolist()

# Optionally, convert to string explicitly if there's any concern about non-string data.
companies = [str(company) for company in companies]
symbols = [str(symbol) for symbol in symbols]

# Create a dictionary mapping company names to symbols if needed for reference.
company_dict = dict(zip(companies, symbols))


data_to_check = pd.read_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/yahoo-stock-news (2).csv')

def find_company_in_description(description):
    # Normalize the description to lowercase and tokenize by splitting
    description_tokens = set(description.lower().split())
    matched_companies = []
    
    # Iterate over each company name in the dictionary
    for company in company_dict.keys():
        # Split the company name into parts and normalize to lowercase
        parts = company.lower().split()
        # Check if all parts of the company name are present in the description
        if all(part in description_tokens for part in parts):
            matched_companies.append(company)
    
    # Return a comma-separated string of matched company names, or NaN if none found
    return ', '.join(matched_companies) if matched_companies else pd.NA

# Apply the function to the Description column
data_to_check['Matched Companies'] = data_to_check['Description'].apply(find_company_in_description)

# Print some results to verify
print(data_to_check[['Description', 'Matched Companies']])

data_to_check.to_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/updated_yahoo_stock_news.csv', index=False)



                                           Description    Matched Companies
0    Find insight on Ford, Maersk, EasyJet, Schaeff...                 <NA>
1    Find insight on Capital One, Travelers, Binanc...                 <NA>
2    Whatever doubts investors may have surrounding...  Fuel Tech, Netflix 
3    (Bloomberg) -- Saudi Arabia’s bond spree conti...                 <NA>
4    (Bloomberg) -- A rally in big tech fueled by o...                 <NA>
..                                                 ...                  ...
134  US stocks closed strong after Trump took offic...                 Dow 
135  Moderna received a $590 million award from the...             Moderna 
136  The stablecoin giant is facing off against Bla...           BlackRock 
137  Tesla stock was in the red on the first tradin...     First of, Tesla 
138  Apple shares fell Tuesday after Jefferies down...               Apple 

[139 rows x 2 columns]


In [None]:
Alpha vantage api key: 6TUBVQA7MI2Y0C8K

## ETF preprocessor

In [1]:
import pandas as pd

# Assuming the data is stored in a CSV file named 'etfs.csv'
df = pd.read_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/etf-scraper.csv')


In [25]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('/Users/ronitgandhi/Desktop_windows/Full Stack ML Project/etf-scraper.csv')

# Inspect the data for empty strings or spaces in the 'Assets' column
print(df['Assets'].unique())

# Replace empty strings or spaces with NaN
df['Assets'] = df['Assets'].replace(r'^\s*$', np.nan, regex=True)

# Option 1: Drop rows where 'Assets' is NaN
df.dropna(subset=['Assets'], inplace=True)

['32.59M' '939.85M' '25.96M' ... '148.79M' '148.99M' '22.68M']


In [31]:
def convert_assets(asset_str):
    try:
        if asset_str.endswith('M'):
            return float(asset_str.rstrip('M')) * 1e6
        elif asset_str.endswith('B'):
            return float(asset_str.rstrip('B')) * 1e9
        elif asset_str.endswith('K'):
            return float(asset_str.rstrip('K')) * 1e3
        return float(asset_str)  # Assuming direct float conversion possible otherwise
    except ValueError:
        return np.nan  # Return NaN for any value that can't be converted to float

# Apply the conversion function safely
df['Assets'] = df['Assets'].apply(convert_assets)

# Check if there are any NaN values after conversion
print("NaNs after conversion:", df['Assets'].isna().sum())



NaNs after conversion: 22


In [33]:
print(df[['Symbol', 'Name', 'Type', 'Assets']].head())


  Symbol                                            Name          Type  \
0    AAA  Alternative Access First Priority CLO Bond ETF  Fixed Income   
1   AAAU                 Goldman Sachs Physical Gold ETF     Commodity   
2   AADR             AdvisorShares Dorsey Wright ADR ETF        Equity   
3   AAPB            GraniteShares 2x Long AAPL Daily ETF        Equity   
4   AAPD          Direxion Daily AAPL Bear 1X Shares ETF        Equity   

        Assets  
0   32590000.0  
1  939850000.0  
2   25960000.0  
3   22050000.0  
4   30890000.0  


In [39]:
def categorize_by_asset_size(assets):
    if assets < 50000000:
        return 'Nano-cap'
    elif assets < 300000000:
        return 'Micro-cap'
    elif assets < 2000000000:
        return 'Small-cap'
    elif assets < 10000000000:
        return 'Mid-cap'
    elif assets < 200000000000:
        return 'Large-cap'
    else:
        return 'Mega-cap'

df['Cap Size'] = df['Assets'].apply(categorize_by_asset_size)


In [41]:
# Verify the final output
print(df[['Symbol', 'Name', 'Type', 'Assets', 'Cap Size']].head())

# Optionally, export to a new CSV
df.to_csv('final_categorized_etfs.csv', index=False)


  Symbol                                            Name          Type  \
0    AAA  Alternative Access First Priority CLO Bond ETF  Fixed Income   
1   AAAU                 Goldman Sachs Physical Gold ETF     Commodity   
2   AADR             AdvisorShares Dorsey Wright ADR ETF        Equity   
3   AAPB            GraniteShares 2x Long AAPL Daily ETF        Equity   
4   AAPD          Direxion Daily AAPL Bear 1X Shares ETF        Equity   

        Assets   Cap Size  
0   32590000.0   Nano-cap  
1  939850000.0  Small-cap  
2   25960000.0   Nano-cap  
3   22050000.0   Nano-cap  
4   30890000.0   Nano-cap  


In [45]:
print(df['Type'].unique())
print(df['Cap Size'].unique())


['Fixed Income' 'Commodity' 'Equity' 'Asset Allocation' 'Alternatives'
 'Currency' '-']
['Nano-cap' 'Small-cap' 'Micro-cap' 'Mid-cap' 'Large-cap' 'Mega-cap']


In [47]:
risk_mapping = {
    ('Fixed Income', 'Nano-cap'): 'Income',
    ('Fixed Income', 'Small-cap'): 'Income',
    ('Fixed Income', 'Micro-cap'): 'Income',
    ('Fixed Income', 'Mid-cap'): 'Income',
    ('Fixed Income', 'Large-cap'): 'Income',
    ('Fixed Income', 'Mega-cap'): 'Income',
    ('Commodity', 'Nano-cap'): 'Short Term',
    ('Commodity', 'Small-cap'): 'Short Term',
    ('Commodity', 'Micro-cap'): 'Short Term',
    ('Commodity', 'Mid-cap'): 'Short Term',
    ('Commodity', 'Large-cap'): 'Short Term',
    ('Commodity', 'Mega-cap'): 'Short Term',
    ('Equity', 'Nano-cap'): 'Equity – Agg',
    ('Equity', 'Small-cap'): 'Equity – Agg',
    ('Equity', 'Micro-cap'): 'Equity – Mod',
    ('Equity', 'Mid-cap'): 'Equity – Mod',
    ('Equity', 'Large-cap'): 'Balanced',
    ('Equity', 'Mega-cap'): 'Balanced',
    ('Asset Allocation', 'Nano-cap'): 'Balanced',
    ('Asset Allocation', 'Small-cap'): 'Balanced',
    ('Asset Allocation', 'Micro-cap'): 'Balanced',
    ('Asset Allocation', 'Mid-cap'): 'Balanced',
    ('Asset Allocation', 'Large-cap'): 'Balanced',
    ('Asset Allocation', 'Mega-cap'): 'Balanced',
    ('Alternatives', 'Nano-cap'): 'Liquid',
    ('Alternatives', 'Small-cap'): 'Liquid',
    ('Alternatives', 'Micro-cap'): 'Liquid',
    ('Alternatives', 'Mid-cap'): 'Liquid',
    ('Alternatives', 'Large-cap'): 'Liquid',
    ('Alternatives', 'Mega-cap'): 'Liquid',
    ('Currency', 'Nano-cap'): 'Liquid',
    ('Currency', 'Small-cap'): 'Liquid',
    ('Currency', 'Micro-cap'): 'Liquid',
    ('Currency', 'Mid-cap'): 'Liquid',
    ('Currency', 'Large-cap'): 'Liquid',
    ('Currency', 'Mega-cap'): 'Liquid'
}


In [49]:
def assign_risk_type(row):
    # Retrieve the asset type and cap size from the row
    asset_type = row['Type']
    cap_size = row['Cap Size']
    # Use the mapping to find the corresponding risk type
    return risk_mapping.get((asset_type, cap_size), 'Unknown')  # Default to 'Unknown' if no match

# Apply the function to each row in the DataFrame
df['Risk Type'] = df.apply(assign_risk_type, axis=1)


In [53]:
# Check the distribution of risk types
print(df['Risk Type'].value_counts())

# Optionally, view some rows to ensure the mapping is correct
print(df[['Symbol', 'Name', 'Type', 'Cap Size', 'Risk Type']].head())

# Save the categorized DataFrame to a CSV file
df.to_csv('final_categorized_etfs_with_risk.csv', index=False)


Risk Type
Equity – Agg    1655
Equity – Mod    1035
Income           823
Balanced         278
Liquid           124
Short Term       102
Unknown           10
Name: count, dtype: int64
  Symbol                                            Name          Type  \
0    AAA  Alternative Access First Priority CLO Bond ETF  Fixed Income   
1   AAAU                 Goldman Sachs Physical Gold ETF     Commodity   
2   AADR             AdvisorShares Dorsey Wright ADR ETF        Equity   
3   AAPB            GraniteShares 2x Long AAPL Daily ETF        Equity   
4   AAPD          Direxion Daily AAPL Bear 1X Shares ETF        Equity   

    Cap Size     Risk Type  
0   Nano-cap        Income  
1  Small-cap    Short Term  
2   Nano-cap  Equity – Agg  
3   Nano-cap  Equity – Agg  
4   Nano-cap  Equity – Agg  
