<a href="https://colab.research.google.com/github/Risskr/Stock-App/blob/Production_v2/%20Nightly_Data_Pull.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Section 1: Set Up**

In [None]:
#------Imports--------#
import pandas as pd
import datetime
import pickle
import numpy as np
import requests
import time
import os

#-------Froms-------#
from tqdm.notebook import tqdm
from datetime import datetime, timedelta

In [None]:
# Google Cloud Setup

import functions_framework # Required for all Cloud Functions
import pandas as pd      # For creating and manipulating DataFrames
from google.cloud import storage # To interact with Google Cloud Storage
import os                # To read environment variables
import datetime          # To add a timestamp to our file name

# --- CONFIGURATION ---
# Bucket that receives every CSV produced by this notebook. The environment
# variable (set at deployment via --set-env-vars) wins; otherwise the default
# below is used.
GCS_BUCKET_NAME = os.environ.get('GCS_BUCKET_NAME', 'solar_system_bucket')
# Prefix for data file names.
GCS_FILE_PREFIX = os.environ.get('GCS_FILE_PREFIX', 'solar_system_')
GCS_FILE_EXTENSION = ".csv"  # All outputs are written as CSV.

# Names used by the upload steps further down. They reuse the values resolved
# above so that they fall back to the defaults instead of becoming None when
# the environment variables are not set.
actual_gcs_bucket_name = GCS_BUCKET_NAME
actual_gcs_file_prefix = GCS_FILE_PREFIX
# --- END CONFIGURATION ---

# EODHD nasdaq_df API

In [None]:
# Ensure the last 6 months of EODHD end-of-day data is cached in
# SAVE_PATH_NASDAQ, fetching any weekdays that are not in the cache yet.
# Savefile
"""
Returns:
nasdaq_df:
  <class 'pandas.core.frame.DataFrame'>
  RangeIndex: 627225 entries, 0 to 627224
  Data columns (total 8 columns):
  #   Column          Non-Null Count   Dtype
  ---  ------          --------------   -----
  0   date            627225 non-null  datetime64[ns]
  1   ticker          627098 non-null  object
  2   open            627225 non-null  float64
  3   high            627225 non-null  float64
  4   low             627225 non-null  float64
  5   close           627225 non-null  float64
  6   adjusted_close  627225 non-null  float64
  7   volume          627225 non-null  float64
"""
# Re-imported here on purpose: an earlier cell runs `import datetime`, which
# rebinds the name `datetime` to the module and would break `datetime.now()`.
from datetime import datetime, timedelta, timezone

#API_KEY = 'demo'  # Replace with your API key if not using demo
# NOTE(security): prefer supplying the key through the EODHD_API_KEY
# environment variable; the literal fallback is kept only so existing runs
# keep working and should be rotated/removed.
API_KEY = os.environ.get('EODHD_API_KEY', '68433aff09ea73.10710364')
EXCHANGE = 'NASDAQ'
DAYS_BACK = 180  # Approx. 6 months
# Local CSV cache of everything downloaded so far. It was previously only
# present as a comment, which made every use below raise NameError.
SAVE_PATH_NASDAQ = os.environ.get(
    'SAVE_PATH_NASDAQ',
    '/content/drive/MyDrive/Colab Notebooks/Production/nasdaq_df.csv',
)
MAX_CALLS_PER_RUN = 200  # Use 1–5 for the free tier
SECONDS_BETWEEN_CALLS = 0  # Add delay to be respectful
REQUEST_TIMEOUT_SECONDS = 30  # Never hang forever on a stalled connection

# Generate past 6 months of weekdays (oldest first)
today = datetime.now(timezone.utc).date()
dates = [today - timedelta(days=i) for i in range(DAYS_BACK)]
dates = sorted([d for d in dates if d.weekday() < 5])  # Keep only weekdays

# Load already-downloaded dates if the cache file exists
downloaded_dates = set()
df_existing = None
if os.path.exists(SAVE_PATH_NASDAQ):
    df_existing = pd.read_csv(SAVE_PATH_NASDAQ)
    df_existing['date'] = pd.to_datetime(df_existing['date'])
    downloaded_dates = set(df_existing['date'].dt.date)

# Filter to only dates we haven’t downloaded
pending_dates = [d for d in dates if d not in downloaded_dates]

# Prepare data holder
all_data = []

# Cap the number of API calls for this run (oldest pending dates first)
dates_to_fetch = pending_dates[:MAX_CALLS_PER_RUN]

for i, date in enumerate(dates_to_fetch):
    date_str = date.strftime('%Y-%m-%d')
    url = f'https://eodhd.com/api/eod-bulk-last-day/{EXCHANGE}?api_token={API_KEY}&fmt=json&date={date_str}'
    print(f"[{i+1}] Fetching {date_str}...")

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        day_data = response.json()

        for entry in day_data:
            all_data.append({
                'date': entry.get('date'),
                'ticker': entry.get('code'),
                'open': entry.get('open'),
                'high': entry.get('high'),
                'low': entry.get('low'),
                'close': entry.get('close'),
                'adjusted_close': entry.get('adjusted_close'),
                'volume': entry.get('volume'),
            })

        # Respect the delay
        if i < len(dates_to_fetch) - 1:
            time.sleep(SECONDS_BETWEEN_CALLS)

    except Exception as e:
        # Best effort per date: report the failure and continue with the next
        # date; the failed date stays pending and is retried on the next run.
        print(f"⚠️ Error on {date_str}: {e}")

# Append or save the new data
if all_data:
    df_new = pd.DataFrame(all_data)
    # Ensure the date column in df_new is in datetime format for concatenating
    df_new['date'] = pd.to_datetime(df_new['date'])

    if df_existing is not None:
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        df_combined = df_new

    # Write to CSV
    df_combined.to_csv(SAVE_PATH_NASDAQ, index=False)
    print(f"✅ Data for {len(all_data)} entries added to {SAVE_PATH_NASDAQ}")

elif downloaded_dates:
     print("⚠️ No new data fetched, but existing data found.")

else:
    print("⚠️ No new data fetched and no existing data found.")

# Everything downstream needs nasdaq_df, so fail with a clear message when
# there is neither fresh nor cached data.
if not os.path.exists(SAVE_PATH_NASDAQ):
    raise FileNotFoundError(
        f"No NASDAQ data available: nothing was fetched and {SAVE_PATH_NASDAQ} does not exist."
    )

# Read CSV (round-trips through the file so dtypes match a cached run)
nasdaq_df = pd.read_csv(SAVE_PATH_NASDAQ)

# Save the latest date in YYYYMMDD format
nasdaq_df['date'] = pd.to_datetime(nasdaq_df['date'])
latest_date_nasdaq_data = nasdaq_df['date'].max().strftime('%Y%m%d')


#----------Save to Cloud Storage-----------
# The object is stored in the bucket as "<base_file_name>_<extension>".
base_file_name = "nasdaq_df"
gcs_file_name = f"{base_file_name}_{GCS_FILE_EXTENSION}"
print(f"Preparing to save data to GCS: gs://{actual_gcs_bucket_name}/{gcs_file_name}")

# Serialise the DataFrame to CSV text in memory (no local file, no index column).
csv_string = nasdaq_df.to_csv(index=False)

# Resolve the destination object: client -> bucket -> blob.
storage_client = storage.Client()
bucket = storage_client.bucket(actual_gcs_bucket_name)
blob = bucket.blob(gcs_file_name)

# Upload the CSV text.
blob.upload_from_string(csv_string, content_type='text/csv')
print(f"Successfully saved data to gs://{actual_gcs_bucket_name}/{gcs_file_name}")
#----END File save-------

⚠️ No new data fetched, but existing data found.


# EODHD screener_data_df API

In [None]:
# Get list of Tickers with a min market cap and a list of common stocks
# Savefile
"""
Returns:
screener_data_df:
  <class 'pandas.core.frame.DataFrame'>
  RangeIndex: 262 entries, 0 to 261
  Data columns (total 22 columns):
  #   Column                 Non-Null Count  Dtype
  ---  ------                 --------------  -----
  0   code                   262 non-null    object
  1   name                   262 non-null    object
  2   last_day_data_date     262 non-null    object
  3   adjusted_close         262 non-null    float64
  4   refund_1d              262 non-null    float64
  5   refund_1d_p            262 non-null    float64
  6   refund_5d              262 non-null    float64
  7   refund_5d_p            262 non-null    float64
  8   exchange               262 non-null    object
  9   currency_symbol        262 non-null    object
  10  market_capitalization  262 non-null    int64
  11  earnings_share         262 non-null    float64
  12  dividend_yield         145 non-null    float64
  13  sector                 262 non-null    object
  14  industry               262 non-null    object
  15  avgvol_1d              262 non-null    int64
  16  avgvol_200d            262 non-null    float64
  17  Country                262 non-null    object
  18  Exchange               262 non-null    object
  19  Currency               262 non-null    object
  20  Type                   262 non-null    object
  21  last_day_change        262 non-null    float64

"""

# -------------------- CONFIG --------------------
# NOTE(security): prefer supplying the key through the EODHD_API_KEY
# environment variable; the literal fallback is kept only so existing runs
# keep working and should be rotated/removed.
API_KEY = os.environ.get('EODHD_API_KEY', '68433aff09ea73.10710364')
MIN_MARKET_CAP = 10_000_000_000  # Changeable: $1B, $10B, etc.
EXCHANGE = 'NASDAQ'
RESULTS_PER_PAGE = 500  # Page size requested from the screener endpoint
SAVE_PATH_SCREENER = '/content/drive/MyDrive/Colab Notebooks/Production/screener_data_df.csv'
# ------------------------------------------------

def get_filtered_nasdaq_stocks(api_key, min_cap, exchange="NASDAQ"):
    """
    Page through the EODHD screener endpoint and collect every match.

    Args:
      api_key: EODHD API token.
      min_cap: Minimum market capitalization a stock must have.
      exchange: Exchange code to filter on.

    Returns:
      A pandas DataFrame with one row per entry of the response's "data"
      list, across all pages (empty if nothing matched).

    Raises:
      requests.HTTPError: if the API answers with an HTTP error status.
      requests.Timeout: if a request does not complete within 30 seconds.
    """
    all_data = []
    offset = 0

    while True:
        url = (
            "https://eodhd.com/api/screener"
            f"?api_token={api_key}"
            f"&filters=["
            f'["exchange","=","{exchange}"],'
            #f'["type","=","Common Stock"],'
            f'["market_capitalization",">=",{min_cap}]'
            f"]"
            f"&sort=market_capitalization.desc"
            f"&limit={RESULTS_PER_PAGE}&offset={offset}&fmt=json"
        )

        response = requests.get(url, timeout=30)
        # Surface HTTP errors instead of silently treating an error payload as
        # "no more results".
        response.raise_for_status()
        batch = response.json().get("data", [])

        if not batch:
            break

        all_data.extend(batch)
        # Advance by the number of rows actually received so that no rows are
        # skipped if the server returns fewer rows than the requested limit.
        offset += len(batch)

    return pd.DataFrame(all_data)

# Run filter to get screener_data_df
screener_data_df = get_filtered_nasdaq_stocks(API_KEY, MIN_MARKET_CAP)

# Get exchange symbol list with type. Check the HTTP status before parsing so
# an error payload cannot end up as a malformed DataFrame.
meta_url = f'https://eodhd.com/api/exchange-symbol-list/NASDAQ?api_token={API_KEY}&fmt=json'
meta_response = requests.get(meta_url, timeout=30)
meta_response.raise_for_status()
meta_df = pd.DataFrame(meta_response.json())
common_df = meta_df[meta_df['Type'] == 'Common Stock'].copy()  # .copy() avoids SettingWithCopyWarning

# Combine screener_data_df with relevant columns from common_df. The inner
# join keeps only screener rows whose code is listed as a common stock.
screener_data_df = pd.merge(
    screener_data_df,
    common_df[['Code', 'Country', 'Exchange', 'Currency', 'Type']],
    left_on='code',
    right_on='Code',
    how='inner'
).drop(columns='Code')

# --- Calculate Daily Change ---
# Ensure 'date' column is datetime and sort by ticker, then date
nasdaq_df['date'] = pd.to_datetime(nasdaq_df['date'])
temp_nasdaq_sorted = nasdaq_df.sort_values(by=['ticker', 'date']).copy()

# Previous row's adjusted close within each ticker
temp_nasdaq_sorted['prev_adjusted_close'] = temp_nasdaq_sorted.groupby('ticker')['adjusted_close'].shift(1)

# Daily change as a percentage of the previous adjusted close
temp_nasdaq_sorted['daily_change'] = (
    (temp_nasdaq_sorted['adjusted_close'] - temp_nasdaq_sorted['prev_adjusted_close'])
    / temp_nasdaq_sorted['prev_adjusted_close']
) * 100

# Most recent row for each ticker
last_day_data_per_ticker = temp_nasdaq_sorted.groupby('ticker').tail(1).copy()

# ticker -> most recent daily change
last_day_changes = last_day_data_per_ticker.set_index('ticker')['daily_change'].to_dict()

# Add the last day change to screener_data_df (NaN when a code has no price rows)
screener_data_df['last_day_change'] = screener_data_df['code'].map(last_day_changes)

# # Save screener data to csv
# screener_data_df.to_csv(SAVE_PATH_SCREENER, index=False)

#----------Save to Cloud Storage-----------
# The object is stored in the bucket as "<base_file_name>_<extension>".
base_file_name = "screener_data_df"
gcs_file_name = f"{base_file_name}_{GCS_FILE_EXTENSION}"
print(f"Preparing to save data to GCS: gs://{actual_gcs_bucket_name}/{gcs_file_name}")

# Serialise the DataFrame to CSV text in memory (no local file, no index column).
csv_string = screener_data_df.to_csv(index=False)

# Resolve the destination object: client -> bucket -> blob.
storage_client = storage.Client()
bucket = storage_client.bucket(actual_gcs_bucket_name)
blob = bucket.blob(gcs_file_name)

# Upload the CSV text.
blob.upload_from_string(csv_string, content_type='text/csv')
print(f"Successfully saved data to gs://{actual_gcs_bucket_name}/{gcs_file_name}")
#----END File save-----

## Filter nasdaq data for screener_data_df and type: common

In [None]:
# Filter the NASDAQ price data down to tickers that appear in
# screener_data_df with Type 'Common Stock', and keep the result as
# filtered_nasdaq_df.

from datetime import datetime, timedelta

# Local CSV copy of the filtered data. It was previously only present as a
# comment, which made the to_csv call below raise NameError.
SAVE_PATH_FILTERED_NASDAQ = os.environ.get(
    'SAVE_PATH_FILTERED_NASDAQ',
    '/content/drive/MyDrive/Colab Notebooks/Production/filtered_nasdaq_df.csv',
)

# nasdaq_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/nasdaq_bulk_eod.csv")
nasdaq_df['date'] = pd.to_datetime(nasdaq_df['date'])

# Codes of screener rows that are common stock
common_stock_codes = screener_data_df.loc[screener_data_df['Type'] == 'Common Stock', 'code']

# .copy() makes this an independent frame so that later column assignments on
# it do not trigger SettingWithCopyWarning.
filtered_nasdaq_df = nasdaq_df[nasdaq_df['ticker'].isin(common_stock_codes)].copy()

# Save filtered nasdaq data to csv
filtered_nasdaq_df.to_csv(SAVE_PATH_FILTERED_NASDAQ, index=False)

#----------Save to Cloud Storage-----------
# Reuse the values resolved in the configuration cell so the defaults apply
# when the environment variables are not set (os.getenv alone returned None).
actual_gcs_bucket_name = GCS_BUCKET_NAME
actual_gcs_file_prefix = GCS_FILE_PREFIX

# The object is stored in the bucket as "<base_file_name>_<extension>".
base_file_name = "filtered_nasdaq_df"
gcs_file_name = f"{base_file_name}_{GCS_FILE_EXTENSION}"

print(f"Preparing to save data to GCS: gs://{actual_gcs_bucket_name}/{gcs_file_name}")

# Step 1: Get a GCS client and the destination blob
storage_client = storage.Client()
bucket = storage_client.bucket(actual_gcs_bucket_name)
blob = bucket.blob(gcs_file_name)

# Step 2: Serialise the DataFrame to CSV text in memory (no index column)
csv_string = filtered_nasdaq_df.to_csv(index=False)

# Step 3: Upload the CSV text to GCS
blob.upload_from_string(csv_string, content_type='text/csv')

print(f"Successfully saved data to gs://{actual_gcs_bucket_name}/{gcs_file_name}")

# Function: Correlation Coefficient for entire stock data set

In [None]:
#return six month and three month spearman correlations for all unique pairs of stocks

import pandas as pd
from datetime import datetime # Import datetime
from tqdm.notebook import tqdm # Import tqdm

# ## Function: Correlation Coefficient for entire stock data set
def calculate_lagged_correlation(df, lag_days, range_months):
  """
  Calculates the pairwise Spearman correlation coefficient between all stocks
  in a DataFrame for a specified period with a given lag.

  The analysis window ends at the current time (datetime.now()) and starts
  range_months earlier. Rows with volume <= 0 are ignored. For each ordered
  pair (A, B) the two price series are inner-joined on date, B is shifted by
  lag_days rows of the joined data, and A is correlated with the shifted B.

  The input DataFrame is not modified.

  Args:
    df: DataFrame with 'date', 'ticker', 'adjusted_close' and 'volume' columns.
    lag_days: The number of rows (trading days) to lag the second stock's data.
    range_months: The number of months to include in the analysis period.

  Returns:
    correlation_matrix: A pandas DataFrame with the following
      Index: Tickers (stock A)
      Columns: Tickers (lagged stock B)
      Values: Spearman correlation coefficients between stocks; the diagonal
        is left as NaN.
  """
  # Analysis window: [now - range_months, now]
  end_datetime = datetime.now()
  start_datetime = end_datetime - pd.DateOffset(months=range_months)

  # assign() returns a new frame, so the caller's DataFrame is left untouched
  # while 'date' is still guaranteed to be datetime for the comparisons below.
  working_df = df.assign(date=pd.to_datetime(df['date']))

  # Filter the DataFrame for the specified date range
  filtered_df = working_df[
      (working_df['date'] >= start_datetime) & (working_df['date'] <= end_datetime)
  ]

  # Filter out rows where volume is 0
  filtered_df = filtered_df[filtered_df['volume'] > 0].copy()

  # Get unique tickers in the filtered data
  tickers = filtered_df['ticker'].unique()

  # Create an empty (all-NaN) DataFrame to store correlation results
  correlation_matrix = pd.DataFrame(index=tickers, columns=tickers, dtype=float)

  # Build each ticker's date-indexed price series once, instead of re-filtering
  # the whole frame for every pair. Sorting by date makes the row shift below
  # a true chronological lag regardless of the input row order.
  close_series = [
      filtered_df.loc[filtered_df['ticker'] == ticker]
      .set_index('date')['adjusted_close']
      .sort_index(kind='stable')
      for ticker in tickers
  ]

  # Iterate through all ordered pairs of tickers with a progress bar
  for i, ticker_a in enumerate(tqdm(tickers, desc="Calculating correlations")):
    stock_a_data = close_series[i]

    for j, stock_b_data in enumerate(close_series):
      if i == j:
        continue  # A ticker is not correlated with itself; cell stays NaN.

      # Align the two series on the dates they share. Both series are named
      # 'adjusted_close', so the suffixes distinguish the resulting columns.
      aligned_data = pd.merge(stock_a_data, stock_b_data,
                              left_index=True, right_index=True,
                              how='inner', suffixes=('_A', '_B'))

      # Apply the lag to stock B within the aligned rows
      lagged_stock_b_data = aligned_data['adjusted_close_B'].shift(lag_days)

      # Series.corr ignores the NaN rows introduced by the shift
      correlation_matrix.iat[i, j] = aligned_data['adjusted_close_A'].corr(
          lagged_stock_b_data, method='spearman'
      )

  return correlation_matrix


## Run Correlation Function

In [None]:
# If the nasdaq file was updated, then run the correlation function;
# otherwise reload the previously saved correlation matrices.
# Save File
"""
Returns:
  three_month_spearman_lagged_correlations and six_month_spearman_lagged_correlations: A pandas DataFrame with the following
    Index: Tickers
    Columns: Tickers
    Values: Spearman correlation coefficients between stocks
"""
import io
import pandas as pd
import os
from datetime import datetime, timedelta

# Local (Google Drive) locations of previously saved matrices
three_month_file = f'/content/drive/MyDrive/Colab Notebooks/Production/three_month_spearman_lagged_correlation.csv'
six_month_file = f'/content/drive/MyDrive/Colab Notebooks/Production/six_month_spearman_lagged_correlation.csv'


#----------Save to Cloud Storage-----------
# Fall back to the default from the configuration cell when the environment
# variable is not set (os.getenv alone returned None in that case).
actual_gcs_bucket_name = os.getenv('GCS_BUCKET_NAME', GCS_BUCKET_NAME)

# Objects are stored in the bucket as "<base_file_name>_<extension>".
base_file_name_three = "three_month_spearman_lagged_correlation"
base_file_name_six = "six_month_spearman_lagged_correlation"

gcs_file_name_three = f"{base_file_name_three}_{GCS_FILE_EXTENSION}"
gcs_file_name_six = f"{base_file_name_six}_{GCS_FILE_EXTENSION}"

print(f"Preparing to save data to GCS: gs://{actual_gcs_bucket_name}/{gcs_file_name_three}")
print(f"Preparing to save data to GCS: gs://{actual_gcs_bucket_name}/{gcs_file_name_six}")

# Step 1: Get a GCS client and one blob per matrix
storage_client = storage.Client()
bucket = storage_client.bucket(actual_gcs_bucket_name)
blob_three = bucket.blob(gcs_file_name_three)
blob_six = bucket.blob(gcs_file_name_six)


# Check if new data was fetched in the NASDAQ download step
if all_data:
    print("New data fetched. Calculating correlations...")
    # Ensure 'filtered_nasdaq_df' is defined from the preceding code
    if 'filtered_nasdaq_df' in locals():
        three_month_spearman_lagged_correlations = calculate_lagged_correlation(filtered_nasdaq_df, lag_days=1, range_months=3)
        six_month_spearman_lagged_correlations = calculate_lagged_correlation(filtered_nasdaq_df, lag_days=1, range_months=6)


        # #OLD Save files
        # three_month_spearman_lagged_correlations.to_csv(three_month_file)
        # six_month_spearman_lagged_correlations.to_csv(six_month_file)

        # Step 2: Serialise to CSV text in memory. The index is kept because it
        # holds the row tickers and the reload path reads it with index_col=0.
        csv_string_three = three_month_spearman_lagged_correlations.to_csv()
        csv_string_six = six_month_spearman_lagged_correlations.to_csv()

        # Step 3: Upload each matrix to its own blob. Using the leftover `blob`
        # variable here would overwrite the object written by the previous cell.
        blob_three.upload_from_string(csv_string_three, content_type='text/csv')
        blob_six.upload_from_string(csv_string_six, content_type='text/csv')
        print(f"Successfully saved data to gs://{actual_gcs_bucket_name}/{gcs_file_name_three}")
        print(f"Successfully saved data to gs://{actual_gcs_bucket_name}/{gcs_file_name_six}")

    else:
        print("Error: 'filtered_nasdaq_df' is not defined. Please ensure the preceding code ran correctly.")
else:
    # No new data: reuse previously computed matrices, local files first.
    try:
        three_month_spearman_lagged_correlations = pd.read_csv(three_month_file, index_col=0)
        print(f"Loaded existing file: {three_month_file}")
        six_month_spearman_lagged_correlations = pd.read_csv(six_month_file, index_col=0)
        print(f"Loaded existing file: {six_month_file}")

    except FileNotFoundError:
        # Local copies are missing; fall back to the copies stored in GCS.
        if blob_three.exists() and blob_six.exists():
            three_month_spearman_lagged_correlations = pd.read_csv(
                io.StringIO(blob_three.download_as_text()), index_col=0
            )
            print(f"Loaded existing file: gs://{actual_gcs_bucket_name}/{gcs_file_name_three}")
            six_month_spearman_lagged_correlations = pd.read_csv(
                io.StringIO(blob_six.download_as_text()), index_col=0
            )
            print(f"Loaded existing file: gs://{actual_gcs_bucket_name}/{gcs_file_name_six}")
        else:
            print("No new data fetched and one or both correlation files not found. Cannot proceed with correlation calculation.")

Loaded existing file: /content/drive/MyDrive/Colab Notebooks/Production/three_month_spearman_lagged_correlation.csv
Loaded existing file: /content/drive/MyDrive/Colab Notebooks/Production/six_month_spearman_lagged_correlation.csv


## Process correlated Data and get gravitational scores

In [None]:
import pandas as pd
import numpy as np

# Function to process stock correlation data, calculate gravitational forces,
# and filter connections for visualization based on the force.
def process_and_score_stocks(
    six_month_correlations,
    three_month_correlations,
    screener_data_df,
    source_ticker,
    min_nodes,
    max_nodes,
    threshold_percent
):
    """
    Processes stock correlation data for a specific source ticker.

    Keeps the targets that are positively correlated with the source in both
    the 6-month and 3-month matrices, scores each one with a
    "gravitational_force" (weighted correlation times log market cap), filters
    the connections by force, and derives the net force, the maximum potential
    force and the visualization parameters.

    Args:
      six_month_correlations: The six-month spearman lagged correlation matrix
        (index and columns are tickers).
      three_month_correlations: The three-month spearman lagged correlation matrix.
      screener_data_df: DataFrame with at least the columns 'code',
        'market_capitalization' and 'last_day_change'.
      source_ticker: The ticker symbol for which to process data.
      min_nodes: Minimum number of correlated stocks to return.
      max_nodes: Maximum number of correlated stocks to return.
      threshold_percent: A fraction (0.0 to 1.0) of the maximum force; only
        connections at or above it pass the filter.

    Returns:
      processed_data_df: A pandas DataFrame with one row per retained target.
      source_data_df: A one-row pandas DataFrame with 'ticker',
        'net_gravitational_force', 'max_potential_force',
        'gravitational_impact', 'source_market_cap_influence' and
        'source_planet_radius'.
      Both are empty DataFrames when no usable connection exists.

    Raises:
      ValueError: if screener_data_df lacks a required column.
    """
    # --- Data Unpivoting and Initial Setup ---
    # Only the source ticker's row is needed, so select it before unpivoting
    # instead of stacking the whole matrix on every call.
    source_rows = six_month_correlations[six_month_correlations.index == source_ticker]
    if source_rows.empty:
        print(f"No correlation data found for source ticker {source_ticker}.")
        return pd.DataFrame(), pd.DataFrame()

    stacked = source_rows.rename_axis('source', axis=0).stack()
    if stacked.empty:
        print(f"No correlation data found for source ticker {source_ticker}.")
        return pd.DataFrame(), pd.DataFrame()

    source_connections = stacked.reset_index()
    source_connections.columns = ['source', 'target', 'six_month_spearman_correlation']

    # Drop the self-connection
    source_connections = source_connections[source_connections['target'] != source_ticker].copy()
    if source_connections.empty:
        print(f"No correlation data found for source ticker {source_ticker}.")
        return pd.DataFrame(), pd.DataFrame()

    # Add 3-month correlation data before filtering (0 when the pair is absent
    # from the 3-month matrix)
    def _three_month_value(target):
        if source_ticker in three_month_correlations.index and target in three_month_correlations.columns:
            return three_month_correlations.loc[source_ticker, target]
        return 0

    source_connections['three_month_spearman_correlation'] = source_connections['target'].map(_three_month_value)

    # We only care about positively correlated stocks for this model in both 6 and 3 month periods
    positive_corr_group = source_connections[
        (source_connections['six_month_spearman_correlation'] > 0) &
        (source_connections['three_month_spearman_correlation'] > 0)
    ].copy()

    if positive_corr_group.empty:
        print(f"No positive correlations found for source ticker {source_ticker}.")
        return pd.DataFrame(), pd.DataFrame()

    # --- Enrich Data (before filtering) ---
    required_screener_cols = ['code', 'market_capitalization', 'last_day_change']
    missing = [col for col in required_screener_cols if col not in screener_data_df.columns]
    if missing:
        raise ValueError(f"screener_data_df is missing required columns: {missing}")

    screener_info = screener_data_df[required_screener_cols].rename(columns={'code': 'target'})
    positive_corr_group = pd.merge(positive_corr_group, screener_info, on='target', how='left')
    # Targets without market data cannot be scored
    positive_corr_group.dropna(subset=['market_capitalization', 'last_day_change'], inplace=True)
    if positive_corr_group.empty:
        print(f"No valid connections after merging screener data for {source_ticker}.")
        return pd.DataFrame(), pd.DataFrame()

    # --- Calculate Dynamic Impact Score (Gravitational Force) ---
    epsilon = 1e-9  # Small value to avoid log(0) issues.
    # Weights for recency bias
    w_3m = 0.6
    w_6m = 0.4
    # "unified_correlation" is a weighted average of recent correlations.
    positive_corr_group['unified_correlation'] = (
        w_3m * positive_corr_group['three_month_spearman_correlation'] +
        w_6m * positive_corr_group['six_month_spearman_correlation']
    )

    positive_corr_group['Market Cap'] = positive_corr_group['market_capitalization']

    # --- Calculate source ticker's market cap and log cap ---
    source_screener_info = screener_data_df[screener_data_df['code'] == source_ticker]
    if not source_screener_info.empty and 'market_capitalization' in source_screener_info.columns:
        source_market_cap = source_screener_info['market_capitalization'].iloc[0]
    else:
        source_market_cap = epsilon  # Source not in the screener data
    source_log_cap = np.log(max(source_market_cap, epsilon))

    # Log market caps for all relevant tickers (targets plus source)
    all_market_caps = positive_corr_group['Market Cap'].tolist()
    all_market_caps.append(source_market_cap)
    log_caps = np.log(pd.Series(all_market_caps).clip(lower=epsilon))
    log_cap_range = log_caps.max() - log_caps.min()

    # Market cap influence of each target is its log market cap
    if log_cap_range > 0:
        positive_corr_group['market_cap_influence'] = np.log(positive_corr_group['Market Cap'].clip(lower=epsilon))
    else:
        positive_corr_group['market_cap_influence'] = 20  # Neutral value if all caps are the same

    # The `gravitational_force` is a product of recent correlation strength and market influence.
    correlation_weight_factor = 1.0  # Factor to scale the influence of unified_correlation
    positive_corr_group['gravitational_force'] = (
        (positive_corr_group['unified_correlation'] * correlation_weight_factor) *
        positive_corr_group['market_cap_influence']
    )

    # --- Apply Filtering ---
    max_abs_force = positive_corr_group['gravitational_force'].abs().max()
    if pd.isna(max_abs_force) or max_abs_force == 0:
        return pd.DataFrame(), pd.DataFrame()

    force_threshold = max_abs_force * threshold_percent
    filtered_by_force_threshold = positive_corr_group[
        positive_corr_group['gravitational_force'].abs() >= force_threshold
    ].copy()

    # Enforce min/max node constraints
    if len(filtered_by_force_threshold) < min_nodes:
        # Too few passed the threshold: take the strongest min_nodes overall
        final_filtered_df = positive_corr_group.sort_values(by='gravitational_force', key=abs, ascending=False).head(min_nodes).copy()
    elif len(filtered_by_force_threshold) > max_nodes:
        final_filtered_df = filtered_by_force_threshold.sort_values(by='gravitational_force', key=abs, ascending=False).head(max_nodes).copy()
    else:
        final_filtered_df = filtered_by_force_threshold.copy()

    if final_filtered_df.empty:
        print(f"No connections remained for {source_ticker} after filtering.")
        return pd.DataFrame(), pd.DataFrame()

    # --- Calculate Final Net Force ---
    final_filtered_df['Daily Change'] = final_filtered_df['last_day_change']

    # The force takes the sign of the target's last daily change
    final_filtered_df['signed_gravitational_force'] = np.where(
        final_filtered_df['Daily Change'] >= 0,
        final_filtered_df['gravitational_force'],
        -final_filtered_df['gravitational_force'],
    )

    net_gravitational_force = final_filtered_df['signed_gravitational_force'].sum()
    max_potential_force = final_filtered_df['market_cap_influence'].sum()

    # --- Calculate Visualization Parameters ---
    # Orbital Radius is reverse-scaled: the strongest force gets 0, the weakest 1.
    min_force = final_filtered_df['gravitational_force'].min()
    max_force = final_filtered_df['gravitational_force'].max()
    force_range = max_force - min_force
    if force_range > 0:
        final_filtered_df['Orbital Radius'] = 1 - ((final_filtered_df['gravitational_force'] - min_force) / force_range)
    else:
        # All forces are the same (e.g. a single connection): use the neutral
        # value. Previously the range was forced to 1.0 first, which made this
        # branch unreachable and produced 1.0 instead.
        final_filtered_df['Orbital Radius'] = 0.5

    # -----Calculate Planet Radius------
    # Combine retained target caps with the source cap to find the true
    # min and max for normalization
    all_caps = pd.concat([
        final_filtered_df['Market Cap'],
        pd.Series([source_market_cap])
    ], ignore_index=True)

    # Calculate the log, clipping to avoid errors with zero
    log_all_caps = np.log(all_caps.clip(lower=epsilon))
    min_log_cap = log_all_caps.min()
    max_log_cap = log_all_caps.max()
    log_cap_range = max_log_cap - min_log_cap

    if log_cap_range > 0:
        log_df_caps = np.log(final_filtered_df['Market Cap'].clip(lower=epsilon))
        final_filtered_df['Planet Radius'] = (log_df_caps - min_log_cap) / log_cap_range
        source_planet_radius = (source_log_cap - min_log_cap) / log_cap_range
    else:
        # If all values are the same, assign a default radius
        final_filtered_df['Planet Radius'] = 0.5
        source_planet_radius = 0.5

    # --- Final Cleanup and Column Selection ---
    # "gravitational_percent" shows the relative % contribution of each stock.
    final_filtered_df['gravitational_percent'] = (
        final_filtered_df['signed_gravitational_force'] / final_filtered_df['gravitational_force'].sum()
    ) * 100

    final_columns = [
        'source', 'target', 'Daily Change', 'six_month_spearman_correlation',
        'three_month_spearman_correlation', 'unified_correlation',
        'Orbital Radius', 'Market Cap', 'Planet Radius', 'market_cap_influence',
        'gravitational_force', 'signed_gravitational_force', 'gravitational_percent'
    ]

    gravitational_impact = (net_gravitational_force / max_potential_force) * 100 if max_potential_force > 0 else 0

    # Source influence is its log cap, or the neutral value when the retained
    # caps (including the source) are all the same
    source_market_cap_influence = 20 if log_cap_range <= 0 else source_log_cap

    source_data_df = pd.DataFrame([{
        'ticker': source_ticker,
        'net_gravitational_force': net_gravitational_force,
        'max_potential_force': max_potential_force,
        'gravitational_impact': gravitational_impact,
        'source_market_cap_influence': source_market_cap_influence,
        'source_planet_radius': source_planet_radius
    }])

    # reindex adds any missing output column as NaN and fixes the column order
    processed_data_df = final_filtered_df.reindex(columns=final_columns).copy()

    return processed_data_df, source_data_df


# ## ---------- MODIFIED: Run App ---------------
# min_nodes = 5
# max_nodes = 30
# threshold_percent = 0.9

# # User input Ticker
# source_ticker = 'AAPL'

# # Process the data for the network diagram
# processed_data_df, source_data_df = process_and_score_stocks(
#     six_month_spearman_lagged_correlations,
#     three_month_spearman_lagged_correlations,
#     screener_data_df,
#     source_ticker,
#     min_nodes,
#     max_nodes,
#     threshold_percent,
#     )

# # Extract the scalar values from the source_data_df for plotting
# net_gravitational_force = source_data_df['net_gravitational_force'].iloc[0]
# max_potential_force = source_data_df['max_potential_force'].iloc[0]
# gravitational_impact = source_data_df['gravitational_impact'].iloc[0]
# market_cap_influence = source_data_df['source_market_cap_influence'].iloc[0]
# source_planet_radius = source_data_df['source_planet_radius'].iloc[0]

# print(f"Net Gravitational Force: {net_gravitational_force:.2f}")
# print(f"Max Potential Gravitational Force: {max_potential_force:.2f}")
# print(f"Net Gravitaional Impact: {gravitational_impact:.2f}%")
# print(f"Source Market Cap Influence: {market_cap_influence}")
# print(f"Source Planet Radius: {source_planet_radius}")
# print('----------------------------------')
# processed_data_df

## List of Top Predictions

In [None]:
# SaveFile
#
# Purpose: run every ticker in the 6-month correlation table through
# process_and_score_stocks, collect the per-ticker gravitational metrics into
# gravitational_impact_df, and upload that table to Google Cloud Storage as CSV.

#imports
import pandas as pd

#Solar System Parameters
min_nodes = 5
max_nodes = 30
threshold_percent = 0.9

# Save Path (only referenced by the legacy save that is disabled below)
SAVE_PATH_TOP_PREDICTIONS = '/content/drive/MyDrive/Colab Notebooks/Production/top_gravitational_impacts.csv'

# The index of the 6-month correlation table supplies the tickers to score.
unified_correlation_df = six_month_spearman_lagged_correlations.copy()

# Column order of the output table; declared once so the CSV always carries a
# header row, even if no ticker produced a result.
GRAVITATIONAL_IMPACT_COLUMNS = [
    'ticker',
    'net_gravitational_force',
    'max_potential_force',
    'gravitational_impact',
]

top_gravitational_impacts = []

# Iterate through each ticker in the unified_correlation_df
for ticker in tqdm(unified_correlation_df.index, desc="Processing tickers for gravitational impact"):
    try:
        # process_and_score_stocks returns (processed_data_df, source_data_df);
        # only the one-row source_data_df is needed here.
        processed_df, source_data = process_and_score_stocks(
            six_month_spearman_lagged_correlations, # Pass the 6-month correlation
            three_month_spearman_lagged_correlations, # Pass the 3-month correlation
            screener_data_df,
            ticker, # Use the current ticker as source_ticker
            min_nodes,
            max_nodes,
            threshold_percent,
        )

        # Pull the scalar metrics out of the first row of source_data.
        if not source_data.empty:
            source_row = source_data.iloc[0]
            top_gravitational_impacts.append({
                'ticker': ticker,
                'net_gravitational_force': source_row['net_gravitational_force'],
                'max_potential_force': source_row['max_potential_force'],
                'gravitational_impact': source_row['gravitational_impact'],
            })
    except Exception as e:
        # Deliberate best-effort: one failing ticker must not abort the whole
        # run, so report it and move on to the next ticker.
        print(f"Error processing ticker {ticker}: {e}")

# Create a DataFrame from the results
gravitational_impact_df = pd.DataFrame(
    top_gravitational_impacts,
    columns=GRAVITATIONAL_IMPACT_COLUMNS,
)

# # OLD Save File
# gravitational_impact_df.to_csv(SAVE_PATH_TOP_PREDICTIONS, index=False)


#----------Save to Cloud Storage-----------
base_file_name = "gravitational_impact_df" # You can make this an env var too if you want!
# NOTE(review): this evaluates to "gravitational_impact_df_.csv" (underscore
# directly before the extension). Kept unchanged because other code may read
# the object under this exact name -- confirm before renaming.
gcs_file_name = f"{base_file_name}_{GCS_FILE_EXTENSION}"

# actual_gcs_bucket_name is read with os.getenv and is None when the
# GCS_BUCKET_NAME environment variable is not set; fall back to the
# GCS_BUCKET_NAME setting, which carries a default bucket name.
target_bucket_name = actual_gcs_bucket_name or GCS_BUCKET_NAME

print(f"Preparing to save data to GCS: gs://{target_bucket_name}/{gcs_file_name}")

# Step 1: Get a GCS client
storage_client = storage.Client()
bucket = storage_client.bucket(target_bucket_name)
blob = bucket.blob(gcs_file_name) # Define the "file" (blob) in your bucket

# Step 2: Convert the DataFrame to a CSV string in memory (no local file)
csv_string = gravitational_impact_df.to_csv(index=False)

# Step 3: Upload the CSV string to GCS
blob.upload_from_string(csv_string, content_type='text/csv')

print(f"Successfully saved data to gs://{target_bucket_name}/{gcs_file_name}")



Processing tickers for gravitational impact:   0%|          | 0/262 [00:00<?, ?it/s]