1. Setup

In [1]:
import pandas as pd
import numpy as np
import psycopg2
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
import time

# Define your date range and terms
# Get today's date (end_date)
end_date = (datetime.today() + timedelta(days=1)).strftime('%Y-%m-%d')

# Get start_date (90 days before today)
start_date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 200 * 86400))
# Calculate the extended date range for sentiment analysis
extended_start_date = pd.to_datetime(start_date) - pd.Timedelta(days=21)

# Convert start and end dates to Timestamp for consistency and compatibility
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)


grp_terms = ['SOL', 'KAS', 'LINK', 'ADA', 'MATIC', 'AVAX', 'POPCAT', 'SUI', 'HNT', 'WIF', 'BTC', 'DOGE','ETH' 
             ,'GME', 'NVDA','JPM', 'GOOGL','DXY', 'TSMC', 'CVX', 'COIN', 'AMZN', 'MSFT', 'NFLX', 'DIS', 'AAPL', 'TSLA'
            ]

# Initialize a dictionary to collect DataFrames for each term
combined_data_dict = {}
# Create dictionaries to store different types of data
price_data_dict = {}  # Stores the raw price data
technical_indicators_dict = {}  # Stores the extended data with technical indicators
filtered_data_dict = {}  # Stores the filtered data after applying the technical indicators

# Database connection parameters
db_params = {
    'dbname': 'twt_snt',
    'user': 'postgres',
    'password': 'Ilpmnl!69gg',
    'host': 'localhost',
    'port': '5432'
}

# Function to fetch tweets from the database
def fetch_tweets(start_date, end_date, term):
    try:
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        query = """
            SELECT * FROM twt_tbl
            WHERE term = %s AND date BETWEEN %s AND %s
        """
        cursor.execute(query, (term, extended_start_date, end_date))
        rows = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(rows, columns=columns)
        return df
    except Exception as e:
        print(f"Error fetching tweets: {e}")
        return None
    finally:
        cursor.close()
        conn.close()

# Function to fetch moving averages from the database
def fetch_moving_averages(extended_start_date, end_date, term):
    try:
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        query = """
            SELECT date, term, combined_compound_ma_7, combined_compound_ma_21, 
                   combined_compound_ma_50, combined_compound_ma_100, combined_compound_ma_200, combined_compound
            FROM snt_ma_blend_tbl
            WHERE term = %s AND date BETWEEN %s AND %s
        """
        cursor.execute(query, (term, extended_start_date, end_date))
        rows = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(rows, columns=columns)
        return df
    except Exception as e:
        print(f"Error fetching moving averages: {e}")
        return None
    finally:
        cursor.close()
        conn.close()

# Function to fetch data from the database
def fetch_price_data(start_date, end_date, term):
    try:
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        query = """
            SELECT date, term, open, high, low, close, adj_close, volume,
            close_ma_7, close_ma_21, close_ma_50, close_ma_100, close_ma_200
            FROM yahoo_price_tbl
            WHERE term = %s AND date BETWEEN %s AND %s
        """
        cursor.execute(query, (term, start_date, end_date))
        rows = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(rows, columns=columns)
        return df
    except Exception as e:
        print(f"Error fetching price data: {e}")
        return None
    finally:
        cursor.close()
        conn.close()

def fetch_signals(start_date, end_date, term):
    try:
        conn = psycopg2.connect(**db_params)
        cursor = conn.cursor()
        query = """
            SELECT date, term, buy, sell, neutral
            FROM signal_cnt_tbl
            WHERE term = %s AND date BETWEEN %s AND %s
        """
        cursor.execute(query, (term, start_date, end_date))
        rows = cursor.fetchall()
        columns = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(rows, columns=columns)

        # Ensure datetime format and indexing
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)

        return df
    except Exception as e:
        print(f"Error fetching signals for term {term}: {e}")
        return pd.DataFrame()
    finally:
        cursor.close()
        conn.close()        

# Functions to calculate technical indicators (as given)
def calculate_rsi(series, period=14):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0.0)
    loss = -delta.where(delta < 0, 0.0)
    avg_gain = gain.rolling(window=period, min_periods=1).mean()
    avg_loss = loss.rolling(window=period, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_stochastic_rsi(df, rsi_column, window=14):
    rsi_min = df[rsi_column].rolling(window=window, min_periods=1).min()
    rsi_max = df[rsi_column].rolling(window=window, min_periods=1).max()
    stoch_rsi = (df[rsi_column] - rsi_min) / (rsi_max - rsi_min)
    return stoch_rsi * 100

def calculate_mfi(df, window=14):
    typical_price = (df['high'] + df['low'] + df['close']) / 3
    money_flow = typical_price * df['volume']
    positive_flow = (money_flow.where(typical_price > typical_price.shift(1), 0)).rolling(window=window).sum()
    negative_flow = (money_flow.where(typical_price < typical_price.shift(1), 0)).rolling(window=window).sum()
    mfi = 100 - (100 / (1 + positive_flow / negative_flow))
    return mfi

def calculate_macd(series, short_window=12, long_window=26, signal_window=9):
    short_ema = series.ewm(span=short_window, adjust=False).mean()
    long_ema = series.ewm(span=long_window, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_window, adjust=False).mean()
    return macd, signal


def calculate_bollinger_bands(df, ma_column, window=20, num_std_dev=2, band_type='price'):
    df[f'{band_type}_MA'] = df[ma_column]
    df[f'{band_type}_STD'] = df[ma_column].rolling(window=window).std()
    df[f'{band_type}_Upper_Band'] = df[f'{band_type}_MA'] + (df[f'{band_type}_STD'] * num_std_dev)
    df[f'{band_type}_Lower_Band'] = df[f'{band_type}_MA'] - (df[f'{band_type}_STD'] * num_std_dev)
    return df

def scale_features_to_price(df, columns_to_scale, reference_column):
    scaled_columns = {}
    for col in columns_to_scale:
        scaler = MinMaxScaler(feature_range=(df[reference_column].min(), df[reference_column].max()))
        scaled_columns[f'scaled_{col}'] = scaler.fit_transform(df[[col]]).flatten()
    
    # Convert scaled_columns dictionary to a DataFrame
    scaled_df = pd.DataFrame(scaled_columns, index=df.index)
    
    # Concatenate the original DataFrame with the scaled DataFrame
    return pd.concat([df, scaled_df], axis=1)

def calculate_boll_upper_advanced(boll_upper_price, boll_lower_sent, boll_upper_sent, boll_lower_price):
    if boll_upper_price >= boll_lower_sent and boll_upper_sent >= boll_lower_price:
        # When the bands overlap, take the minimum of the upper bounds, but ensure it's above the lower bound
        return max(min(boll_upper_price, boll_upper_sent), boll_lower_price)
    else:
        # When there's no overlap, choose the upper band that is closer to the other band's lower boundary
        if abs(boll_upper_price - boll_lower_sent) < abs(boll_upper_sent - boll_lower_price):
            return max(boll_upper_price, boll_lower_price)
        else:
            return max(boll_upper_sent, boll_lower_price)

def calculate_boll_lower_advanced(boll_lower_price, boll_upper_sent, boll_lower_sent, boll_upper_price):
    if boll_lower_price <= boll_upper_sent and boll_lower_sent <= boll_upper_price:
        # When the bands overlap, take the maximum of the lower bounds, but ensure it's below the upper bound
        return min(max(boll_lower_price, boll_lower_sent), boll_upper_price)
    else:
        # When there's no overlap, choose the lower band that is closer to the other band's upper boundary
        if abs(boll_lower_price - boll_upper_sent) < abs(boll_lower_sent - boll_upper_price):
            return min(boll_lower_price, boll_upper_price)
        else:
            return min(boll_lower_sent, boll_upper_price)
        
def normalize_column(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    return ((df[column] - min_val) / (max_val - min_val)) * 100
        
# Function to find divergence
def find_MACD_price_divergence(df):
    divergence = ['None']  # Start with 'None' for the first row since no comparison can be made
    for i in range(1, len(df)):
        if df['close'].iloc[i] < df['close'].iloc[i-1] and df['MACD'].iloc[i] > df['MACD'].iloc[i-1]:
            divergence.append('Bullish MACD Price Divergence')
        elif df['close'].iloc[i] > df['close'].iloc[i-1] and df['MACD'].iloc[i] < df['MACD'].iloc[i-1]:
            divergence.append('Bearish MACD Price Divergence')
        else:
            divergence.append('None')
    return divergence

# Function to find divergence
def find_MACD_sentiment_divergence(df):
    divergence = ['None']  # Start with 'None' for the first row since no comparison can be made
    for i in range(1, len(df)):
        if df['close'].iloc[i] < df['close'].iloc[i-1] and df['Sentiment_MACD'].iloc[i] > df['Sentiment_MACD'].iloc[i-1]:
            divergence.append('Bullish MACD Sentiment Divergence')
        elif df['close'].iloc[i] > df['close'].iloc[i-1] and df['Sentiment_MACD'].iloc[i] < df['Sentiment_MACD'].iloc[i-1]:
            divergence.append('Bearish MACD Sentiment Divergence')
        else:
            divergence.append('None')
    return divergence


# Function to find divergence
def find_RSI_price_divergence(df):
    divergence = ['None']  # Start with 'None' for the first row since no comparison can be made
    for i in range(1, len(df)):
        if df['close'].iloc[i] < df['close'].iloc[i-1] and df['RSI'].iloc[i] > df['RSI'].iloc[i-1]:
            divergence.append('Bullish RSI Price Divergence')
        elif df['close'].iloc[i] > df['close'].iloc[i-1] and df['RSI'].iloc[i] < df['RSI'].iloc[i-1]:
            divergence.append('Bearish RSI Price Divergence')
        else:
            divergence.append('None')
    return divergence

# Function to find divergence
def find_RSI_sentiment_divergence(df):
    divergence = ['None']  # Start with 'None' for the first row since no comparison can be made
    for i in range(1, len(df)):
        if df['close'].iloc[i] < df['close'].iloc[i-1] and df['Sentiment_RSI'].iloc[i] > df['Sentiment_RSI'].iloc[i-1]:
            divergence.append('Bullish RSI Sentiment Divergence')
        elif df['close'].iloc[i] > df['close'].iloc[i-1] and df['Sentiment_RSI'].iloc[i] < df['Sentiment_RSI'].iloc[i-1]:
            divergence.append('Bearish RSI Sentiment Divergence')
        else:
            divergence.append('None')
    return divergence

        
        
        



2. Data Transform

In [2]:

import pandas as pd
import numpy as np

# Define dictionaries to store different results
moving_averages_dict = {}
scaled_features_dict = {}
combined_data_dict = {}

# Iterate over each term in the group
for grp_term in grp_terms:
    print(f"Processing term: {grp_term}")

    # Step 1: Fetch price data for the extended range
    extended_price_df = fetch_price_data(extended_start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'), grp_term)
    
    if extended_price_df is None or extended_price_df.empty:
        print(f"No price data found for term: {grp_term}")
        continue
    # Also Step 1: Fetch signals data for the extended range    
    signals_df = fetch_signals(start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'), grp_term)

    # Step 2: Set 'date' as the index and ensure it's unique
    extended_price_df['date'] = pd.to_datetime(extended_price_df['date'])
    extended_price_df.set_index('date', inplace=True)
    extended_price_df = extended_price_df.loc[~extended_price_df.index.duplicated(keep='first')]

    # Step 3: Fetch moving averages data
    ma_db_df = fetch_moving_averages(start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'), grp_term)

    if ma_db_df is not None and not ma_db_df.empty:
        ma_db_df['date'] = pd.to_datetime(ma_db_df['date'])
        ma_db_df.set_index('date', inplace=True)

        # Drop the 'term' column from extended_price_df to avoid duplication during the join
        extended_price_df.drop(columns=['term'], errors='ignore', inplace=True)

        # Join moving average data to extended price data
        combined_data_df = ma_db_df.join(extended_price_df, how='left')

        # DATE RELATED ERROR CORRECTING
        # Filter out rows with dates in the future
        current_date = pd.Timestamp.now().normalize()
        # combined_data_df = combined_data_df[combined_data_df.index <= current_date]
        # Ensure the index is a proper `DatetimeIndex` and sorted
        combined_data_df.index = pd.to_datetime(combined_data_df.index)
        combined_data_df = combined_data_df.sort_index()    

    
        # Step 4: Calculate Technical Indicators
        combined_data_df['RSI'] = calculate_rsi(combined_data_df['close'])
        combined_data_df['Stochastic_RSI'] = calculate_stochastic_rsi(combined_data_df, 'RSI')
        combined_data_df['MFI'] = calculate_mfi(combined_data_df)
        combined_data_df['MACD'], combined_data_df['MACD_Signal'] = calculate_macd(combined_data_df['close'])
        
        # Step 4.1: Calculate Sentiment Technical Indicators
        # Calculate RSI for sentiment data using 'daily_avg_combined_compound'
        combined_data_df['Sentiment_RSI'] = calculate_rsi(combined_data_df['combined_compound'])
        combined_data_df['Sentiment_Stochastic_RSI'] = calculate_stochastic_rsi(combined_data_df, 'Sentiment_RSI')
        combined_data_df['Sentiment_MACD'], combined_data_df['Sentiment_MACD_Signal'] = calculate_macd(combined_data_df['combined_compound'])
        
        # Step 4.2-4.6: Calculate Sentiment Technical Indicators supporting features
        # Adding stdev for RSI and Sentiment_RSI
        #Step 4.21: Calculate the difference between RSI and Sentiment_RSI
        combined_data_df['RSI_Difference'] = combined_data_df['RSI'] - combined_data_df['Sentiment_RSI']

        # Step 4.22: Calculate the rolling standard deviation of this difference
        # You can specify the window size (e.g., 14 days) for the rolling standard deviation
        combined_data_df['RSI_Sentiment_STD'] = combined_data_df['RSI_Difference'].rolling(window=14).std().abs()
        # Now, combined_data_df['RSI_Sentiment_STD'] contains the standard deviation between RSI and Sentiment_RSI
        
        # Step 4.23: Calculate the rolling mean of the RSI_Difference
        combined_data_df['RSI_Difference_Mean'] = combined_data_df['RSI_Difference'].rolling(window=14).mean()

        # Step 4.24: Calculate the number of standard deviations from the mean
        combined_data_df['RSI_Difference_STD_Deviation'] = (
            (combined_data_df['RSI_Difference'] - combined_data_df['RSI_Difference_Mean']) /
            combined_data_df['RSI_Sentiment_STD']
        )
        # Now, combined_data_df['RSI_Difference_STD_Deviation'] contains the number of standard deviations from the mean for each day
        
        # Step 4.24: Identify if RSI_Difference_STD_Deviation is greater than 2
        combined_data_df['RSI_STD_above_2'] = abs(combined_data_df['RSI_Difference_STD_Deviation']) > 2

        # Step 4.25: execute the divergence function
        combined_data_df['RSI_Price_Divergence'] = find_RSI_price_divergence(combined_data_df)
        combined_data_df['RSI_Sentiment_Divergence'] = find_RSI_sentiment_divergence(combined_data_df)
        
        # Step 4.26: count the divergence recorded in the divergence function
        # Calculate consecutive counts for RSI Price divergence, similar to what was done with MACD
        combined_data_df['Consecutive_Count_RSI_Price_Divergence'] = (
            combined_data_df['RSI_Price_Divergence']
            .apply(lambda x: x if x != 'None' else None)
            .groupby((combined_data_df['RSI_Price_Divergence'] != combined_data_df['RSI_Price_Divergence'].shift()).cumsum())
            .cumcount()
            .where(combined_data_df['RSI_Price_Divergence'] != 'None', 0)
        )
        
        # Step 4.26A: Calculate consecutive counts for RSI Sentiment divergence, similar to what is done with MACD
        combined_data_df['Consecutive_Count_RSI_Sentiment_Divergence'] = (
            combined_data_df['RSI_Sentiment_Divergence']
            .apply(lambda x: x if x != 'None' else None)
            .groupby((combined_data_df['RSI_Sentiment_Divergence'] != combined_data_df['RSI_Sentiment_Divergence'].shift()).cumsum())
            .cumcount()
            .where(combined_data_df['RSI_Sentiment_Divergence'] != 'None', 0)
        )
        
        # Step 4.27 calculate RSI_Trend_Reversal variable
        combined_data_df['RSI_Overbought'] = (combined_data_df['RSI'] > 70) & (combined_data_df['Sentiment_RSI'] > 70)
        combined_data_df['RSI_Oversold'] = (combined_data_df['RSI'] < 30) & (combined_data_df['Sentiment_RSI'] < 30)

        # Create conditions for divergence
        combined_data_df['Bearish_Divergence'] = (combined_data_df['RSI_Price_Divergence'] == 'Bearish RSI Price Divergence') & (combined_data_df['RSI_Sentiment_Divergence'] == 'Bearish RSI Sentiment Divergence')
        combined_data_df['Bullish_Divergence'] = (combined_data_df['RSI_Price_Divergence'] == 'Bullish RSI Price Divergence') & (combined_data_df['RSI_Sentiment_Divergence'] == 'Bullish RSI Sentiment Divergence')

        # Create a new column for RSI Trend Reversal based on overbought/oversold levels and divergence
        combined_data_df['RSI_Trend_Reversal'] = np.where(
            (combined_data_df['RSI_Overbought'] & combined_data_df['Bearish_Divergence']),
            'Likely Downward Reversal',  # Bearish reversal when both RSI and Sentiment_RSI are overbought and bearish divergence occurs
            np.where(
                (combined_data_df['RSI_Oversold'] & combined_data_df['Bullish_Divergence']),
                'Likely Upward Reversal',  # Bullish reversal when both RSI and Sentiment_RSI are oversold and bullish divergence occurs
                'No Reversal'  # Default value when no reversal condition is met
            )
        )
        
        # Step 4.3: Calculate the rolling mean of the stoch RSI_Difference
        # Apply smoothing to the Sentiment_Stochastic_RSI using a moving average or EMA; Here we use a 2-period EMA for smoothing
        smoothing_window = 2
        combined_data_df['Smoothed_Sentiment_Stochastic_RSI'] = combined_data_df['Sentiment_Stochastic_RSI'].ewm(span=smoothing_window, adjust=False).mean()

        # Adding stdev for RSI and Sentiment_RSI
        # Step 4.31: Calculate the difference between Stochastic_RSI and Sentiment_Stochastic_RSI
        combined_data_df['Stoch_RSI_Difference'] = combined_data_df['Stochastic_RSI'] - combined_data_df['Sentiment_Stochastic_RSI']

        
        # Step 4.32: Calculate the rolling standard deviation of this difference
        # You can specify the window size (e.g., 14 days) for the rolling standard deviation
        combined_data_df['Stoch_RSI_Sentiment_STD'] = combined_data_df['Stoch_RSI_Difference'].rolling(window=14).std().abs()

        # Step 4.33: Calculate the rolling mean of the RSI_Difference
        combined_data_df['Stoch_RSI_Difference_Mean'] = combined_data_df['Stoch_RSI_Difference'].rolling(window=14).mean()

        # Step 4.34: Calculate the number of standard deviations from the mean
        combined_data_df['Stoch_RSI_Difference_STD_Deviation'] = (
            (combined_data_df['Stoch_RSI_Difference'] - combined_data_df['Stoch_RSI_Difference_Mean']) /
            combined_data_df['Stoch_RSI_Sentiment_STD']
        )
        
        # Step 4.35: Identify if Stoch_RSI_Difference_STD_Deviation is greater than 2
        combined_data_df['Stoch_RSI_STD_above_2'] = abs(combined_data_df['Stoch_RSI_Difference_STD_Deviation']) > 2
        
        #  # Step 4.365: count how many periods stoch RSI is in extreme position
        # Initialize the counter columns
        combined_data_df['Stoch_RSI_Both_Extreme_Counter'] = 0

        # Iterate through the DataFrame to update the counter
        for i in range(1, len(combined_data_df)):
            if combined_data_df.iloc[i]['Stochastic_RSI'] > 80 and combined_data_df.iloc[i]['Sentiment_Stochastic_RSI'] > 80:
                if combined_data_df.iloc[i-1]['Stoch_RSI_Both_Extreme_Counter'] > 0:  # Continuation of a positive streak
                    combined_data_df.iloc[i, combined_data_df.columns.get_loc('Stoch_RSI_Both_Extreme_Counter')] = combined_data_df.iloc[i-1]['Stoch_RSI_Both_Extreme_Counter'] + 1
                else:  # Start of a new positive streak
                    combined_data_df.iloc[i, combined_data_df.columns.get_loc('Stoch_RSI_Both_Extreme_Counter')] = 1
            elif combined_data_df.iloc[i]['Stochastic_RSI'] < 20 and combined_data_df.iloc[i]['Sentiment_Stochastic_RSI'] < 20:
                if combined_data_df.iloc[i-1]['Stoch_RSI_Both_Extreme_Counter'] < 0:  # Continuation of a negative streak
                    combined_data_df.iloc[i, combined_data_df.columns.get_loc('Stoch_RSI_Both_Extreme_Counter')] = combined_data_df.iloc[i-1]['Stoch_RSI_Both_Extreme_Counter'] - 1
                else:  # Start of a new negative streak
                    combined_data_df.iloc[i, combined_data_df.columns.get_loc('Stoch_RSI_Both_Extreme_Counter')] = -1
            else:
                combined_data_df.iloc[i, combined_data_df.columns.get_loc('Stoch_RSI_Both_Extreme_Counter')] = 0  # Reset the counter if the condition is not met or both are zero

            # Additional condition to reset counter if both values are exactly zero
            if combined_data_df.iloc[i]['Stochastic_RSI'] == 0 and combined_data_df.iloc[i]['Sentiment_Stochastic_RSI'] == 0:
                combined_data_df.iloc[i, combined_data_df.columns.get_loc('Stoch_RSI_Both_Extreme_Counter')] = 0
        
        
        # Step 4.40 Calculate extra goodies for the MACD 
        # Scale Sentiment_MACD to the scale of MACD
        macd_scaler = MinMaxScaler(feature_range=(combined_data_df['MACD'].min(), combined_data_df['MACD'].max()))
        combined_data_df['scaled_Sentiment_MACD'] = macd_scaler.fit_transform(combined_data_df[['Sentiment_MACD']]).flatten()

        # Step 4.41 Scale Sentiment_MACD_Signal to the scale of MACD_Signal
        macd_signal_scaler = MinMaxScaler(feature_range=(combined_data_df['MACD_Signal'].min(), combined_data_df['MACD_Signal'].max()))
        combined_data_df['scaled_Sentiment_MACD_Signal'] = macd_signal_scaler.fit_transform(combined_data_df[['Sentiment_MACD_Signal']]).flatten()

        #Step 4.42 Calculate the MACD histogram for price data
        combined_data_df['MACD_Histogram'] = combined_data_df['MACD'] - combined_data_df['MACD_Signal']

        #Step 4.43 Calculate the MACD histogram for sentiment data
        combined_data_df['Sentiment_MACD_Histogram'] = combined_data_df['scaled_Sentiment_MACD'] - combined_data_df['scaled_Sentiment_MACD_Signal']

        #Step 4.44: Calculate the difference between MACD_Signal and Sentiment_MACD_Signal
        combined_data_df['MACD_Signal_Difference'] = combined_data_df['MACD_Signal'] - combined_data_df['scaled_Sentiment_MACD_Signal']

        #Step 4.45: Calculate the rolling standard deviation of this difference
        combined_data_df['MACD_Signal_Sentiment_STD'] = combined_data_df['MACD_Signal_Difference'].rolling(window=14).std().abs()

        #Step 4.46: Calculate the rolling mean of the MACD_Signal_Difference
        combined_data_df['MACD_Signal_Difference_Mean'] = combined_data_df['MACD_Signal_Difference'].rolling(window=14).mean()

        #Step 4.46: Calculate the number of standard deviations from the mean
        combined_data_df['MACD_Signal_Difference_STD_Deviation'] = (
            (combined_data_df['MACD_Signal_Difference'] - combined_data_df['MACD_Signal_Difference_Mean']) /
            combined_data_df['MACD_Signal_Sentiment_STD']
        )

        # Step 4.47: Identify if MACD_Signal_Difference_STD_Deviation is greater than 2
        macd_signal_condition_above_2 = abs(combined_data_df['MACD_Signal_Difference_STD_Deviation']) > 2

        # Step 4.48: Record the MACD_Signal_Difference_STD_Deviation directly to the DataFrame
        combined_data_df['MACD_Signal_trend_reversal'] = np.where(
            macd_signal_condition_above_2,
            combined_data_df['MACD_Signal_Difference_STD_Deviation'],
            0
        )

        # Step 4.49: Identify if there is a cross between Sentiment_MACD_Signal and MACD_Signal and record it
        # Capture positive and negative crosses for future analysis in Tableau
        combined_data_df['MACD_Signal_Cross'] = np.where(
            (combined_data_df['scaled_Sentiment_MACD_Signal'] > combined_data_df['MACD_Signal']) &
            (combined_data_df['scaled_Sentiment_MACD_Signal'].shift(1) <= combined_data_df['MACD_Signal'].shift(1)),
            1,  # Bullish cross
            np.where(
                (combined_data_df['scaled_Sentiment_MACD_Signal'] < combined_data_df['MACD_Signal']) &
                (combined_data_df['scaled_Sentiment_MACD_Signal'].shift(1) >= combined_data_df['MACD_Signal'].shift(1)),
                -1,  # Bearish cross
                0  # No cross
            )
        )

        # Step 4.491: Create a new column to capture the significance of the cross using both deviation and the MACD difference
        combined_data_df['MACD_Cross_Significance'] = np.where(
            combined_data_df['MACD_Signal_Cross'] != 0,
            combined_data_df['MACD_Signal_trend_reversal'] * combined_data_df['MACD_Signal_Cross'],
            0
        )

        # Step 4.492: Determine the direction of the Sentiment MACD Signal
        combined_data_df['Sentiment_MACD_Signal_Direction'] = np.where(
            combined_data_df['scaled_Sentiment_MACD_Signal'] > combined_data_df['scaled_Sentiment_MACD_Signal'].shift(1),
            1,  # Upward direction
            np.where(
                combined_data_df['scaled_Sentiment_MACD_Signal'] < combined_data_df['scaled_Sentiment_MACD_Signal'].shift(1),
                -1,  # Downward direction
                0  # No change
            )
        )
        

        #Step 4.493 Find MACD Price divergence and store it in the DataFrame
        combined_data_df['MACD_Price_Divergence'] = find_MACD_price_divergence(combined_data_df)
        combined_data_df['Consecutive_Count_MACD_Price_Divergence'] = combined_data_df['MACD_Price_Divergence'].apply(lambda x: x if x != 'None' else None).groupby((combined_data_df['MACD_Price_Divergence'] != combined_data_df['MACD_Price_Divergence'].shift()).cumsum()).cumcount().where(combined_data_df['MACD_Price_Divergence'] != 'None', 0)
 
        #Step 4.494 Find MACD sentiment divergence and store it in the DataFrame
        combined_data_df['MACD_Sentiment_Divergence'] = find_MACD_sentiment_divergence(combined_data_df)
        combined_data_df['Consecutive_Count_MACD_Sentiment_Divergence'] = combined_data_df['MACD_Sentiment_Divergence'].apply(lambda x: x if x != 'None' else None).groupby((combined_data_df['MACD_Sentiment_Divergence'] != combined_data_df['MACD_Sentiment_Divergence'].shift()).cumsum()).cumcount().where(combined_data_df['MACD_Sentiment_Divergence'] != 'None', 0)
 
        # Step 7: Scale selected sentiment features to match the scale of the 'close' price
        columns_to_scale = [
            'combined_compound_ma_7', 'combined_compound_ma_21', 'combined_compound_ma_50',
            'combined_compound_ma_100', 'combined_compound_ma_200','combined_compound'
        ]
        combined_data_df = scale_features_to_price(combined_data_df, columns_to_scale, 'close')
        
        # Step 5: Generate previous day and trend indicators
        ma_columns = [col for col in combined_data_df.columns if 'combined_compound' in col or 'close_ma' in col or 'scaled_combined_compound' in col]

        for ma_column in ma_columns:
            combined_data_df[f'prev_{ma_column}'] = combined_data_df[ma_column].shift(1)
            combined_data_df[f'{ma_column}_trend'] = (combined_data_df[ma_column] > combined_data_df[f'prev_{ma_column}']).astype(int)
            combined_data_df[f'{ma_column}_pct_change'] = combined_data_df[ma_column].pct_change() * 100
            combined_data_df[f'{ma_column}_direction_change_flag'] = combined_data_df[f'{ma_column}_trend'].diff().apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))

        # Step 6: Volume Moving Averages
        ma_vol_df = combined_data_df[['volume']].copy()
        for ma in [7, 20]:
            ma_vol_df[f'{ma}_day_MA_Volume'] = ma_vol_df['volume'].rolling(window=ma, min_periods=1).mean()

        # Join Volume moving averages
        combined_data_df = combined_data_df.join(ma_vol_df, how='left', rsuffix='_vol')

        # Fill NaN values for moving average columns
        for column in combined_data_df.columns:
            if '_MA' in column:
                combined_data_df[column].fillna(method='ffill', inplace=True)
                combined_data_df[column].fillna(method='bfill', inplace=True)
           
        # fill Close for Bollinger band display only
        combined_data_df['close_fill'] = combined_data_df['close']        
        fill_columns = ['close_fill']

        for column in fill_columns:
                combined_data_df[column].fillna(method='ffill', inplace=True)
                combined_data_df[column].fillna(method='bfill', inplace=True)
        

        # Create High Volume Flags
        combined_data_df['High_Volume_7'] = (combined_data_df['volume'] > combined_data_df['7_day_MA_Volume']).astype(int)
        combined_data_df['High_Volume_20'] = (combined_data_df['volume'] > combined_data_df['20_day_MA_Volume']).astype(int)

        # Apply forward fill and backfill for numerical values
        # combined_data_df.fillna(method='ffill', inplace=True)
        #combined_data_df.fillna(method='bfill', inplace=True)



        # Step 8: Add trend columns for scaled values
        combined_data_df['3_day_avg_combined_compound_ma_7'] = combined_data_df['combined_compound_ma_7'].rolling(window=3).mean()
        combined_data_df['3_day_avg_combined_compound_ma_7_prev'] = combined_data_df['3_day_avg_combined_compound_ma_7'].shift(1)
        combined_data_df['3_day_avg_combined_compound_ma_7_trend'] = (combined_data_df['3_day_avg_combined_compound_ma_7'] > combined_data_df['3_day_avg_combined_compound_ma_7_prev']).astype(int)

        # Step 9: Calculate differences between scaled values and close moving averages
        if 'close_ma_7' in combined_data_df.columns and 'scaled_combined_compound_ma_7' in combined_data_df.columns:
            combined_data_df['ma_7_diff'] = combined_data_df['close_ma_7'] - combined_data_df['scaled_combined_compound_ma_7']
            mean_difference = combined_data_df['ma_7_diff'].mean()
            std_difference = combined_data_df['ma_7_diff'].std()
            combined_data_df['ma_7_diff_std'] = (combined_data_df['ma_7_diff'] - mean_difference) / std_difference
        else:
            print(f"Required columns not found for {grp_term}. Skipping difference calculations.")
            
            
        # Use the extended data for Bollinger Band calculation
        # Step 2: Calculate the standard deviation for sentiment using 'combined_compound_ma_7' over the extended data range
        combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()

        # Step 3: Calculate the sentiment Upper and Lower Bollinger Bands using the extended data
        sentiment_ma_column = 'combined_compound_ma_7'
        num_std_dev = 2
        combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
        combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)

        # Step 4: Ensure sentiment_Upper_Band is always above sentiment_Lower_Band
        mask = combined_data_df['sentiment_Upper_Band'] < combined_data_df['sentiment_Lower_Band']
        combined_data_df.loc[mask, ['sentiment_Upper_Band', 'sentiment_Lower_Band']] = combined_data_df.loc[mask, ['sentiment_Lower_Band', 'sentiment_Upper_Band']].values

        # Step 5: Calculate the price Bollinger Bands using the close price moving average ('close_ma_21') over the extended data range
        price_ma_column = 'close_ma_21'
        combined_data_df['price_STD'] = combined_data_df[price_ma_column].rolling(window=20).std()

        # Calculate Upper and Lower Bollinger Bands for the price
        combined_data_df['price_Upper_Band'] = combined_data_df[price_ma_column] + (combined_data_df['price_STD'] * num_std_dev)
        combined_data_df['price_Lower_Band'] = combined_data_df[price_ma_column] - (combined_data_df['price_STD'] * num_std_dev)

        # Step 6: Calculate sentiment divergence based on aligned data
        # This is done on the extended data, and later we'll trim the result
        sentiment_divergence = combined_data_df[['sentiment_Upper_Band', 'sentiment_Lower_Band']].sub(
            combined_data_df[['price_Upper_Band', 'price_Lower_Band']].values
        )
        sentiment_divergence['divergence'] = sentiment_divergence.abs().sum(axis=1)

        # Add sentiment divergence to the DataFrame
        combined_data_df['sentiment_divergence'] = sentiment_divergence['divergence']

        # Step 7: Calculate the adjusted overlap upper and lower bands
        combined_data_df['boll_upper_overlap_band'] = combined_data_df.apply(
            lambda row: calculate_boll_upper_advanced(
                row['price_Upper_Band'], 
                row['sentiment_Lower_Band'], 
                row['sentiment_Upper_Band'], 
                row['price_Lower_Band']
            ), axis=1
        )

        combined_data_df['boll_lower_overlap_band'] = combined_data_df.apply(
            lambda row: calculate_boll_lower_advanced(
                row['price_Lower_Band'], 
                row['sentiment_Upper_Band'], 
                row['sentiment_Lower_Band'], 
                row['price_Upper_Band']
            ), axis=1
        )
        
        # Crossovers
        # Calculate the combined normalized scores for all moving averages
        for ma in [7, 21, 50, 100, 200]:
            sentiment_ma_col = f'combined_compound_ma_{ma}'
            price_ma_col = f'close_ma_{ma}'

            combined_data_df[sentiment_ma_col].fillna(method='ffill', inplace=True)
            combined_data_df[price_ma_col].fillna(method='ffill', inplace=True)

            normalized_sentiment_col = f'normalized_sentiment_{ma}'
            normalized_price_col = f'normalized_price_{ma}'

            combined_data_df[normalized_sentiment_col] = normalize_column(combined_data_df, sentiment_ma_col)
            combined_data_df[normalized_price_col] = normalize_column(combined_data_df, price_ma_col)

            # Calculate combined normalized score using weights
            combined_data_df[f'combined_normalized_score_{ma}'] = (
                combined_data_df[normalized_sentiment_col] * 0.8 +
                combined_data_df[normalized_price_col] * 0.2
            )
        # Iterate over each moving average period and calculate crossovers dynamically
        for ma in [7, 21, 50, 100, 200]:
            # Dynamic column names
            normalized_sentiment_col = f'normalized_sentiment_{ma}'
            normalized_price_col = f'normalized_price_{ma}'
            crossover_column = f'crossover_{ma}'
            crossover_type_column = f'crossover_type_{ma}'

            # Calculate the crossover points for each moving average using normalized columns and store the 'Close' value
            combined_data_df[crossover_column] = np.where(
                (combined_data_df[normalized_sentiment_col] > combined_data_df[normalized_price_col]) & 
                (combined_data_df[normalized_sentiment_col].shift(1) <= combined_data_df[normalized_price_col].shift(1)) |
                (combined_data_df[normalized_sentiment_col] < combined_data_df[normalized_price_col]) & 
                (combined_data_df[normalized_sentiment_col].shift(1) >= combined_data_df[normalized_price_col].shift(1)), 
                combined_data_df['close'], 
                np.nan
            )

            # Define the crossover type (up or down)
            combined_data_df[crossover_type_column] = np.where(
                (combined_data_df[normalized_sentiment_col] > combined_data_df[normalized_price_col]) & 
                (combined_data_df[normalized_sentiment_col].shift(1) <= combined_data_df[normalized_price_col].shift(1)),
                'cross_up',
                np.where(
                    (combined_data_df[normalized_sentiment_col] < combined_data_df[normalized_price_col]) & 
                    (combined_data_df[normalized_sentiment_col].shift(1) >= combined_data_df[normalized_price_col].shift(1)),
                    'cross_down',
                    np.nan
                )
            )
            
        combined_data_df['prev_close_up_down'] = combined_data_df['close'].diff().apply(lambda x: 1 if x > 0 else 0)

        # Step 8: Trim the DataFrame to fit the original date window (start_date to end_date)
        combined_data_df = combined_data_df.loc[start_date:end_date]

        # Step 11: Remove duplicate columns if they exist
        combined_data_df = combined_data_df.loc[:, ~combined_data_df.columns.duplicated()]

        # Step 12: Store the final DataFrame in the dictionary
        moving_averages_dict[grp_term] = combined_data_df.copy()

# Step 13: Concatenate all DataFrames from the dictionary after processing each term
final_combined_data_df = pd.concat(moving_averages_dict.values(), axis=0)

# Optionally, reset index if you need to work with the 'date' column directly
final_combined_data_df.reset_index(inplace=True)

final_combined_data_df['day_of_week'] = final_combined_data_df['date'].dt.dayofweek
final_combined_data_df['month'] = final_combined_data_df['date'].dt.month
final_combined_data_df['day_of_month'] = final_combined_data_df['date'].dt.day
final_combined_data_df['is_weekend'] = final_combined_data_df['date'].dt.dayofweek >= 5




# Display the final DataFrame or inspect it as needed
print("Final Combined DataFrame:")
display(final_combined_data_df.tail())


Processing term: SOL


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: KAS


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: LINK


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: ADA


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: MATIC


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: AVAX


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: POPCAT


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: SUI


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: HNT


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: WIF


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: BTC


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: DOGE


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: ETH


  combined_data_df['sentiment_STD'] = combined_data_df['combined_compound_ma_7'].rolling(window=20).std()
  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: GME


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: NVDA


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: JPM


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: GOOGL


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: DXY
Processing term: TSMC
Processing term: CVX


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: COIN


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: AMZN


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: MSFT


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: NFLX


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: DIS


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: AAPL


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Processing term: TSLA


  combined_data_df['sentiment_Upper_Band'] = combined_data_df[sentiment_ma_column] + (combined_data_df['sentiment_STD'] * num_std_dev)
  combined_data_df['sentiment_Lower_Band'] = combined_data_df[sentiment_ma_column] - (combined_data_df['sentiment_STD'] * num_std_dev)


Final Combined DataFrame:


  final_combined_data_df.reset_index(inplace=True)
  final_combined_data_df['day_of_week'] = final_combined_data_df['date'].dt.dayofweek
  final_combined_data_df['month'] = final_combined_data_df['date'].dt.month
  final_combined_data_df['day_of_month'] = final_combined_data_df['date'].dt.day
  final_combined_data_df['is_weekend'] = final_combined_data_df['date'].dt.dayofweek >= 5


Unnamed: 0,date,term,combined_compound_ma_7,combined_compound_ma_21,combined_compound_ma_50,combined_compound_ma_100,combined_compound_ma_200,combined_compound,open,high,...,crossover_type_50,crossover_100,crossover_type_100,crossover_200,crossover_type_200,prev_close_up_down,day_of_week,month,day_of_month,is_weekend
5301,2025-06-03,TSLA,0.161703,0.131179,0.103489,0.076439,0.054156,0.170356,346.595,355.4,...,,,,,,1,1,6,3,False
5302,2025-06-04,TSLA,0.143425,0.127307,0.102905,0.07668,0.054499,0.088593,345.095,345.6,...,,,,,,0,2,6,4,False
5303,2025-06-05,TSLA,0.108588,0.116105,0.09903,0.075242,0.053997,0.004078,322.49,324.5499,...,,,,,,0,3,6,5,False
5304,2025-06-06,TSLA,0.078489,0.104476,0.094683,0.073518,0.053342,-0.01181,298.83,305.5,...,,,,,,1,4,6,6,False
5305,2025-06-07,TSLA,0.06167,0.095998,0.09141,0.072285,0.052923,0.011215,,,...,,,,,,0,5,6,7,True


In [3]:
# ✅ PostgreSQL connection parameters
db_params = {
    'dbname': 'twt_snt',
    'user': 'postgres',
    'password': 'Ilpmnl!69gg',
    'host': 'localhost',
    'port': '5432'
}

#_______snt_ma_blend_detail_tbl___________________

# ✅ Enhanced SQL query with multiple aggregations
query = """
SELECT 
    term,
    AVG(twitter_weight) AS avg_twitter_weight,
    MIN(twitter_weight) AS min_twitter_weight,
    MAX(twitter_weight) AS max_twitter_weight,
    STDDEV(twitter_weight) AS std_twitter_weight,

    AVG(reddit_weight) AS avg_reddit_weight,
    MIN(reddit_weight) AS min_reddit_weight,
    MAX(reddit_weight) AS max_reddit_weight,
    STDDEV(reddit_weight) AS std_reddit_weight,

    AVG(news_weight) AS avg_news_weight,
    MIN(news_weight) AS min_news_weight,
    MAX(news_weight) AS max_news_weight,
    STDDEV(news_weight) AS std_news_weight,

    SUM(correlation * (twitter_weight + reddit_weight + news_weight)) / NULLIF(SUM(twitter_weight + reddit_weight + news_weight), 0) AS weighted_correlation,
    MIN(correlation) AS min_correlation,
    MAX(correlation) AS max_correlation,
    STDDEV(correlation) AS std_correlation,

    AVG(p_value) AS avg_p_value,
    MIN(p_value) AS min_p_value,
    MAX(p_value) AS max_p_value,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY p_value) AS median_p_value,

    AVG(confidence_interval_lower) AS avg_ci_lower,
    MIN(confidence_interval_lower) AS min_ci_lower,
    MAX(confidence_interval_lower) AS max_ci_lower,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY confidence_interval_lower) AS median_ci_lower,

    AVG(confidence_interval_upper) AS avg_ci_upper,
    MIN(confidence_interval_upper) AS min_ci_upper,
    MAX(confidence_interval_upper) AS max_ci_upper,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY confidence_interval_upper) AS median_ci_upper,

    MAX(run_date) AS latest_run_date
FROM 
    snt_ma_blend_detail_tbl
GROUP BY 
    term
"""

# ✅ Connect to the database and fetch data
conn = psycopg2.connect(**db_params)
df_blend_data = pd.read_sql_query(query, conn)
conn.close()

print("✅ Fetched and aggregated data successfully!")



# ✅ Merge aggregated blend data into final_combined_data_df
final_combined_data_df = pd.merge(final_combined_data_df, df_blend_data, on="term", how="left")

print("✅ Successfully merged aggregated blend data!")


#_______signal_cnt_tbl___________________

# ✅ SQL query to fetch signal count data
query_signals = """
SELECT 
    term,
    date,
    buy,
    sell,
    neutral
FROM 
    signal_cnt_tbl
"""

# ✅ Connect to the database and fetch signal data
conn = psycopg2.connect(**db_params)
df_signal_data = pd.read_sql_query(query_signals, conn)
conn.close()

print("✅ Fetched signal count data successfully!")

# ✅ Convert 'date' in final_combined_data_df to datetime
final_combined_data_df["date"] = pd.to_datetime(final_combined_data_df["date"])

# ✅ Convert 'date' in df_signal_data to datetime
df_signal_data["date"] = pd.to_datetime(df_signal_data["date"])

# ✅ Merge signal count data into final_combined_data_df on 'date' and 'term'
final_combined_data_df = pd.merge(
    final_combined_data_df, 
    df_signal_data, 
    on=["date", "term"], 
    how="left"
)

print("✅ Successfully merged signal count data with final_combined_data_df!")







  df_blend_data = pd.read_sql_query(query, conn)
  df_signal_data = pd.read_sql_query(query_signals, conn)


✅ Fetched and aggregated data successfully!
✅ Successfully merged aggregated blend data!
✅ Fetched signal count data successfully!
✅ Successfully merged signal count data with final_combined_data_df!


In [4]:
# ==============================
# ✅ Required Imports
# ==============================
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import numpy as np
import psycopg2

# ==============================
# ✅ Database Connection Setup
# ==============================
# Use SQLAlchemy for cleaner connection management
DB_URI = "postgresql+psycopg2://postgres:Ilpmnl!69gg@localhost:5432/twt_snt"
engine = create_engine(DB_URI)

print("✅ Successfully connected to the database using SQLAlchemy!")

# ==============================
# ✅ Fetch Data from Both Tables
# ==============================

# ✅ SQL query to fetch detail correlation data
query_detail = """
SELECT 
    date,
    term,
    ma,
    lag,
    correlation,
    window_length,
    run_date
FROM 
    snt_ma_correlation_detail_tbl
"""

# ✅ SQL query to fetch summary correlation data
query_summary = """
SELECT 
    date,
    term,
    ma,
    best_lag,
    max_correlation,
    leading_indicator_score,
    window_length,
    run_date
FROM 
    snt_ma_correlation_summary_tbl
"""

# ✅ Fetch data using SQLAlchemy
df_detail_data = pd.read_sql_query(query_detail, engine)
df_summary_data = pd.read_sql_query(query_summary, engine)

print("✅ Fetched detail and summary correlation data successfully!")

# ==============================
# ✅ Data Preprocessing
# ==============================

# ✅ Convert 'date' columns to datetime
df_detail_data["date"] = pd.to_datetime(df_detail_data["date"])
df_summary_data["date"] = pd.to_datetime(df_summary_data["date"])

# ✅ Check for NaN and drop missing 'ma' if necessary
df_detail_data.dropna(subset=["ma"], inplace=True)
df_summary_data.dropna(subset=["ma"], inplace=True)

# ==============================
# ✅ Merge Detail and Summary Data
# ==============================

# ✅ Merge detail and summary data on 'date', 'term', and 'ma'
merged_df = pd.merge(
    df_detail_data,
    df_summary_data,
    on=["date", "term", "ma"],
    how="left",
    suffixes=("_detail", "_summary")
)

print("✅ Successfully merged correlation detail and summary data!")

# ==============================
# ✅ Create Meta-Features
# ==============================

# ✅ Lag Difference: Compare the best lag vs. lags in the detail data.
merged_df["lag_diff"] = merged_df["lag"] - merged_df["best_lag"]

# ✅ Correlation Ratio: Ratio between current lag correlation and best correlation.   
merged_df["correlation_ratio"] = merged_df["correlation"] / (merged_df["max_correlation"] + 1e-6)

# ✅ Correlation Difference: Difference between the current lag correlation and best correlation.    
merged_df["correlation_diff"] = merged_df["correlation"] - merged_df["max_correlation"]

# ✅ Relative Correlation to Window Length: How correlation changes as window length increases.
merged_df["correlation_per_window"] = merged_df["correlation"] / merged_df["window_length_detail"]

# ✅ Lag Abs Diff as Indicator: Magnitude of lag difference as a proxy for how far sentiment leads or lags.
merged_df["abs_lag_diff"] = merged_df["lag_diff"].abs()

print("✅ Meta-features created successfully!")

# ==============================
# ✅ Prepare Final Combined Data for Merge
# ==============================

# ✅ Melt/Unpivot final_combined_data_df to create 'ma' column
final_combined_data_df_melted = pd.melt(
    final_combined_data_df,
    id_vars=["date", "term"],
    value_vars=[
        "combined_compound_ma_7",
        "combined_compound_ma_21",
        "combined_compound_ma_50",
        "combined_compound_ma_100",
        "combined_compound_ma_200"
    ],
    var_name="ma",
    value_name="combined_compound"
)

# ✅ Extract only the MA number from 'ma' column
final_combined_data_df_melted["ma"] = final_combined_data_df_melted["ma"].str.replace(
    "combined_compound_ma_", "").astype(int)

print("✅ Successfully melted final_combined_data_df to prepare for merging!")

# ==============================
# ✅ Merge Meta-Features into Final Combined Data
# ==============================

# ✅ Merge melted data with meta-features
final_combined_data_df = pd.merge(
    final_combined_data_df_melted,
    merged_df,
    on=["date", "term", "ma"],
    how="left"
)

print("✅ Successfully merged meta-features with final_combined_data_df!")

# ==============================
# ✅ Pivot Back to Original Format (Optional if Needed)
# ==============================

# ✅ Pivot the data back if you want the original format after merging
final_combined_data_df_pivoted = final_combined_data_df.pivot_table(
    index=["date", "term"],
    columns="ma",
    values=[
        "combined_compound",
        "lag_diff",
        "correlation_ratio",
        "correlation_diff",
        "correlation_per_window",
        "abs_lag_diff"
    ]
)

# ✅ Flatten MultiIndex columns
final_combined_data_df_pivoted.columns = [f"{col[0]}_ma_{int(col[1])}" for col in final_combined_data_df_pivoted.columns]
final_combined_data_df_pivoted.reset_index(inplace=True)

print("✅ Successfully pivoted the data back to original format!")

# ==============================
# 🔍 Optional: Debug Merge Status
# ==============================

# ✅ Check for merge issues if necessary
debug_df = pd.merge(
    final_combined_data_df_melted,
    merged_df,
    on=["date", "term", "ma"],
    how="left",
    indicator=True
)
print(debug_df["_merge"].value_counts())

# ==============================
# 🎉 Final Outcome: Enriched DataFrame Ready for Model Training
# ==============================
print("✅ Final Combined DataFrame is ready with enriched meta-features!")


✅ Successfully connected to the database using SQLAlchemy!
✅ Fetched detail and summary correlation data successfully!
✅ Successfully merged correlation detail and summary data!
✅ Meta-features created successfully!
✅ Successfully melted final_combined_data_df to prepare for merging!


  final_combined_data_df_melted = pd.melt(


✅ Successfully merged meta-features with final_combined_data_df!
✅ Successfully pivoted the data back to original format!
both          44700
left_only     23550
right_only        0
Name: _merge, dtype: int64
✅ Final Combined DataFrame is ready with enriched meta-features!


In [5]:
import datetime
import os

from datetime import timedelta
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier


import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [6]:
print(final_combined_data_df.head())


        date term  ma  combined_compound  lag  correlation  \
0 2024-11-19  SOL   7           0.191733  NaN          NaN   
1 2024-11-20  SOL   7           0.226319  NaN          NaN   
2 2024-11-21  SOL   7           0.240264  NaN          NaN   
3 2024-11-22  SOL   7           0.243445  NaN          NaN   
4 2024-11-23  SOL   7           0.267342  NaN          NaN   

   window_length_detail run_date_detail  best_lag  max_correlation  \
0                   NaN             NaT       NaN              NaN   
1                   NaN             NaT       NaN              NaN   
2                   NaN             NaT       NaN              NaN   
3                   NaN             NaT       NaN              NaN   
4                   NaN             NaT       NaN              NaN   

   leading_indicator_score  window_length_summary run_date_summary  lag_diff  \
0                      NaN                    NaN              NaT       NaN   
1                      NaN                    

3. Predict

In [7]:

# ====================== PARAMETERS ======================

# Date range
end_date = datetime.datetime.today().strftime('%Y-%m-%d')
start_date = (datetime.datetime.today() - timedelta(days=200)).strftime('%Y-%m-%d')
extended_start_date = (pd.to_datetime(start_date) - pd.Timedelta(days=21)).strftime('%Y-%m-%d')

# Terms
grp_terms = ['SOL', 'KAS', 'LINK', 'ADA', 'MATIC', 'AVAX', 'POPCAT', 'SUI', 'HNT', 'WIF', 'BTC', 'DOGE', 'ETH', 
             'GME', 'NVDA', 'JPM', 'GOOGL', 'DXY', 'TSMC', 'CVX', 'COIN', 'AMZN', 'MSFT', 'NFLX', 'DIS', 'AAPL', 'TSLA']

# ============== PREPROCESS MOVING AVERAGE DICTIONARY ==============

if 'moving_averages_dict' not in globals() or not moving_averages_dict:
    raise ValueError("`moving_averages_dict` is not defined or empty.")

cleaned_dict = {}
for grp_term, df in moving_averages_dict.items():
    print(f"Processing: {grp_term}")

    df = df.reset_index()  # Make 'date' a column
    if 'prev_close_up_down' not in df.columns or 'date' not in df.columns:
        print(f"Skipping {grp_term} due to missing columns.")
        continue


    df = df.copy()
    df['next_close'] = df['prev_close_up_down'].shift(-1)
    df = df[:-1]
    df['term'] = grp_term
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['day_of_month'] = df['date'].dt.day
    df['is_weekend'] = df['day_of_week'] >= 5
    cleaned_dict[grp_term] = df

# Combine all terms
final_combined_data_df = pd.concat(cleaned_dict.values(), ignore_index=True)

# Drop rows with NaN targets
final_combined_data_df.dropna(subset=['next_close'], inplace=True)

# ======================== FEATURES ========================

target = 'next_close'
exclude_cols = ['date', 'prev_close_up_down', target, 'term']
feature_columns = [col for col in final_combined_data_df.columns if col not in exclude_cols]

X = final_combined_data_df[feature_columns]
y = final_combined_data_df[target]

# Separate numeric & categorical
numeric_columns = X.select_dtypes(include=['number']).columns.tolist()
categorical_columns = X.select_dtypes(exclude=['number']).columns.tolist()

# Clean numeric & categorical
for col in numeric_columns:
    X.loc[:, col] = pd.to_numeric(X[col], errors='coerce')
X[numeric_columns] = X[numeric_columns].replace([np.inf, -np.inf], np.nan)
X[numeric_columns] = X[numeric_columns].clip(lower=-1e10, upper=1e10)

for col in categorical_columns:
    X.loc[:, col] = X[col].astype(str).fillna("UNKNOWN")

# ===================== PREPROCESSOR =====================

transformers = [('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                        ('scaler', StandardScaler())]), numeric_columns)]

if categorical_columns:
    transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns))

preprocessor = ColumnTransformer(transformers)

# ===================== CLASSIFIERS =====================

gb_classifier = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42)

catboost_classifier = CatBoostClassifier(iterations=140, learning_rate=0.15, depth=4, random_state=42, verbose=0)

xgb_classifier = XGBClassifier(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=7,
    subsample=1.0,
    colsample_bytree=0.7,
    gamma=5,
    reg_alpha=0,
    reg_lambda=5,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

pipeline_gb = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', gb_classifier)])
pipeline_catboost = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', catboost_classifier)])
pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', xgb_classifier)])


# Logistic Regression with best hyperparameters
lr_classifier = LogisticRegression(
    C=5.623413251903491,
    class_weight=None,
    max_iter=130,
    penalty='l1',
    solver='liblinear',
    random_state=42
)

pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr_classifier)
])

# Random Forest Pipeline
rf_classifier = RandomForestClassifier(n_estimators=30, max_depth=34, min_samples_leaf=2, min_samples_split=25, random_state=42)
pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', rf_classifier)])

# Gradient Boosting (Optional with Tweaked Parameters)
gb_classifier_v2 = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4, random_state=42)
pipeline_gb_v2 = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', gb_classifier_v2)])

pipeline_svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(
        C=1.0,
        kernel='rbf',
        gamma='auto',
        probability=True,
        random_state=42
    ))
])

pipeline_lgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        colsample_bytree=0.6,
        learning_rate=0.09,
        max_depth=-1,
        n_estimators=220,
        num_leaves=24,
        reg_alpha=0.2,
        reg_lambda=0.1,
        subsample=0.60,
        random_state=42
    ))
])

pipeline_et = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ExtraTreesClassifier(
        n_estimators=120,
        max_depth=None,
        max_features=0.5,
        min_samples_split=4,
        min_samples_leaf=2,
        random_state=42
    ))
])

# ===================== TRAIN-TEST SPLIT =====================

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# First ensemble
ensemble_1 = VotingClassifier(
    estimators=[
        ('randomforest', pipeline_rf),
        ('svm', pipeline_svm),
        ('lightgbm', pipeline_lgbm),
        ('extratrees', pipeline_et)
    ],
    voting='soft'
)

# Second ensemble
ensemble_2 = VotingClassifier(
    estimators=[
        ('gradient_boosting', pipeline_gb),
        ('logistic_regression', pipeline_lr),
        ('xgboost', pipeline_xgb)
    ],
    voting='soft'
)
print("Training First-Level Ensembles...")
with tqdm(total=2) as pbar:
    ensemble_1.fit(X_train, y_train)
    pbar.update(1)
    ensemble_2.fit(X_train, y_train)
    pbar.update(1)
    


# Meta-Ensemble
meta_ensemble = VotingClassifier(estimators=[('ensemble_1', ensemble_1), ('ensemble_2', ensemble_2)], voting='soft')
print("Training Meta-Ensemble...")
meta_ensemble.fit(X_train, y_train)

# Evaluation
y_pred_meta = meta_ensemble.predict(X_test)
print(f'Meta-Ensemble Accuracy: {accuracy_score(y_test, y_pred_meta):.2f}')
print("\nClassification Report:\n", classification_report(y_test, y_pred_meta))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_meta))

# ===================== NEXT DAY PREDICTIONS =====================

print("\nPredictions for next day per term:")

prediction_csv = "predictions_log.csv"
prediction_log_df = pd.DataFrame(columns=['date', 'term', 'prediction', 'actual'])

terms = final_combined_data_df['term'].unique()

for term in terms:
    term_data = final_combined_data_df[final_combined_data_df['term'] == term]
    if term_data.empty:
        print(f"No data found for term '{term}', skipping.")
        continue

    latest_data = term_data.iloc[-1].copy()
    tomorrow_date = latest_data['date'] + datetime.timedelta(days=1)
    tomorrow_str = tomorrow_date.strftime('%Y-%m-%d')

    # Update date-based features
    latest_data['date'] = tomorrow_date
    latest_data['day_of_week'] = tomorrow_date.dayofweek
    latest_data['month'] = tomorrow_date.month
    latest_data['day_of_month'] = tomorrow_date.day
    latest_data['is_weekend'] = tomorrow_date.dayofweek >= 5

    # Prepare feature DataFrame
    tomorrow_df = pd.DataFrame([latest_data[feature_columns]])

    # Force numeric columns back to numeric safely
    for col in numeric_columns:
        tomorrow_df[col] = pd.to_numeric(tomorrow_df[col], errors='coerce')
    tomorrow_df[numeric_columns] = tomorrow_df[numeric_columns].replace([np.inf, -np.inf], np.nan)
    tomorrow_df[numeric_columns] = tomorrow_df[numeric_columns].fillna(tomorrow_df[numeric_columns].mean())

    # Fill categoricals
    for col in categorical_columns:
        tomorrow_df[col] = tomorrow_df[col].fillna("UNKNOWN").astype(str)

    # Predict & log
    try:
        probs = meta_ensemble.predict_proba(tomorrow_df)[0]
        prediction_value = int(probs[1] > 0.5)  # still binarizing to 0/1
        confidence = round(probs[1], 4)  # confidence for class 1 (UP)

        prediction_record = {
            'date': tomorrow_str,
            'term': term,
            'prediction': prediction_value,
            'confidence': confidence,
            'actual': latest_data['prev_close_up_down']
        }
        prediction_log_df = pd.concat([prediction_log_df, pd.DataFrame([prediction_record])], ignore_index=True)

        direction = "UP (1)" if prediction_value == 1 else "DOWN (0)"
        print(f"Prediction for {tomorrow_str} for term '{term}' `next_close`: {direction}")

    except Exception as e:
        print(f"Error predicting for term '{term}' on {tomorrow_str}: {e}")
        
# Append previous logs if file exists
# Load existing log if available
if os.path.exists(prediction_csv):
    existing_predictions = pd.read_csv(prediction_csv)
    
    # Ensure consistent data types
    existing_predictions['date'] = pd.to_datetime(existing_predictions['date'])
    prediction_log_df['date'] = pd.to_datetime(prediction_log_df['date'])
    
    # Merge logic: keep historical data but update today's predictions
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    
    # Remove any existing predictions for today's date
    existing_predictions = existing_predictions[
        existing_predictions['date'].dt.strftime('%Y-%m-%d') != today
    ]
    
    # Combine with new predictions
    prediction_log_df = pd.concat([existing_predictions, prediction_log_df], ignore_index=True)
    
    # Remove duplicates (keeping last entry for each date+term)
    prediction_log_df = prediction_log_df.drop_duplicates(
        subset=['date', 'term'], 
        keep='last'
    )

# Final sort and write
prediction_log_df.sort_values(by=['date', 'term'], inplace=True)
prediction_log_df.to_csv(prediction_csv, index=False)
print("✅ Predictions updated with overwrite protection and saved to log file.")




Processing: SOL
Processing: KAS
Processing: LINK
Processing: ADA
Processing: MATIC
Processing: AVAX
Processing: POPCAT
Processing: SUI
Processing: HNT
Processing: WIF
Processing: BTC
Processing: DOGE
Processing: ETH
Processing: GME
Processing: NVDA
Processing: JPM
Processing: GOOGL
Processing: DXY
Processing: TSMC
Processing: CVX
Processing: COIN
Processing: AMZN
Processing: MSFT
Processing: NFLX
Processing: DIS
Processing: AAPL
Processing: TSLA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, col] = pd.to_numeric(X[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_columns] = X[numeric_columns].replace([np.inf, -np.inf], np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_columns] = X[numeric_columns].clip(lower=-1e10, upper=1e10)


Training First-Level Ensembles...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, col] = X[col].astype(str).fillna("UNKNOWN")


  0%|          | 0/2 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 1560, number of negative: 2663
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30208
[LightGBM] [Info] Number of data points in the train set: 4223, number of used features: 207
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.369406 -> initscore=-0.534767
[LightGBM] [Info] Start training from score -0.534767


Parameters: { "use_label_encoder" } are not used.



Training Meta-Ensemble...
[LightGBM] [Info] Number of positive: 1560, number of negative: 2663
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012688 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30208
[LightGBM] [Info] Number of data points in the train set: 4223, number of used features: 207
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.369406 -> initscore=-0.534767
[LightGBM] [Info] Start training from score -0.534767


Parameters: { "use_label_encoder" } are not used.



Meta-Ensemble Accuracy: 0.71

Classification Report:
               precision    recall  f1-score   support

         0.0       0.73      0.83      0.78       663
         1.0       0.64      0.49      0.55       393

    accuracy                           0.71      1056
   macro avg       0.68      0.66      0.67      1056
weighted avg       0.70      0.71      0.70      1056


Confusion Matrix:
 [[553 110]
 [201 192]]

Predictions for next day per term:
Prediction for 2025-06-08 for term 'SOL' `next_close`: DOWN (0)
Prediction for 2025-06-08 for term 'KAS' `next_close`: DOWN (0)
Prediction for 2025-06-08 for term 'LINK' `next_close`: DOWN (0)
Prediction for 2025-06-08 for term 'ADA' `next_close`: DOWN (0)
Prediction for 2025-06-08 for term 'MATIC' `next_close`: DOWN (0)
Prediction for 2025-06-08 for term 'AVAX' `next_close`: DOWN (0)
Prediction for 2025-06-08 for term 'POPCAT' `next_close`: DOWN (0)
Prediction for 2025-06-08 for term 'SUI' `next_close`: DOWN (0)
Prediction for 2025-0

In [8]:
prediction_log_df.to_csv(prediction_csv, index=False)
print("Predictions completed and appended to the file.")

Predictions completed and appended to the file.


In [9]:
final_combined_data_df

Unnamed: 0,date,term,combined_compound_ma_7,combined_compound_ma_21,combined_compound_ma_50,combined_compound_ma_100,combined_compound_ma_200,combined_compound,open,high,...,crossover_100,crossover_type_100,crossover_200,crossover_type_200,prev_close_up_down,next_close,day_of_week,month,day_of_month,is_weekend
0,2024-11-19,SOL,0.191733,0.227897,0.238858,0.242456,0.244130,0.184109,239.925942,247.702181,...,,,,,0,0.0,1,11,19,False
1,2024-11-20,SOL,0.226319,0.237186,0.242435,0.244191,0.244985,0.330077,237.924820,242.665362,...,,,,,0,1.0,2,11,20,False
2,2024-11-21,SOL,0.240264,0.241269,0.243990,0.244941,0.245354,0.282099,235.659449,259.854713,...,,,,,1,1.0,3,11,21,False
3,2024-11-22,SOL,0.243445,0.242334,0.244343,0.245101,0.245430,0.252988,256.639387,264.572046,...,,,,,1,0.0,4,11,22,False
4,2024-11-23,SOL,0.267342,0.251125,0.248056,0.246961,0.246361,0.339033,257.182699,264.206361,...,,,,,0,0.0,5,11,23,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5274,2025-06-02,TSLA,0.158819,0.127261,0.100760,0.074542,0.052988,0.212105,343.500000,348.020000,...,,,,,0,1.0,0,6,2,False
5275,2025-06-03,TSLA,0.161703,0.131179,0.103489,0.076439,0.054156,0.170356,346.595000,355.400000,...,,,,,1,0.0,1,6,3,False
5276,2025-06-04,TSLA,0.143425,0.127307,0.102905,0.076680,0.054499,0.088593,345.095000,345.600000,...,,,,,0,0.0,2,6,4,False
5277,2025-06-05,TSLA,0.108588,0.116105,0.099030,0.075242,0.053997,0.004078,322.490000,324.549900,...,,,,,0,1.0,3,6,5,False


In [10]:
print("Most recent tweet in dataset:", final_combined_data_df["date"].max())

Most recent tweet in dataset: 2025-06-07 00:00:00


In [11]:
import psycopg2
import pandas as pd
from datetime import datetime

# Database connection parameters
db_params = {
    'dbname': 'twt_snt',
    'user': 'postgres',
    'password': 'Ilpmnl!69gg',
    'host': 'localhost',
    'port': '5432'
}

# Example DataFrame
# predictions_df = pd.DataFrame({
#     'date': [pd.to_datetime('2024-12-20'), pd.to_datetime('2024-12-21')],
#     'term': ['term1', 'term2'],
#     'prediction': [1, 0],
#     'actual': [1, None]
# })

def safe_value(value):
    # Handle None, NaN, and Timestamp conversion
    if pd.isnull(value):
        return None
    if isinstance(value, pd.Timestamp):
        return value.date()  # Convert to a date object
    return value

# Create a connection to the database
conn = psycopg2.connect(**db_params)
cursor = conn.cursor()

# Define the insert query with ON CONFLICT DO NOTHING
insert_query = """
    INSERT INTO predictions_tbl (term, prediction_date, prediction, actual)
    VALUES (%s, %s, %s, %s)
    ON CONFLICT (term, prediction_date) DO NOTHING
"""

batch_size = 40
batch_values = []

try:
    # If necessary, ensure 'date' is in datetime.date format
    # predictions_df['date'] = pd.to_datetime(predictions_df['date']).dt.date

    for index, row in prediction_log_df.iterrows():
        # Prepare values for insertion
        values = (
            safe_value(row['term']),
            safe_value(row['date']),
            safe_value(row['prediction']),
            safe_value(row['actual'])
        )

        batch_values.append(values)

        # Commit in batches
        if len(batch_values) >= batch_size:
            try:
                cursor.executemany(insert_query, batch_values)
                conn.commit()
                print(f"Inserted {len(batch_values)} rows at {datetime.now()}")
                batch_values = []  # Clear the batch list
            except Exception as e:
                print(f"Error inserting batch: {e}")
                conn.rollback()

    # Insert remaining rows
    if batch_values:
        try:
            cursor.executemany(insert_query, batch_values)
            conn.commit()
            print(f"Inserted final batch of {len(batch_values)} rows at {datetime.now()}")
        except Exception as e:
            print(f"Error inserting final batch: {e}")
            conn.rollback()

except Exception as e:
    print(f"Error: {e}")

finally:
    # Close the cursor and connection
    cursor.close()
    conn.close()

Inserted 40 rows at 2025-06-07 20:10:16.779841
Inserted 40 rows at 2025-06-07 20:10:16.795815
Inserted 40 rows at 2025-06-07 20:10:16.814276
Inserted 40 rows at 2025-06-07 20:10:16.833521
Inserted 40 rows at 2025-06-07 20:10:16.851701
Inserted 40 rows at 2025-06-07 20:10:16.869270
Inserted 40 rows at 2025-06-07 20:10:16.887738
Inserted 40 rows at 2025-06-07 20:10:16.905885
Inserted 40 rows at 2025-06-07 20:10:16.923812
Inserted 40 rows at 2025-06-07 20:10:16.940854
Inserted 40 rows at 2025-06-07 20:10:16.957155
Inserted 40 rows at 2025-06-07 20:10:16.973758
Inserted 40 rows at 2025-06-07 20:10:16.990079
Inserted 40 rows at 2025-06-07 20:10:17.007587
Inserted 40 rows at 2025-06-07 20:10:17.023973
Inserted 40 rows at 2025-06-07 20:10:17.041912
Inserted 40 rows at 2025-06-07 20:10:17.060622
Inserted 40 rows at 2025-06-07 20:10:17.077446
Inserted 40 rows at 2025-06-07 20:10:17.093786
Inserted 40 rows at 2025-06-07 20:10:17.111009
Inserted 40 rows at 2025-06-07 20:10:17.127677
Inserted 40 r

### Backfill Missing - not currenty working

In [12]:
import pandas as pd

# Load predictions log and enriched features
df_log = pd.read_csv("predictions_log.csv", parse_dates=["date"])
features_df = pd.read_csv("final_combined_data.csv", parse_dates=["date"])

# Identify missing predictions or confidence
missing_mask = df_log["prediction"].isna() | df_log["confidence"].isna()
df_missing = df_log[missing_mask].copy()

if df_missing.empty:
    print("✅ No missing predictions or confidence values found.")
else:
    print(f"🔁 Found {len(df_missing)} rows to backfill...")

    # Merge to get full features
    df_backfill = pd.merge(df_missing[["date", "term"]], features_df, on=["date", "term"], how="left")

    # Add date-based features (defragmented)
    time_features = pd.DataFrame({
        "day_of_week": df_backfill["date"].dt.dayofweek,
        "month": df_backfill["date"].dt.month,
        "day_of_month": df_backfill["date"].dt.day,
        "is_weekend": df_backfill["date"].dt.dayofweek >= 5
    })
    df_backfill = pd.concat([df_backfill.reset_index(drop=True), time_features], axis=1)

    # Define model input columns
    model_features = features_df.columns.difference(["date", "term"]).tolist()
    model_features += ["day_of_week", "month", "day_of_month", "is_weekend"]

    # Rebuild feature matrix
    X_missing = df_backfill[model_features].copy()

    # === CLEANING STEP THAT FIXES YOUR ISSUE ===
    # Fill missing object (categorical) columns with string 'missing'
    for col in X_missing.select_dtypes(include="object").columns:
        X_missing[col] = X_missing[col].fillna("missing").astype(str)

    # Fill numeric columns with 0 (or another imputation logic)
    for col in X_missing.select_dtypes(include=["number"]).columns:
        X_missing[col] = X_missing[col].fillna(0)

    # Predict
    probs = meta_ensemble.predict_proba(X_missing)[:, 1]
    preds = (probs >= 0.5).astype(int)

    # Backfill
    df_log.loc[missing_mask, "confidence"] = probs
    df_log.loc[missing_mask, "prediction"] = preds

    # Save
    df_log.to_csv("predictions_log_backfilled.csv", index=False)
    print("✅ Backfill complete and saved to predictions_log_backfilled.csv.")


🔁 Found 1503 rows to backfill...


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 1. Create a pipeline (if you're doing preprocessing)
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Optional preprocessing
    ('classifier', RandomForestClassifier(random_state=42))
])

# 2. Define your parameter grid
rf_param_grid = {
    'classifier__n_estimators': [30, 150],
    'classifier__max_depth': [34],
    'classifier__min_samples_split': [25, 26],
    'classifier__min_samples_leaf': [2, 3]
}

# 3. Initialize GridSearchCV with the pipeline
grid_search = GridSearchCV(
    estimator=pipeline_rf,  # Your pipeline or RandomForestClassifier directly
    param_grid=rf_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1  # Use all available CPU cores
)

# 4. Fit the model
grid_search.fit(X_train, y_train)

# 5. Print results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.3f}")

# 6. Get the best model
best_rf = grid_search.best_estimator_
#importances = best_rf.named_steps['classifier'].feature_importances_
#importances

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")


In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Expanded and optimized parameter grid
lr_param_grid = {
    'classifier__C': np.logspace(-3, .75, 4),  # Wider range with logarithmic spacing [-3, 1, 20]
    'classifier__solver': ['liblinear'],  # Both support L1 regularization ['liblinear', 'saga']
    'classifier__penalty': ['l1','l2'],  # Test both regularization types ['l1','l2']
    'classifier__max_iter': [130,200],  # Ensure convergence
    'classifier__class_weight': [None]  # Handle class imbalance [None, 'balanced']
}

# Enhanced GridSearchCV setup
grid_search_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=lr_param_grid,
    cv=5,  # More folds for better validation
    scoring='accuracy',
    n_jobs=-1,  # Parallelize computation
    verbose=1,  # Show progress
    return_train_score=True  # For learning curve analysis
)

# Fit with timing
import time
start_time = time.time()
grid_search_lr.fit(X_train, y_train)
end_time = time.time()

# Enhanced results reporting
print(f"\nBest Parameters: {grid_search_lr.best_params_}")
print(f"Best CV Accuracy: {grid_search_lr.best_score_:.4f}")
print(f"Training Time: {end_time - start_time:.2f} seconds")

# Additional useful outputs
print("\nTop 5 Parameter Combinations:")
results = pd.DataFrame(grid_search_lr.cv_results_)
top_results = results.sort_values('mean_test_score', ascending=False).head(5)
print(top_results[['params', 'mean_test_score', 'std_test_score']])

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_lr, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")


In [None]:
gb_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 7]
}

grid_search_gb = GridSearchCV(estimator=pipeline_gb, param_grid=gb_param_grid, cv=3, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)

print(f"Best Gradient Boosting Parameters: {grid_search_gb.best_params_}")

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# 1. First create a preprocessing pipeline that handles:
#    - Missing values (None/NaN)
#    - String values that should be numeric
#    - Feature scaling

preprocessor = ColumnTransformer(
    transformers=[
        ('num', make_pipeline(
            SimpleImputer(strategy='median'),  # Handles None/NaN
            StandardScaler()
        ), X_train.select_dtypes(include=['number']).columns),
    ],
    remainder='drop'  # Drops non-numeric columns
)

# 2. Now create the full pipeline with preprocessing and SVM
pipeline_svm = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42))
])

# 3. Use a simpler parameter grid for initial testing
svm_param_grid = {
    'classifier__C': [.75, 1, 3],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': ['scale']
}

# 4. Run grid search with error_score='raise' to see exact errors
grid_search_svm = GridSearchCV(
    estimator=pipeline_svm,
    param_grid=svm_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    error_score='raise'  # Will show exact error if any
)

# 5. Fit the model
print("Starting SVM hyperparameter tuning...")
try:
    grid_search_svm.fit(X_train, y_train)
    print(f"\nBest SVM Parameters: {grid_search_svm.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search_svm.best_score_:.3f}")
except Exception as e:
    print(f"\nError during fitting: {str(e)}")
    print("\nCheck your input data for:")
    print("- Non-numeric values (especially strings like 'None')")
    print("- Missing values (NaN or None)")
    print("- Infinite values")

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_svm, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
cb_param_grid = {
    'classifier__iterations': [140, 145],
    'classifier__learning_rate': [0.15, 0.2],
    'classifier__depth': [3, 4]
}

grid_search_cb = GridSearchCV(estimator=pipeline_catboost, param_grid=cb_param_grid, cv=3, scoring='accuracy')
grid_search_cb.fit(X_train, y_train)

print(f"Best CatBoost Parameters: {grid_search_cb.best_params_}")

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_catboost, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
## XGB BOOSTER

from sklearn.model_selection import RandomizedSearchCV

xgb_param_grid = {
    'classifier__n_estimators': [50, 75, 100, 150],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.5, 0.7, 1.0],
    'classifier__gamma': [0, 1, 5],  # min split loss
    'classifier__reg_lambda': [1, 5, 10],  # L2
    'classifier__reg_alpha': [0, 0.5, 1],  # L1
}


random_search_xgb = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=xgb_param_grid,
    n_iter=50,  # try 50 combinations
    cv=3,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

random_search_xgb.fit(X_train, y_train)
print(f"Best XGBoost Parameters: {random_search_xgb.best_params_}")


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_xgb, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
# Add to your existing classifier/pipeline section
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Add to your pipeline definitions
pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

# Add to your model combinations list
models = [
    # ... existing models ...
    ('knn', pipeline_knn)
]

# Then you can run grid search with your shown parameters
knn_param_grid = {
    'classifier__n_neighbors': [15, 20, 25],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan']
}

grid_search_knn = GridSearchCV(estimator=pipeline_knn, 
                              param_grid=knn_param_grid, 
                              cv=3, 
                              scoring='accuracy',
                              n_jobs=-1)
grid_search_knn.fit(X_train, y_train)
print(f"Best k-NN Parameters: {grid_search_knn.best_params_}")

# Preprocessor remains the same as in your existing code
transformers = [
    ('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_columns)
]
if categorical_columns:
    transformers.append(('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns))
preprocessor = ColumnTransformer(transformers)

# Define all model pipelines
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=90,
        max_depth=14,
        min_samples_split=10,
        min_samples_leaf=3,
        random_state=42
    ))
])

pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        C=0.1,
        solver='liblinear',
        penalty='l1',
        random_state=42
    ))
])

pipeline_svm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(
        C=1.0,
        kernel='rbf',
        gamma='auto',
        probability=True,
        random_state=42
    ))
])

pipeline_lgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        n_estimators=197,
        learning_rate=0.24,
        max_depth=11,
        num_leaves=24,
        subsample=0.62,
        random_state=42
    ))
])

pipeline_et = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ExtraTreesClassifier(
        n_estimators=35,
        max_depth=8,
        min_samples_split=2,
        min_samples_leaf=3,
        random_state=42
    ))
])

pipeline_nb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

pipeline_ridge = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RidgeClassifier(
        alpha=13.15,
        solver='auto',
        random_state=42
    ))
])

In [None]:
##pipeline_lgbm

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

# 1. Clean and prepare data
X_train = X_train.replace({
    'False': False, 
    'True': True,
    'None': np.nan,
    'nan': np.nan
})

# Convert all boolean columns to float (0.0/1.0)
bool_cols = X_train.select_dtypes(include=['bool']).columns
X_train[bool_cols] = X_train[bool_cols].astype(float)

# 2. Identify column types
numeric_cols = X_train.select_dtypes(include=['number']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

print(f"Numeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")

# 3. Create transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# 4. Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'
)

# 5. LightGBM pipeline
pipeline_lgbm = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        random_state=42,
        verbose=-1
    ))
])

# 6. Minimal parameter grid for testing
lgbm_param_grid = {
    'classifier__n_estimators': [220, 260],  # 
    'classifier__learning_rate': [0.09, 0.11],  # Wider range
    'classifier__max_depth': [-1],  # -1 means no limit
    'classifier__num_leaves': [23, 24],  # Should be <= 2^max_depth
    'classifier__subsample': [0.6,0.7],  # More reasonable fractions
    'classifier__colsample_bytree': [0.6],  # Added feature subsampling
    'classifier__reg_alpha': [0.2, 0.25],  # L1 regularization
    'classifier__reg_lambda': [0.1]  # L2 regularization
}

# 7. Run grid search
grid_search_lgbm = GridSearchCV(
    estimator=pipeline_lgbm,
    param_grid=lgbm_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    error_score='raise'
)

# 8. Fit the model
print("\nStarting LightGBM hyperparameter tuning...")
try:
    grid_search_lgbm.fit(X_train, y_train)
    print(f"\nBest Parameters: {grid_search_lgbm.best_params_}")
    print(f"Best Accuracy: {grid_search_lgbm.best_score_:.3f}")
    
except Exception as e:
    print(f"\nError during fitting: {str(e)}")
    print("\nFinal debugging steps:")
    print("1. Check for any remaining non-numeric values:")
    print(X_train.apply(lambda x: pd.api.types.is_numeric_dtype(x)).value_counts())
    print("\n2. Check sample of each column type:")
    print("\nNumeric columns sample:")
    print(X_train[numeric_cols].head(2))
    print("\nCategorical columns sample:")
    print(X_train[categorical_cols].head(2))

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_lgbm, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
##EXTRA TREES

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd

# 1. Clean and prepare data
X_train = X_train.replace({
    'False': False, 
    'True': True,
    'None': np.nan,
    'nan': np.nan
})

# Convert all boolean columns to float (0.0/1.0)
bool_cols = X_train.select_dtypes(include=['bool']).columns
X_train[bool_cols] = X_train[bool_cols].astype(float)

# 2. Identify column types
numeric_cols = X_train.select_dtypes(include=['number']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

print(f"Numeric columns: {len(numeric_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")

# 3. Create transformers
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # Optional but can help feature importance
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# 4. Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'
)

# 5. Extra Trees pipeline
pipeline_et = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ExtraTreesClassifier(
        random_state=42,
        n_jobs=-1  # Use all cores
    ))
])

# 6. Parameter grid - expanded version
et_param_grid = {
    'classifier__n_estimators': [120, 130],
    'classifier__max_depth': [None],  # None for unlimited depth
    'classifier__min_samples_split': [4, 5],
    'classifier__min_samples_leaf': [2],
    'classifier__max_features': [0.5],  # Feature selection
    'classifier__bootstrap': [False]  # Whether to bootstrap samples
}

# 7. Run grid search with error handling
grid_search_et = GridSearchCV(
    estimator=pipeline_et,
    param_grid=et_param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,  # Parallel processing
    verbose=1,
    error_score='raise'
)

# 8. Fit the model
print("\nStarting Extra Trees hyperparameter tuning...")
try:
    grid_search_et.fit(X_train, y_train)
    print(f"\nBest Parameters: {grid_search_et.best_params_}")
    print(f"Best Accuracy: {grid_search_et.best_score_:.3f}")
    
    # Feature importance
    if hasattr(grid_search_et.best_estimator_.named_steps['classifier'], 'feature_importances_'):
        print("\nTop 10 Features:")
        importances = grid_search_et.best_estimator_.named_steps['classifier'].feature_importances_
        feature_names = (
            numeric_cols.tolist() + 
            grid_search_et.best_estimator_.named_steps['preprocessor']
                .named_transformers_['cat']
                .named_steps['onehot']
                .get_feature_names_out(categorical_cols).tolist()
        )
        for feat, imp in sorted(zip(feature_names, importances), 
                              key=lambda x: x[1], reverse=True)[:10]:
            print(f"{feat}: {imp:.4f}")
            
except Exception as e:
    print(f"\nError during fitting: {str(e)}")
    print("\nDebugging steps:")
    print("1. Check for remaining non-numeric values:")
    print(X_train.apply(lambda x: pd.api.types.is_numeric_dtype(x)).value_counts())
    print("\n2. Check sample data:")
    print("\nNumeric columns:")
    print(X_train[numeric_cols].head(2))
    print("\nCategorical columns:")
    print(X_train[categorical_cols].head(2))

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_et, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
from sklearn.naive_bayes import GaussianNB

nb_param_grid = {
    'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]
}

grid_search_nb = GridSearchCV(estimator=pipeline_nb, param_grid=nb_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_nb.fit(X_train, y_train)

print(f"Best Naive Bayes Parameters: {grid_search_nb.best_params_}")


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_nb, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
from sklearn.linear_model import RidgeClassifier

ridge_param_grid = {
    'classifier__alpha': [13.15, 13.25, 13.05],
    'classifier__solver': ['auto', 'svd', 'cholesky']
}

grid_search_ridge = GridSearchCV(estimator=pipeline_ridge, param_grid=ridge_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_ridge.fit(X_train, y_train)

print(f"Best Ridge Classifier Parameters: {grid_search_ridge.best_params_}")


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_ridge, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_param_grid = {
    'classifier__n_neighbors': [15, 20, 25],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan']
}

grid_search_knn = GridSearchCV(estimator=pipeline_knn, param_grid=knn_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search_knn.fit(X_train, y_train)

print(f"Best k-NN Parameters: {grid_search_knn.best_params_}")


In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline_knn, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.2f} ± {np.std(scores):.2f}")

In [None]:
from itertools import combinations

# List of models
models = [
    ('random_forest', pipeline_rf),
    ('logistic_regression', pipeline_lr),
    ('gradient_boosting', pipeline_gb),
    ('catboost', pipeline_catboost),
    ('xgboost', pipeline_xgb),
    ('svm', pipeline_svm),
    ('lightgbm', pipeline_lgbm),
    ('extra_trees', pipeline_et),
    ('naive_bayes', pipeline_nb)
#    ('ridge', pipeline_ridge),
#    ('knn', pipeline_knn)  # Added KNN here
]

# Generate all combinations (choose subsets of length 2 to all)
for r in range(2, len(models) + 1):  # Minimum of 2 models
    for combo in combinations(models, r):
        print(f"Evaluating combination: {[name for name, _ in combo]}")


Optimize ensemble model selection with Randomized Search

In [None]:
from sklearn.metrics import accuracy_score

best_score = 0
best_combination = None

# Generate and evaluate each combination
for r in range(2, len(models) + 1):
    for combo in combinations(models, r):
        # Create a VotingClassifier with the current combination
        ensemble = VotingClassifier(estimators=list(combo), voting='soft')
        
        # Fit the model
        ensemble.fit(X_train, y_train)
        
        # Evaluate on the test set
        y_pred = ensemble.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(f"Combination: {[name for name, _ in combo]} | Accuracy: {score:.4f}")
        
        # Track the best score and combination
        if score > best_score:
            best_score = score
            best_combination = combo

print(f"Best Combination: {[name for name, _ in best_combination]} | Accuracy: {best_score:.4f}")


In [None]:
import pandas as pd

results = []

for r in range(2, len(models) + 1):
    for combo in combinations(models, r):
        ensemble = VotingClassifier(estimators=list(combo), voting='soft')
        ensemble.fit(X_train, y_train)
        y_pred = ensemble.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        results.append({'combination': [name for name, _ in combo], 'accuracy': score})

# Convert to DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='accuracy', ascending=False)

# Display the best combinations
print(results_df.head(10))
#Combination: ['gradient_boosting', 'catboost'] | Accuracy: 0.7042

In [None]:
import pandas as pd

results = []

for r in range(2, len(models) + 1):
    for combo in combinations(models, r):
        ensemble = VotingClassifier(estimators=list(combo), voting='soft')
        ensemble.fit(X_train, y_train)
        y_pred = ensemble.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        results.append({'combination': [name for name, _ in combo], 'accuracy': score})

# Convert to DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='accuracy', ascending=False)

# Display the best combinations
print(results_df.head())


In [None]:
best_combination_names = [name for name, _ in best_combination]
print(f"Best Models: {best_combination_names}")


In [None]:
# 172        [random_forest, svm, lightgbm, extra_trees]
# 65   [logistic_regression, gradient_boosting, xgboost]

# Meta-Ensemble

In [None]:
# First ensemble
ensemble_1 = VotingClassifier(
    estimators=[
#        ('gradient_boosting', pipeline_gb),
        ('randomforest', pipeline_rf),
        ('svm', pipeline_svm),
        ('lightgbm', pipeline_lgbm),
        ('extratrees', pipeline_et)
    ],
    voting='soft'
)

# Second ensemble
ensemble_2 = VotingClassifier(
    estimators=[
        ('gradient_boosting', pipeline_gb),
        ('logistic_regression', pipeline_lr),
#        ('lightgbm', pipeline_lgbm)
#        ('catboost', pipeline_catboost),
        ('xgboost', pipeline_xgb)
#        ('svm', pipeline_svm)
    ],
    voting='soft'
)




In [None]:
from tqdm.notebook import tqdm

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the first-level ensembles with progress bar
print("Training First-Level Ensembles...")
with tqdm(total=2, desc="First-Level Ensembles") as pbar:
    ensemble_1.fit(X_train, y_train)  # Fit ensemble_1
    pbar.update(1)
    ensemble_2.fit(X_train, y_train)  # Fit ensemble_2
    pbar.update(1)


In [None]:
# Define the meta-ensemble
meta_ensemble = VotingClassifier(
    estimators=[
        ('ensemble_1', ensemble_1),
        ('ensemble_2', ensemble_2)
    ],
    voting='soft'
)

# Fit the meta-ensemble
print("Training Meta-Ensemble...")
meta_ensemble.fit(X_train, y_train)


In [None]:
# Evaluate the meta-ensemble
y_pred_meta = meta_ensemble.predict(X_test)
accuracy_meta = accuracy_score(y_test, y_pred_meta)
print(f'Meta-Ensemble Accuracy: {accuracy_meta:.2f}')
print("\nMeta-Ensemble Classification Report:")
print(classification_report(y_test, y_pred_meta))
print("\nMeta-Ensemble Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_meta))


In [None]:
results_df.head(10)

In [None]:

# ✅ Deduplicate predictions before saving
predictions_df = predictions_df.sort_values(by='confidence', ascending=False)
predictions_df = predictions_df.drop_duplicates(subset=['date', 'term'], keep='first')

# Optional sanity check
assert predictions_df[['date', 'term']].duplicated().sum() == 0, "Still has duplicates!"

# Save cleaned predictions
predictions_df.to_csv("predictions_log.csv", index=False)
print("✅ Cleaned predictions_log.csv saved.")
