This notebook covers data extraction for the dataset used to train the CELFANN model in the research paper "A hybrid stock trading framework integrating technical analysis with machine learning" (Dash & Dash, 2016)

This notebook extracts financial data used to train our implementation of the CEFLANN model from "A hybrid stock trading framework integrating technical analysis with machine learning" (Dash & Dash, 2016)

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import datetime
plt.style.use('seaborn-v0_8-darkgrid')

In [2]:
# Set data parameters
start_date = '2020-01-01'
end_date = '2025-03-14'  
tickers = {
    'NVIDIA': 'NVDA',
    'Taiwan Semiconductors': 'TSM',
    'Invesco QQQ': 'QQQ'
}

In [3]:
# Downloading data
data = {}
for name, ticker in tickers.items():
    print(f"Downloading data for {name} ({ticker})...")
    data[name] = yf.download(ticker, start=start_date, end=end_date)
    print(f"Downloaded {len(data[name])} days of data for {name}.")

Downloading data for NVIDIA (NVDA)...
YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Downloaded 1306 days of data for NVIDIA.
Downloading data for Taiwan Semiconductors (TSM)...



[*********************100%***********************]  1 of 1 completed

Downloaded 1306 days of data for Taiwan Semiconductors.
Downloading data for Invesco QQQ (QQQ)...
Downloaded 1306 days of data for Invesco QQQ.





In [4]:
# Extract closing prices for correlation analysis
close_prices = pd.DataFrame()
for name, ticker_data in data.items():
    close_prices[name] = ticker_data['Close']

print(close_prices)
print(type(close_prices))

                NVIDIA  Taiwan Semiconductors  Invesco QQQ
Date                                                      
2020-01-02    5.972161              54.504330   209.638077
2020-01-03    5.876570              52.706879   207.717773
2020-01-06    5.901215              52.098648   209.056107
2020-01-07    5.972660              52.942913   209.027054
2020-01-08    5.983862              53.333256   210.598190
...                ...                    ...          ...
2025-03-07  112.679642             177.100006   491.790009
2025-03-10  106.970161             170.649994   472.730011
2025-03-11  108.750000             170.970001   471.600006
2025-03-12  115.739998             177.169998   476.920013
2025-03-13  115.580002             171.589996   468.339996

[1306 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Calculate correlation matrix
correlation = close_prices.corr()
print("\nPrice Correlation Matrix:")
print(correlation)

# Visualize correlation
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Closing Prices Correlation Matrix')
plt.tight_layout()
plt.show()

In [6]:
for name, ticker_data in data.items():
    print (data[name])

Price            Close        High         Low        Open     Volume
Ticker            NVDA        NVDA        NVDA        NVDA       NVDA
Date                                                                 
2020-01-02    5.972161    5.972161    5.892752    5.943285  237536000
2020-01-03    5.876570    5.920383    5.827530    5.852424  205384000
2020-01-06    5.901215    5.906442    5.757083    5.783220  262636000
2020-01-07    5.972660    6.018463    5.884537    5.929594  314856000
2020-01-08    5.983862    6.025185    5.928350    5.968428  277108000
...                ...         ...         ...         ...        ...
2025-03-07  112.679642  113.469570  107.550108  111.239772  341755500
2025-03-10  106.970161  111.839709  105.450297  109.889891  366487400
2025-03-11  108.750000  112.229676  104.760361  106.980159  354865700
2025-03-12  115.739998  116.760002  112.879997  114.120003  323857500
2025-03-13  115.580002  117.760002  113.790001  117.029999  299033100

[1306 rows x 5 colu

In [7]:
# Function calculates six technical indicators used by the CEFLANN model
def calculate_technical_indicators(df):

    data = df.copy()

    # Convert all dataframe columns to series for calculations
    close_series = data['Close'].squeeze()
    low_series = data['Low'].squeeze()
    high_series = data['High'].squeeze()
    
    # 1: Simple Moving Average (MA15)
    data['MA15'] = close_series.rolling(window=15).mean()
    
    # 2: MACD (12, 26)
    data['EMA12'] = close_series.ewm(span=12, adjust=False).mean()
    data['EMA26'] = close_series.ewm(span=26, adjust=False).mean()
    data['MACD26'] = data['EMA12'] - data['EMA26']
    
    # 3: Stochastic Oscillator K14
    n = 14
    data['L14'] = low_series.rolling(window=n).min()
    data['H14'] = high_series.rolling(window=n).max()
    data['K14'] = 100 * ((close_series - data['L14']) / (data['H14'] - data['L14']))
    
    # 4: Stochastic Oscillator D3
    data['D3'] = data['K14'].rolling(window=3).mean()
    
    # 5: Relative Strength Index (14)
    delta = close_series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    data['RSI14'] = 100 - (100 / (1 + rs))
    
    # 6: Larry Williams R% (WR14)
    data['WR14'] = -100 * ((data['H14'] - close_series) / (high_series - low_series))
    
    # Clean up columns
    data = data.drop(['L14', 'H14', 'EMA12', 'EMA26'], axis=1)
    
    # Columns added: MA15, MACD26, K14, D3, RSI14, WR14
    return data

In [8]:
# Calculate technical indicators for all tickers
tech_data = data.copy()
for name in tickers.keys():
    tech_data[name] = calculate_technical_indicators(tech_data[name])
    

In [9]:
selected_data = {ticker: df[['MA15', 'MACD26', 'K14', 'D3', 'RSI14', 'WR14']] 
                 for ticker, df in tech_data.items()}
print(selected_data)

{'NVIDIA': Price             MA15    MACD26        K14         D3      RSI14        WR14
Ticker                                                                       
Date                                                                         
2020-01-02         NaN  0.000000        NaN        NaN        NaN         NaN
2020-01-03         NaN -0.007625        NaN        NaN        NaN         NaN
2020-01-06         NaN -0.011547        NaN        NaN        NaN         NaN
2020-01-07         NaN -0.008789        NaN        NaN        NaN         NaN
2020-01-08         NaN -0.005634        NaN        NaN        NaN         NaN
...                ...       ...        ...        ...        ...         ...
2025-03-07  126.381043 -5.053963  14.297671  12.416652  27.671568 -519.425150
2025-03-10  124.256572 -5.779603   4.222232   6.633344  24.568575 -539.593344
2025-03-11  122.214094 -6.140279  10.874909   9.798271  26.687085 -437.751040
2025-03-12  120.648947 -5.795278  29.928161  15.00843

In [10]:
# Normalize indicators using Min-Max normalization
# The function returns a dataframe of normalized indicator values, along with the statistical distribution of each indicator.
def normalize_indicators(df):

    indicators = ['MA15', 'MACD26', 'K14', 'D3', 'RSI14', 'WR14']
    
    normalized_df = df[indicators].copy()
    stats = {}
    for indicator in indicators:
        stats[indicator] = {
            'min': df[indicator].min(),
            'max': df[indicator].max(),
            'mean': df[indicator].mean(),
            'std': df[indicator].std()
        }
        
       
        normalized_df[indicator] = (df[indicator] - stats[indicator]['min']) / \
                                  (stats[indicator]['max'] - stats[indicator]['min'])
    
    
    normalized_df['Close'] = df['Close']
    
    return normalized_df, stats

In [11]:
# Normalize the technical indicators for all stocks
normalized_data = {}
indicator_stats = {}

for name in tickers.keys():
    normalized_data[name], indicator_stats[name] = normalize_indicators(tech_data[name])
    print(f"Normalized technical indicators for {name}")
    #dx = tech_data[name]
    #print(dx['MA15', 'MACD26', 'K14', 'D3', 'RSI14', 'WR14'])

Normalized technical indicators for NVIDIA
Normalized technical indicators for Taiwan Semiconductors
Normalized technical indicators for Invesco QQQ


In [12]:
# Display summary statistics for NVDA indicators
indicator_summary = pd.DataFrame(indicator_stats['NVIDIA']).T.round(2)
indicator_summary
#normalized_data['NVIDIA'].tail()

Unnamed: 0,min,max,mean,std
MA15,5.75,144.53,41.58,40.97
MACD26,-6.14,9.77,0.65,1.86
K14,0.19,100.0,58.99,30.62
D3,0.95,99.2,58.99,28.49
RSI14,8.19,96.99,55.94,16.76
WR14,-1276.84,-0.0,-197.97,178.24


In [13]:
nvda_df = tech_data['NVIDIA'].copy()
nvda_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1306 entries, 2020-01-02 to 2025-03-13
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (Close, NVDA)   1306 non-null   float64
 1   (High, NVDA)    1306 non-null   float64
 2   (Low, NVDA)     1306 non-null   float64
 3   (Open, NVDA)    1306 non-null   float64
 4   (Volume, NVDA)  1306 non-null   int64  
 5   (MA15, )        1292 non-null   float64
 6   (MACD26, )      1306 non-null   float64
 7   (K14, )         1293 non-null   float64
 8   (D3, )          1291 non-null   float64
 9   (RSI14, )       1293 non-null   float64
 10  (WR14, )        1293 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 122.4 KB


Issues: Alignment error between the columns 'MA15' and 'Close'.
Solution: Align columns first.

In [14]:
# Some basic information about the DataFrame
print("DataFrame type:", type(nvda_df))
print("DataFrame shape:", nvda_df.shape)
print("DataFrame columns:", nvda_df.columns)
print("Column types:", nvda_df.dtypes)

# Inspect the following rows
print(nvda_df.iloc[0:31, 5:10])

print("\nClose column type:", type(nvda_df['Close']))
print("MA15 column type:", type(nvda_df['MA15']))

DataFrame type: <class 'pandas.core.frame.DataFrame'>
DataFrame shape: (1306, 11)
DataFrame columns: MultiIndex([( 'Close', 'NVDA'),
            (  'High', 'NVDA'),
            (   'Low', 'NVDA'),
            (  'Open', 'NVDA'),
            ('Volume', 'NVDA'),
            (  'MA15',     ''),
            ('MACD26',     ''),
            (   'K14',     ''),
            (    'D3',     ''),
            ( 'RSI14',     ''),
            (  'WR14',     '')],
           names=['Price', 'Ticker'])
Column types: Price   Ticker
Close   NVDA      float64
High    NVDA      float64
Low     NVDA      float64
Open    NVDA      float64
Volume  NVDA        int64
MA15              float64
MACD26            float64
K14               float64
D3                float64
RSI14             float64
WR14              float64
dtype: object
Price           MA15    MACD26        K14         D3      RSI14
Ticker                                                         
Date                                               

In [None]:
# Uses the MA15 to determine uptrend and downtrend sections.
# Extract data
nvda_df = tech_data['NVIDIA'].copy()
close_prices = nvda_df[('Close', 'NVDA')]
ma15_values = nvda_df[('MA15', '')]

aligned_rows = close_prices.notna() & ma15_values.notna() #Boolean table

# Aligned mask
uptrend_mask = (close_prices > ma15_values) & aligned_rows
downtrend_mask = (close_prices < ma15_values) & aligned_rows

# For data structures debugging
#print("\nClose column type:", type(close_prices))
#print("MA15 column type:", type(ma15_values))
#print("Uptrend:", uptrend_mask.iloc[1:30])
#print("Downtrend:", downtrend_mask.iloc[1:30])

uptrend = nvda_df.loc[uptrend_mask]
downtrend = nvda_df.loc[downtrend_mask]

# Plot results
plt.figure(figsize=(15, 10))

# Close price and MA15
plt.plot(nvda_df.index, close_prices, label='Close Price', color='blue')
plt.plot(nvda_df.index, ma15_values, label='MA15', color='red', linestyle='--')

# Uptrend and downtrend periods
plt.scatter(uptrend.index, uptrend[('Close', 'NVDA')], color='green', alpha=0.5, s=15, label='Uptrend')
plt.scatter(downtrend.index, downtrend[('Close', 'NVDA')], color='red', alpha=0.5, s=15, label='Downtrend')

plt.legend()
plt.title('NVIDIA Trend Analysis (2020-2025)', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [16]:
# Restructured data function
def restructure_dataframes(data):
    restructured_data = {}
    
    for name, df in data.items():

        new_df = pd.DataFrame()
        
        # First pass - create technical indicator columns without ticker
        for col in df.columns:
            if isinstance(col, tuple):
                if col[0] in ['MA15', 'MACD26', 'K14', 'D3', 'RSI14', 'WR14']:
                    # Technical indicators get simple names
                    new_col_name = col[0]
                    new_df[new_col_name] = df[col]
                elif len(col) > 1 and col[1]:
                    # Price data gets ticker-specific names
                    new_col_name = f"{col[0]}_{col[1]}"
                    new_df[new_col_name] = df[col]
        
        # Create simple price columns for the current ticker only
        ticker_code = tickers[name]
        new_df['Close'] = new_df[f'Close_{ticker_code}']
        new_df['High'] = new_df[f'High_{ticker_code}']
        new_df['Low'] = new_df[f'Low_{ticker_code}']
        new_df['Open'] = new_df[f'Open_{ticker_code}']
        new_df['Volume'] = new_df[f'Volume_{ticker_code}']
        
        new_df.index = df.index
        restructured_data[name] = new_df
    
    return restructured_data

# Apply the restructuring
df_new = tech_data.copy()
new_tech_data = restructure_dataframes(df_new)

In [17]:
for name in tickers.keys():
    print(f"Dataframe info for {name}:", new_tech_data[name].info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1306 entries, 2020-01-02 to 2025-03-13
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Close_NVDA   1306 non-null   float64
 1   High_NVDA    1306 non-null   float64
 2   Low_NVDA     1306 non-null   float64
 3   Open_NVDA    1306 non-null   float64
 4   Volume_NVDA  1306 non-null   int64  
 5   MA15         1292 non-null   float64
 6   MACD26       1306 non-null   float64
 7   K14          1293 non-null   float64
 8   D3           1291 non-null   float64
 9   RSI14        1293 non-null   float64
 10  WR14         1293 non-null   float64
 11  Close        1306 non-null   float64
 12  High         1306 non-null   float64
 13  Low          1306 non-null   float64
 14  Open         1306 non-null   float64
 15  Volume       1306 non-null   int64  
dtypes: float64(14), int64(2)
memory usage: 173.5 KB
Dataframe info for NVIDIA: None
<class 'pandas.core.frame.DataFram

In [18]:
# This function prepares data for CEFLANN model training
def prepare_data(normalized_df, name, window_size=15):

    df = normalized_df.copy()
    
    # Original dataframe with restructured columns
    original_df = new_tech_data[name]
    
    # Adds the trend column (1 for uptrend, 0 for downtrend)
    df['trend'] = 0
    
    # Creates mask for valid data rows (both Close and MA15 are not NaN)
    valid_mask = original_df['Close'].notna() & original_df['MA15'].notna()
    
    ma_rising = original_df['MA15'].diff(5) > 0
    close_above_ma = original_df['Close'] > original_df['MA15']
    df.loc[valid_mask & ma_rising & close_above_ma, 'trend'] = 1
    
    # Downtrend: Close < MA15 and MA15 falling for 5 days
    ma_falling = original_df['MA15'].diff(5) < 0
    close_below_ma = original_df['Close'] < original_df['MA15']
    # trend=0 is already the default value
    
    # Generate trading signals in range 0-1 based on trend
    df['trading_signal'] = 0.5  # default for no clear trend
    
    for i in range(len(df) - 2):
        if i+3 > len(original_df):
            continue
            
        # Get prices for next 3 days
        prices = original_df['Close'].iloc[i:i+3].values
        
        if len(prices) < 3 or max(prices) <= min(prices):
            continue
            
        min_cp = min(prices)
        max_cp = max(prices)
        current_cp = original_df['Close'].iloc[i]
        
        # Calculate trading signal
        if df['trend'].iloc[i] == 1:  # Uptrend
            df.iloc[i, df.columns.get_loc('trading_signal')] = ((current_cp - min_cp) / (max_cp - min_cp)) * 0.5 + 0.5
        else:  # Downtrend
            df.iloc[i, df.columns.get_loc('trading_signal')] = ((current_cp - min_cp) / (max_cp - min_cp)) * 0.5
    
    # Drop NaN values
    df.dropna(inplace=True)
    
    # Features (X) and target (y) - now with simple column names
    X = df[['MA15', 'MACD26', 'K14', 'D3', 'RSI14', 'WR14']].values
    y = df['trading_signal'].values
    
    return df, X, y

In [19]:
# Preparing the data for CEFLANN model for all indices
ceflann_data = {}
features = {}
targets = {}

for name in tickers.keys():
    
    ceflann_data[name], features[name], targets[name] = prepare_data(normalized_data[name], name)
    print(f"Prepared {len(features[name])} samples for CEFLANN model training for {name}")

Prepared 1291 samples for CEFLANN model training for NVIDIA
Prepared 1291 samples for CEFLANN model training for Taiwan Semiconductors
Prepared 1291 samples for CEFLANN model training for Invesco QQQ


In [20]:
# Display a sample of the prepared data for NVIDIA
#for name in tickers.keys():
print(ceflann_data['NVIDIA'].info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1291 entries, 2020-01-24 to 2025-03-13
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   (MA15, )            1291 non-null   float64
 1   (MACD26, )          1291 non-null   float64
 2   (K14, )             1291 non-null   float64
 3   (D3, )              1291 non-null   float64
 4   (RSI14, )           1291 non-null   float64
 5   (WR14, )            1291 non-null   float64
 6   (Close, )           1291 non-null   float64
 7   (trend, )           1291 non-null   int64  
 8   (trading_signal, )  1291 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 100.9 KB
None


In [21]:
# Split the data into training and testing sets
def split_data(X, y, train_ratio=0.8):
    train_size = int(len(X) * train_ratio)
    
    X_train = X[:train_size]
    y_train = y[:train_size]
    X_test = X[train_size:]
    y_test = y[train_size:]
    
    return X_train, y_train, X_test, y_test

# Split the data for both indices
train_test_data = {}

for name in tickers.keys():
    X_train, y_train, X_test, y_test = split_data(features[name], targets[name])
    train_test_data[name] = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test
    }
    print(f"Split {name} data into {len(X_train)} training samples and {len(X_test)} testing samples")

Split NVIDIA data into 1032 training samples and 259 testing samples
Split Taiwan Semiconductors data into 1032 training samples and 259 testing samples
Split Invesco QQQ data into 1032 training samples and 259 testing samples


In [26]:
# Export the dataframes to CSV
new_tech_data['NVIDIA'].to_csv('nvidia_technical_data.csv')
normalized_data['NVIDIA'].to_csv('nvidia_normalized_data.csv')

# Export training and testing sets to npy
np.save('nvidia_X_train.npy', train_test_data['NVIDIA']['X_train'])
np.save('nvidia_y_train.npy', train_test_data['NVIDIA']['y_train'])
np.save('nvidia_X_test.npy', train_test_data['NVIDIA']['X_test'])
np.save('nvidia_y_test.npy', train_test_data['NVIDIA']['y_test'])
with open('nvidia_info.txt', 'w') as f:
    f.write(f"{new_tech_data['NVIDIA'].index[-1]},{new_tech_data['NVIDIA']['Close'].iloc[-1]}")

print("Dataframe, train, and test data has been exported.")

Dataframe, train, and test data has been exported.


In [27]:
# Summary of the dataset preparation
print("Dataset Summary for CEFLANN Model Implementation")
print("="*60)
print(f"Time Period: {start_date} to {end_date}")
print(f"Number of Trading Days: NVIDIA = {len(data['NVIDIA'])}")
print("\nTechnical Indicators Used:")
print("  - Simple Moving Average (MA15)")
print("  - Moving Average Convergence Divergence (MACD26)")
print("  - Stochastic Oscillator K14 and D3")
print("  - Relative Strength Index (RSI14)")
print("  - Larry Williams R% (WR14)")
print("\nData Preprocessing:")
print("  - Calculated technical indicators")
print("  - Normalized indicators to range [0,1]")
print("  - Generated trading signals in range [0,1]")
print("  - Split data into training (80%) and testing (20%) sets")
print("\nThe dataset is now ready for implementing the CEFLANN model with ELM learning")

Dataset Summary for CEFLANN Model Implementation
Time Period: 2020-01-01 to 2025-03-14
Number of Trading Days: NVIDIA = 1306

Technical Indicators Used:
  - Simple Moving Average (MA15)
  - Moving Average Convergence Divergence (MACD26)
  - Stochastic Oscillator K14 and D3
  - Relative Strength Index (RSI14)
  - Larry Williams R% (WR14)

Data Preprocessing:
  - Calculated technical indicators
  - Normalized indicators to range [0,1]
  - Generated trading signals in range [0,1]
  - Split data into training (80%) and testing (20%) sets

The dataset is now ready for implementing the CEFLANN model with ELM learning
