## Approximate Nearest Neighbours for Stock Market Pattern Search/Recognition
The purpose of this notebook is to retrieve candlestick patterns in the stock market that are similar to your input pattern. Approximate nearest neighbor algorithms are used to perform the vector similarity search. To feed the search space, you need a CSV file with the following information: 

* Time (YYYY-MM-DD), open, high, low, close. 

The following row is an example of this format 
* 2015-08-07T00:00:00Z, 2.831620, 3.524990, 2.590420, 2.789720

From the fed CSV file, pick the index of any bar and choose a window size. Index 0 is the latest bar. For example, to look at the 20-bar pattern that ends at the latest bar, set the index to 0 and window_size to 20.

The rest is explained in the following cells.

Note: This is not an API, and this is only meant for practical purposes. Any feedback or comments are welcome.

### Import Libraries
* Pandas: For Dataframes and Data Management
* Annoy: For Vector Similarity Search
* yfinance (Yahoo Finance): For Data Retrieval
* Plotly: For Visualizations

In [6]:
#!pip install yfinance

In [7]:
import pandas as pd

from annoy import AnnoyIndex

#import investpy
import yfinance as yf

import plotly.graph_objects as go
import plotly.express as py
from plotly.subplots import make_subplots

In [136]:
def get_data(ticker: str, interval: str, period: str = "10y"):
    """
    Download OHLC history from Yahoo Finance and derive the per-bar percentage change.

    ticker: str -> Yahoo Finance symbol to download, e.g. "BTC-USD".
    interval: str -> Bar size handed to yfinance.
                     Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    period: str -> How far back to download.
                   Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max

    Return a tuple (df, change_df); both are ordered from the newest bar (row 0)
    to the oldest bar and share a fresh 0..n-1 index:
        df        -> columns time, open, high, low, close, change
        change_df -> columns time, change

    Raise ValueError when the download comes back empty.
    """

    raw = yf.download(  # or pdr.get_data_yahoo(...
        # tickers list or string as well
        tickers=ticker,

        # use "period" instead of start/end
        period=period,

        # fetch data by interval (including intraday if period < 60 days)
        interval=interval,

        # Whether to ignore timezone when aligning ticker data from
        # different timezones. False may be useful for minute/hourly data.
        ignore_tz=False,

        # group by column (field) rather than by ticker
        group_by='column',

        # keep the raw OHLC values instead of the adjusted ones
        auto_adjust=False,

        # do not attempt repair of currency unit mixups e.g. $/cents
        repair=False,

        # skip pre/post regular market hours data
        prepost=False,

        # use threads for mass downloading
        threads=True,

        # no proxy
        proxy=None,
    )

    if raw.empty:
        raise ValueError(
            f"No data returned for ticker {ticker!r} "
            f"(period={period!r}, interval={interval!r})"
        )

    # NOTE(review): some yfinance releases may return (field, ticker) MultiIndex
    # columns even for a single ticker; keep only the field level in that case.
    if isinstance(raw.columns, pd.MultiIndex):
        raw.columns = raw.columns.get_level_values(0)

    # Sort it from newest bar to oldest bar; the old index becomes the first column
    raw = raw.iloc[::-1].reset_index()

    # Pick the price columns by NAME: renaming by position silently mislabels
    # open/high/low/close if the provider returns the columns in another order.
    raw = raw.rename(columns={
        raw.columns[0]: "time",
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
    })
    df = raw[["time", "open", "high", "low", "close"]].copy()
    df.columns.name = None

    # Open-to-close move of every bar, expressed in percent of the open price
    df["change"] = (df["close"] - df["open"]) / df["open"] * 100

    # finalize the change dataframe (independent copy, same 0..n-1 index as df)
    change_df = df[["time", "change"]].copy()

    return df, change_df

In [137]:
# Add previous bars to the most recent bar in the list for the window of window size
def alter_df(change_df, window_size):
    """
    Turn the per-bar change series into one fixed-length vector per bar.

    change_df: DataFrame with a "time" column and a "change" column. Every
               column other than "time" ends up in the vectors.
    window_size: int -> Number of consecutive bars that make up one vector.

    Row r receives the extra columns Candle(1) .. Candle(window_size - 1), which
    hold the "change" value of rows r+1 .. r+window_size-1. Positions that run
    past the last row are filled with 0.

    Return a tuple (embeddings, windowed_df):
        embeddings  -> list of row vectors (lists), without the trailing
                       window_size - 1 rows whose windows are zero-padded.
                       Empty when there are fewer rows than window_size.
        windowed_df -> copy of change_df with the Candle(k) columns added.
                       The DataFrame passed in is left untouched.
    """
    changes = change_df["change"]

    # shift(-k) moves the value of row r+k up to row r, i.e. the k-th following bar
    shifted_columns = {
        f"Candle({k})": changes.shift(-k, fill_value=0)
        for k in range(1, window_size)
    }

    # assign() works on a copy, so the caller's frame is not mutated
    windowed_df = change_df.assign(**shifted_columns)

    # Create a list of embeddings
    all_rows = windowed_df.drop("time", axis=1).values.tolist()

    # Get rid of the zero-padded rows at the end (incomplete windows)
    padded_rows = max(window_size - 1, 0)
    complete_rows = max(len(all_rows) - padded_rows, 0)
    embeddings = all_rows[:complete_rows]

    return embeddings, windowed_df

In [138]:
def get_pattern_df(check_for: int, window_size: int, future: bool = False, source_df=None):

    """
    check_for: int -> Row position of the newest candlestick of the pattern.
    window_size: int -> Look into past 'window_size' bars. If 20, return 20 bars
                        starting at 'check_for' and going towards older rows.
    future: bool -> If True, 2 * window_size look-ahead rows are placed in front of
                    the pattern rows: 'check_for' itself followed by the rows before
                    it (the newer bars, since row 0 is the newest bar).
    source_df: DataFrame to read the candlesticks from. Defaults to the
               module-level data frame df.

    Return the dataframe of candlesticks which belongs to the inputted pattern.

    Row layout of the result:
        future=False -> window_size rows, row 0 is 'check_for'
        future=True  -> 2 * window_size look-ahead rows, then the window_size
                        pattern rows, so row 2 * window_size is 'check_for'

    Look-ahead positions that would fall before row 0 are clamped to row 0.
    A negative position would otherwise make iloc wrap around and return the
    OLDEST bars as if they were future bars. Clamping repeats the newest bar
    instead, which keeps the row layout above intact.

    Raise IndexError when the pattern rows run past the end of the data frame.
    """

    frame = df if source_df is None else source_df

    # Row positions to pull, in output order
    positions = []

    # If the user wants to see future bars as well, future must be True
    if future:
        positions.extend(max(check_for - step, 0) for step in range(window_size * 2))

    # Past window_size bars, starting at the pattern's newest bar
    positions.extend(range(check_for, check_for + window_size))

    return frame.iloc[positions]

In [146]:

def _add_candlestick(fig, pattern_df, row: int, col: int) -> None:
    """Draw pattern_df as a candlestick trace in subplot (row, col) and hide its range slider."""
    fig.add_trace(
        go.Candlestick(
            x=pattern_df['time'],
            open=pattern_df['open'],
            high=pattern_df['high'],
            low=pattern_df['low'],
            close=pattern_df['close'],
        ),
        row=row, col=col,
    )
    fig.update_xaxes(rangeslider={'visible': False}, row=row, col=col)


def visualize_patterns(main: int,
                       window_size: int,
                       closest_n: int,
                       future: bool = False,
                       n_trees: int = 1000) -> None:
    """
    main: The index of the candle stick where it is the newest bar of the input pattern
    window_size: Length of the input pattern.
    closest_n: Number of closest patterns to show next to the input pattern
    future: Set it to true to see future patterns if available
    n_trees: Number of trees handed to AnnoyIndex.build

    With plotly, visualize the price graph through candlesticks.
    'Main' is the first subgraph, and the closest ones to the main graph are other subgraphs.

    Reads the module-level list embeddings and fetches the candlesticks
    through get_pattern_df.
    """

    vector_size = len(embeddings[0])

    index = AnnoyIndex(vector_size, 'manhattan')
    for item_id, vector in enumerate(embeddings):
        index.add_item(item_id, vector)

    index.build(n_trees)

    # The query vector is itself part of the index and is normally returned as a
    # hit. Ask for one extra item, drop 'main', and cap the rest, so that up to
    # closest_n neighbours remain (not closest_n - 1).
    candidates = index.get_nns_by_vector(embeddings[main], closest_n + 1)
    nearest_neighs = [item for item in candidates if item != main][:closest_n]

    # Get the pattern of the inputted chart
    df_pattern_main = get_pattern_df(main, window_size)

    # Two plots per row: one cell for the main pattern plus one per neighbour
    fig = make_subplots(rows=len(nearest_neighs) // 2 + 1, cols=2)

    # Print the main graph in the top-left cell
    _add_candlestick(fig, df_pattern_main, 1, 1)

    # Row of the returned frame that holds the newest bar of the matched pattern:
    # with future=True it follows the 2 * window_size look-ahead rows.
    marker_row = window_size * 2 if future else 0

    # Print the closest charts in the form of subgraphs, filling the grid
    # left-to-right, top-to-bottom (slot 0 is taken by the main graph)
    for slot, neighbour in enumerate(nearest_neighs, start=1):
        row_i = slot // 2 + 1
        col_i = slot % 2 + 1

        df_pattern_next = get_pattern_df(neighbour, window_size, future)
        _add_candlestick(fig, df_pattern_next, row_i, col_i)

        # Vertical line at the newest bar of the matched pattern
        fig.add_vline(x=df_pattern_next["time"].iloc[marker_row], row=row_i, col=col_i)

    fig.update_layout(height=1000, width=1000, xaxis_rangeslider_visible=False)
    fig.show()

In [148]:
# PARAM WINDOW BEGIN ----------------------------

# Number of candlesticks that make up one pattern
window_size = 25

# Number of nearest patterns requested from the index
closest_n = 6
# Row of the newest bar of the input pattern (row 0 is the most recent bar)
input_pattern_index = 0

# PARAM WINDOW END ------------------------------

# Download the price history and its per-bar change series
df, change_df = get_data(ticker="BTC-USD", interval="1D")

# One vector of window_size changes per bar
embeddings, change_df = alter_df(change_df, window_size)

vector_size = len(embeddings[0])

# Module-level index over all vectors
t = AnnoyIndex(vector_size, 'manhattan')
for i, embedding in enumerate(embeddings):
    t.add_item(i, embedding)

t.build(100)

# Plot the input pattern next to its nearest neighbours
visualize_patterns(
    main=input_pattern_index,
    window_size=window_size,
    closest_n=closest_n,
    future=False,
)

# Show the windowed change table as the cell output
change_df


[*********************100%***********************]  1 of 1 completed


Unnamed: 0,time,change,Candle(1),Candle(2),Candle(3),Candle(4),Candle(5),Candle(6),Candle(7),Candle(8),...,Candle(15),Candle(16),Candle(17),Candle(18),Candle(19),Candle(20),Candle(21),Candle(22),Candle(23),Candle(24)
0,2022-11-28 00:00:00+00:00,-1.326075,-0.116958,-0.346788,-0.484438,-0.043174,2.563156,2.581808,-3.093319,-2.519535,...,-2.656933,-1.395147,-3.122056,10.725910,-14.360525,-9.996763,-1.537924,-1.684611,0.651977,4.643830
1,2022-11-27 00:00:00+00:00,-0.116958,-0.346788,-0.484438,-0.043174,2.563156,2.581808,-3.093319,-2.519535,0.091794,...,-1.395147,-3.122056,10.725910,-14.360525,-9.996763,-1.537924,-1.684611,0.651977,4.643830,0.234586
2,2022-11-26 00:00:00+00:00,-0.346788,-0.484438,-0.043174,2.563156,2.581808,-3.093319,-2.519535,0.091794,0.059116,...,-3.122056,10.725910,-14.360525,-9.996763,-1.537924,-1.684611,0.651977,4.643830,0.234586,-1.579142
3,2022-11-25 00:00:00+00:00,-0.484438,-0.043174,2.563156,2.581808,-3.093319,-2.519535,0.091794,0.059116,0.102528,...,10.725910,-14.360525,-9.996763,-1.537924,-1.684611,0.651977,4.643830,0.234586,-1.579142,-0.046963
4,2022-11-24 00:00:00+00:00,-0.043174,2.563156,2.581808,-3.093319,-2.519535,0.091794,0.059116,0.102528,-1.272791,...,-14.360525,-9.996763,-1.537924,-1.684611,0.651977,4.643830,0.234586,-1.579142,-0.046963,-0.668430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2990,2014-09-21 00:00:00+00:00,-2.270110,3.605767,-6.910351,-7.096262,-1.831006,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2991,2014-09-20 00:00:00+00:00,3.605767,-6.910351,-7.096262,-1.831006,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2992,2014-09-19 00:00:00+00:00,-6.910351,-7.096262,-1.831006,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2993,2014-09-18 00:00:00+00:00,-7.096262,-1.831006,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
