In [1]:
import yfinance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# **Reading the CSV files**

In [2]:
# Read the CSV file
filenames = ['sensex_1990_to_2014.csv',
             'nifty_1994_to_2014.csv',
             'nifty_2000_to_2024.csv']

file_name = 'nifty_1994_to_2014.csv'
#file_name = 'nifty_1994_to_2014.csv'
#file_name = 'nifty_2000_to_2024.csv'

data = pd.read_csv(file_name)
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2007-09-17,4518.450195,4549.049805,4482.850098,4494.649902,4494.649902,0
1,2007-09-18,4494.100098,4551.799805,4481.549805,4546.200195,4546.200195,0
2,2007-09-19,4550.25,4739.0,4550.25,4732.350098,4732.350098,0
3,2007-09-20,4734.850098,4760.850098,4721.149902,4747.549805,4747.549805,0
4,2007-09-21,4752.950195,4855.700195,4733.700195,4837.549805,4837.549805,0


In [3]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1700.0,1700.0,1700.0,1700.0,1700.0,1700.0
mean,5291.749764,5336.862885,5241.120997,5290.119381,5290.119381,38655.058824
std,996.396802,989.287755,1003.653913,996.116806,996.116806,74383.681317
min,2553.600098,2585.300049,2252.75,2524.199951,2524.199951,0.0
25%,4882.237305,4935.449829,4835.637451,4887.637451,4887.637451,0.0
50%,5342.224854,5382.899902,5297.350098,5334.349854,5334.349854,0.0
75%,5888.749878,5923.625122,5844.474976,5889.887451,5889.887451,0.0
max,7942.25,7968.25,7939.200195,7954.350098,7954.350098,437000.0


# **Calculate the Technical Indicator**

In [4]:
# Define the parameters
sma_window_size = 3
ema_smoothing_param = 0.2
williams_r_window_size = 3
RSI_window_size = 14
ATR_window_size = 14
VR_window_size = 14

k_param = 10                 # Defining the k parameter for KNN algorithm
training_set_ratio = 0.9     # Defining the training set for KNN algorithm
enable_knn_normalization = True

In [5]:
def TI_calculations(data):
    data = data[['Close']]
    data['returns'] = -data['Close'].diff(-1)
    
    # Remove non-numeric values
    data = data.apply(pd.to_numeric, errors='coerce')
    data.dropna(inplace = True)
    return data

data = TI_calculations(data)
data.tail()

KeyError: "['Date', 'Open', 'High', 'Low'] not found in axis"

# **KNN Search with euclidean distance metrics**

In [None]:
train_set_length = int(training_set_ratio*data.shape[0])     # 90% of the data is used for training

**Algorithm**

1) The data is normalized first.Normalization ensure that the distance between two point for attribute 1 is in the same scale as the distance between any other two points.The distances only will be in the range 0 to 1.
2) The euclidean distances of the current day price,TIs(query) are calculated with respect to all the element training set.
3) The whole dataset is then sorted based on this training set in ascending order and then the first 'k' row are returned

In [None]:
def knn_search(data,query,k,enable_normalization = enable_knn_normalization):
    
    # Scaling the data
    if enable_normalization:
        query = (query - data.min())/(data.max() - data.min())
        temp_data = (data - data.min())/(data.max() - data.min())
    else:
        temp_data = data
        
    neighbors_model = NearestNeighbors(n_neighbors=k, metric='euclidean')
    neighbors_model.fit(temp_data)
    query = query.values.reshape(1,-1) 
    distances, indices = neighbors_model.kneighbors(query)
    
    return data.iloc[indices[0]]

sorted_data = knn_search(data,data.iloc[-1,:],10)

In [None]:
sorted_data

In [None]:
estimated_price_dictionary = {}
actual_price_dictionary = {}
for file in filenames:
    data = pd.read_csv(file)
    data = TI_calculations(data)
    
    actual_close_price = []
    estimated_close_price = []
    for i in tqdm(range(data.shape[0]-train_set_length-1),desc = f"Processing {file} : "):
        data_for_search = data.iloc[i:i+train_set_length,:]
        query = data.iloc[i+train_set_length,:]
    
        sorted_data = knn_search(data_for_search,query,k_param)
        estimated_return = sorted_data['returns'].mean()
    
        estimated_close_price.append(query['Close'] + estimated_return)
        actual_close_price.append(data.iloc[i+train_set_length+1,:]['Close'])

    # Error metrics
    estimated_close_price = np.array(estimated_close_price)
    actual_close_price = np.array(actual_close_price)

    estimated_price_dictionary[file] = estimated_close_price
    actual_price_dictionary[file] = actual_close_price

# **Visualization**

In [None]:
plt.figure(figsize = (12,5))

plt.subplot(131)
plt.title(f"File : {filenames[0]}")
plt.plot(actual_price_dictionary[filenames[0]])
plt.plot(estimated_price_dictionary[filenames[0]])
plt.legend(['actual_close_price','estimated_close_price'])

plt.subplot(132)
plt.title(f"File : {filenames[1]}")
plt.plot(actual_price_dictionary[filenames[1]])
plt.plot(estimated_price_dictionary[filenames[1]])
plt.legend(['actual_close_price','estimated_close_price'])

plt.subplot(133)
plt.title(f"File : {filenames[2]}")
plt.plot(actual_price_dictionary[filenames[2]])
plt.plot(estimated_price_dictionary[filenames[2]])
plt.legend(['actual_close_price','estimated_close_price'])

plt.show()

In [None]:
plt.figure(figsize = (12,5))

plt.subplot(131)
plt.title(f"File : {filenames[0]}")
plt.plot(actual_price_dictionary[filenames[0]] - estimated_price_dictionary[filenames[0]])

plt.subplot(132)
plt.title(f"File : {filenames[1]}")
plt.plot(actual_price_dictionary[filenames[1]] - estimated_price_dictionary[filenames[1]])

plt.subplot(133)
plt.title(f"File : {filenames[2]}")
plt.plot(actual_price_dictionary[filenames[2]] - estimated_price_dictionary[filenames[2]])

plt.show()

In [None]:
for file in filenames:
    error = actual_price_dictionary[file] - estimated_price_dictionary[file]
    print(f"The rmse for {file} with enable_knn_normalization as {enable_knn_normalization} is {np.sqrt(np.mean(error**2))} <br>")

# **Result**

The rmse for sensex_1990_to_2014.csv with enable_knn_normalization as True is 155.16112961199286 <br>
The rmse for nifty_1994_to_2014.csv with enable_knn_normalization as True is 52.510672096852375 <br>
The rmse for nifty_2000_to_2024.csv with enable_knn_normalization as True is 80.39006232530932 <br>

The rmse for sensex_1990_to_2014.csv with enable_knn_normalization as False is 216.03921169398518 <br>
The rmse for nifty_1994_to_2014.csv with enable_knn_normalization as False is 58.886214070856674 <br>
The rmse for nifty_2000_to_2024.csv with enable_knn_normalization as False is 71.89880566902163 <br>

**Only Close price**

The rmse for sensex_1990_to_2014.csv with enable_knn_normalization as True is 69.56409629760876 <br>
The rmse for nifty_1994_to_2014.csv with enable_knn_normalization as True is 38.41585383388505 <br>
The rmse for nifty_2000_to_2024.csv with enable_knn_normalization as True is 50.3351461228365 <br>