In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stocks/gr500182.csv
/kaggle/input/stocks/gr500209.csv
/kaggle/input/stocks/gr500680.csv
/kaggle/input/stocks/gr530965.csv
/kaggle/input/stocks/gr532174.csv
/kaggle/input/stocks/gr532210.csv
/kaggle/input/stocks/gr532540.csv
/kaggle/input/stocks/gr500325.csv
/kaggle/input/stocks/gr500112.csv
/kaggle/input/stocks/gr500180.csv
/kaggle/input/stocks/gr507685.csv


In [18]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from math import sqrt
from sklearn.neighbors import DistanceMetric
from sklearn.model_selection import GridSearchCV
import prettytable


In [8]:
def pre_process_data(data,null_threshold):
    """
    Drops Date and Unix Date columns from the data.
    Drops the columns which has null values more than specified null_threshold.
    Replaces infinite values with NAN.
    Drops the rows which has null values.

    Parameters
    ----------
    data : dataframe

    null_threshold : numeric
        numeric value describing the amount of null values that can be present.

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    """
    
    data.drop(columns=['Unix Date','Date'],axis=1,inplace=True)
    total = data.shape[0]
    for col in data.columns:
        if null_threshold * total / 100 < data[col].isnull().sum():
            data.drop(columns=[col],axis=1,inplace=True)
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data = data.apply(pd.to_numeric,errors='coerce')
    data.dropna(axis=0,inplace=True)
    return data

In [9]:
def dependent_column(data,column):
    """
    Removes all the Next Day columns.
    Removes all the non Growth Rate Columns (GR)
    add the predictor column to list of columns.

    Parameters
    ----------
    data : dataframe

    column : string
        name of the predictor column 

    Returns
    -------
    data : dataframe
        an updated dataframe after performing all the opertaions.
    column : string
        name of the predictor column
    """
    cols = [col for col in data.columns if "next" not in col.lower() and col.lower().endswith("gr")]
    cols.append(column)
    data = data[cols]
    return (data,column)

In [56]:
def best_parameters(X,Y):
    params = {'n_neighbors':np.arange(1,105,5), 'weights':['uniform', 'distance'], 'metric':['euclidean', 'manhattan']}
    knn = KNeighborsRegressor()    
    model = GridSearchCV(knn, params)
    model.fit(X,Y)
    k = model.best_params_['n_neighbors']
    params = {'n_neighbors':np.arange(k-5, k+5), 'weights':['uniform', 'distance'], 'metric':['euclidean', 'manhattan']}
    knn = KNeighborsRegressor()
    model = GridSearchCV(knn, params)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    model.fit(X_train,y_train)
    return model.best_params_

In [78]:
def k_nearest_neighbours(X,Y):
    params = best_parameters(X,Y)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
    knn = KNeighborsRegressor(**params)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    rmse = sqrt(metrics.mean_squared_error(y_test, y_pred))
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)
    c = 0
    for a,b in zip(y_test, y_pred):
        if a*b >= 0:
            c += 1
    direction = c/len(y_test)
    myres =  {"root_mean_squared_error":rmse,"mean_absolute_error":mae,"mean_squared_error":mse,"rsquared_adj":r2}
    myres.update(params)
    myres.update({"direction":direction})
    return myres

In [79]:
%%time
columns =['security id','metric','n_neighbors','weights','root_mean_squared_error','mean_absolute_error','mean_squared_error','rsquared_adj','direction']
mydf = pd.DataFrame(columns=columns)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        df = pd.read_csv(filepath)
        df = pre_process_data(df,60)
        column = "Next Day Close Price GR"
        (df,column) = dependent_column(df,column)
        X = df.drop(columns=[column])
        Y = df[column]
        result = k_nearest_neighbours(X,Y)
        result.update({"security id":filename[2:8]})
        mydf = mydf.append(result,ignore_index=True)
mydf.to_csv(os.path.join(os.getcwd(),"best_knn"+".csv"),index=None)

{'metric': 'manhattan', 'n_neighbors': 74, 'weights': 'uniform'}
{'metric': 'manhattan', 'n_neighbors': 105, 'weights': 'uniform'}
{'metric': 'manhattan', 'n_neighbors': 104, 'weights': 'uniform'}
{'metric': 'euclidean', 'n_neighbors': 104, 'weights': 'uniform'}
{'metric': 'euclidean', 'n_neighbors': 88, 'weights': 'uniform'}
{'metric': 'euclidean', 'n_neighbors': 103, 'weights': 'uniform'}
{'metric': 'manhattan', 'n_neighbors': 96, 'weights': 'distance'}
{'metric': 'manhattan', 'n_neighbors': 95, 'weights': 'uniform'}
{'metric': 'manhattan', 'n_neighbors': 85, 'weights': 'uniform'}
{'metric': 'manhattan', 'n_neighbors': 100, 'weights': 'uniform'}
{'metric': 'manhattan', 'n_neighbors': 102, 'weights': 'uniform'}
CPU times: user 4min 44s, sys: 433 ms, total: 4min 44s
Wall time: 4min 46s


In [83]:
mydf

Unnamed: 0,security id,metric,n_neighbors,weights,root_mean_squared_error,mean_absolute_error,mean_squared_error,rsquared_adj,direction
0,500182,manhattan,74,uniform,0.019721,0.013868,0.000389,-0.009685,0.493333
1,500209,manhattan,105,uniform,0.019099,0.013027,0.000365,-0.020573,0.506118
2,500680,manhattan,104,uniform,0.017209,0.011848,0.000296,-0.009638,0.507955
3,530965,euclidean,104,uniform,0.056031,0.01881,0.003139,0.002429,0.505028
4,532174,euclidean,88,uniform,0.137321,0.023119,0.018857,-0.002416,0.513363
5,532210,euclidean,103,uniform,0.023356,0.015631,0.000546,-0.011095,0.53341
6,532540,manhattan,96,distance,0.053327,0.015852,0.002844,-0.009404,0.516827
7,500325,manhattan,95,uniform,0.021025,0.01496,0.000442,-0.018113,0.481646
8,500112,manhattan,85,uniform,0.292628,0.027649,0.085631,-0.000589,0.482183
9,500180,manhattan,100,uniform,0.033798,0.013469,0.001142,0.002034,0.512821


In [89]:
table = prettytable.PrettyTable()
table.title = "best knn"
table.field_names = mydf.columns
for _,row in mydf.iterrows():
    row = [round(r,6) if isinstance(r,(float,int)) else r for r in row]
    table.add_row(row)
print(table)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                  best knn                                                                                 |
+-------------+-----------+-------------+----------+-------------------------+----------------------+------------------------+------------------------+---------------------+
| security id |   metric  | n_neighbors | weights  | root_mean_squared_error | mean_absolute_error  |   mean_squared_error   |      rsquared_adj      |      direction      |
+-------------+-----------+-------------+----------+-------------------------+----------------------+------------------------+------------------------+---------------------+
|    500182   | manhattan |      74     | uniform  |   0.01972119720137882   | 0.013867539013617439 | 0.0003889256190556718  | -0.