In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from math import sqrt
from sklearn.neighbors import DistanceMetric
from sklearn.model_selection import GridSearchCV
import prettytable
from prettytable import PrettyTable

import statsmodels.api as sm

import warnings; warnings.simplefilter('ignore')

from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
from sklearn.feature_selection import SequentialFeatureSelector

In [2]:
# Working directory of the notebook; the data-loading loop further down builds
# its input paths by joining onto this value.
path = os.getcwd()
path

'C:\\Users\\venu\\Desktop\\Stock Market Analysis'

# Pre-process the data

In [3]:
def pre_process_data(data,null_threshold):
    """
    Clean a raw stock dataframe so that only complete, numeric rows remain.

    The steps are applied in this order:

    1. Drop the 'Unixtime' and 'Date' columns.
    2. Drop every column whose null count exceeds ``null_threshold`` percent
       of the number of rows.
    3. Replace +/- infinity with NaN.
    4. Coerce every remaining column to numeric (unparseable values become NaN).
    5. Drop every row that still contains a NaN.

    The input dataframe is never modified; a new dataframe is returned.

    Parameters
    ----------
    data : dataframe
        Raw data; must contain the 'Unixtime' and 'Date' columns.

    null_threshold : numeric
        Maximum percentage (0-100) of null values a column may contain
        before it is dropped.

    Returns
    -------
    data : dataframe
        A new dataframe after performing all the operations.

    Raises
    ------
    KeyError
        If 'Unixtime' or 'Date' is missing from ``data``.
    """
    # Work on a new object so the caller's frame (often a slice of a larger
    # frame) is left untouched.
    cleaned = data.drop(columns=['Unixtime', 'Date'])

    # A column is kept when its null count does not exceed the allowed share.
    max_nulls = null_threshold * cleaned.shape[0] / 100
    keep_mask = (cleaned.isnull().sum() <= max_nulls).to_numpy()
    cleaned = cleaned.loc[:, keep_mask]

    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)
    cleaned = cleaned.apply(pd.to_numeric, errors='coerce')
    return cleaned.dropna(axis=0)


# Necessary Columns

In [4]:
def data_with_necessary_columns(data):
    """
    Restrict ``data`` to the known modelling columns.

    Columns are returned in the order of the whitelist below (not in the
    order they appear in ``data``); whitelisted names that are absent from
    ``data`` are silently skipped.
    """
    necessary_columns = [
        # raw values
        'Dividend Value', '% Return of Company', '% Return of SP500',
        'Next Day Open Price', 'Next Day High Price', 'Next Day Low Price', 'Next Day Close Price',
        # growth rates
        'Open Price GR', 'High Price GR', 'Low Price GR', 'Close Price GR',
        'Next Day Open Price GR', 'Next Day High Price GR', 'Next Day Low Price GR', 'Next Day Close Price GR',
        'WAP GR', 'No.of Shares GR', 'No. of Trades GR', 'Total Turnover (Rs.) GR', 'Deliverable Quantity GR',
        '% Deli. Qty to Traded Qty GR', 'Spread High-Low GR', 'Spread Close-Open GR', 'Alpha GR', 'Beta GR',
        # sequential / windowed increase and decrease percentages
        'Sequential Increase %', 'Sequential Decrease %',
        'Max Inc % in 90 days', 'Max Dec % in 90 days', 'Min Inc % in 90 days', 'Min Dec % in 90 days',
        'Avg Inc % in 90 days', 'Avg Dec % in 90 days',
        'Max Inc % in 180 days', 'Max Dec % in 180 days', 'Min Inc % in 180 days', 'Min Dec % in 180 days',
        'Avg Inc % in 180 days', 'Avg Dec % in 180 days',
        'Max Inc % in 365 days', 'Max Dec % in 365 days', 'Min Inc % in 365 days', 'Min Dec % in 365 days',
        'Avg Inc % in 365 days', 'Avg Dec % in 365 days',
        # fundamentals growth rates
        'Revenue GR', 'Dividend Value GR', 'Income GR', 'Expenditure GR', 'Net Profit GR', 'EPS GR',
        # CP % columns
        'CP % LV 7 days', 'CP % HV 7 days', 'CP % BA 7 days',
        'CP % LV 30 days', 'CP % HV 30 days', 'CP % BA 30 days',
        'CP % LV 90 days', 'CP % HV 90 days', 'CP % BA 90 days',
        'CP % LV 180 days', 'CP % HV 180 days', 'CP % BA 180 days',
        'CP % LV 365 days', 'CP % HV 365 days', 'CP % BA 365 days',
        # lower / upper bands
        'LowerBandInLast1Months', 'UpperBandInLast1Months', 'LowerBandInNext1Months', 'UpperBandInNext1Months',
        'LowerBandInLast3Months', 'UpperBandInLast3Months', 'LowerBandInNext3Months', 'UpperBandInNext3Months',
        'LowerBandInLast6Months', 'UpperBandInLast6Months', 'LowerBandInNext6Months', 'UpperBandInNext6Months',
        'LowerBandInLast9Months', 'UpperBandInLast9Months', 'LowerBandInNext9Months', 'UpperBandInNext9Months',
        'LowerBandInLast12Months', 'UpperBandInLast12Months', 'LowerBandInNext12Months', 'UpperBandInNext12Months',
        'LowerBandInLast24Months', 'UpperBandInLast24Months', 'LowerBandInNext24Months', 'UpperBandInNext24Months',
    ]

    available = set(data.columns)
    selected = [wanted for wanted in necessary_columns if wanted in available]
    return data[selected]

In [5]:
def remove_columns_startswithnext_UBLB(data):
    """Return ``data`` without the columns whose name begins with "next" (case-insensitive)."""
    kept = []
    for col_name in data.columns:
        if col_name.lower().startswith("next"):
            continue
        kept.append(col_name)
    return data[kept]

In [6]:
def remove_ycolumn_columns(data, column):
    """
    Return ``data`` with ``column`` moved to the last position.

    Despite the name, nothing is removed: the column is only relocated.
    Raises ValueError when ``column`` is not present.
    """
    ordered = list(data.columns)
    # pop the first occurrence and re-append it at the end
    ordered.append(ordered.pop(ordered.index(column)))
    return data[ordered]

# Removing columns based on the dependent column

In [7]:
def dependent_column(data,column):
    """
    Keep only the current-day Growth Rate (GR) features plus the target column.

    A feature column is kept when its name does not contain "next" and ends
    with "gr" (both case-insensitive). The target column is then placed last.
    If the target itself matches the GR filter it is not selected twice, so
    the result never contains a duplicated target column.

    Parameters
    ----------
    data : dataframe

    column : string
        name of the target (dependent) column

    Returns
    -------
    data : dataframe
        a dataframe restricted to the selected features followed by ``column``.
    column : string
        name of the target column, returned unchanged
    """
    cols = [
        col for col in data.columns
        if col != column and "next" not in col.lower() and col.lower().endswith("gr")
    ]
    cols.append(column)
    data = data[cols]
    return (data,column)

# For predicting UB, LB percentage columns

In [8]:
def remove_next_columns_UBLB(data):
    """Return ``data`` without any column whose name contains "next" (case-insensitive)."""
    keep = list(filter(lambda label: "next" not in label.lower(), data.columns))
    return data[keep]

In [9]:
def remove_CP_columns_UBLB(data):
    """Return ``data`` without any column whose name contains "cp" (case-insensitive)."""
    keep = []
    for label in data.columns:
        if "cp" in label.lower():
            continue
        keep.append(label)
    return data[keep]

In [10]:
def remove_LowerBand_columns_UBLB(data, column):
    """
    Drop every column whose name contains "lv" or "low" (case-insensitive),
    then place the target ``column`` last.

    The target is excluded from the filter pass so that it appears exactly
    once in the result, even when its own name would have survived the filter.

    Returns
    -------
    (data, column) : tuple
        the reduced dataframe and the unchanged target column name.
    """
    cols = [
        col for col in data.columns
        if col != column and "lv" not in col.lower() and "low" not in col.lower()
    ]
    cols.append(column)
    data = data[cols]
    return (data, column)

In [11]:
def remove_UpperBand_columns_UBLB(data, column):
    """
    Drop every column whose name contains "hv" or "upper" (case-insensitive),
    then place the target ``column`` last.

    The target is excluded from the filter pass so that it appears exactly
    once in the result, even when its own name would have survived the filter.

    Returns
    -------
    (data, column) : tuple
        the reduced dataframe and the unchanged target column name.
    """
    cols = [
        col for col in data.columns
        if col != column and "hv" not in col.lower() and "upper" not in col.lower()
    ]
    cols.append(column)
    data = data[cols]
    return (data, column)

# Feature Selection Methods

In [11]:
def forward_selection_SFS(n_neighbors, n_features_to_select, X, y):
    """
    Pick features with scikit-learn's SequentialFeatureSelector (default
    direction) wrapped around a KNeighborsRegressor.

    Returns the names of the selected columns of ``X`` in their original order.
    """
    selector = SequentialFeatureSelector(
        KNeighborsRegressor(n_neighbors = n_neighbors),
        n_features_to_select = n_features_to_select,
    )
    selector.fit(X, y)

    # get_support() yields one boolean per column of X
    return [name for name, chosen in zip(list(X.columns), selector.get_support()) if chosen]
    

KeyboardInterrupt: 

In [None]:
def backward_selection_SFS(n_neighbors, n_features_to_select, X, y):
    """
    Pick features with scikit-learn's SequentialFeatureSelector run in
    ``direction='backward'`` around a KNeighborsRegressor.

    Returns the names of the retained columns of ``X`` in their original order.
    """
    selector = SequentialFeatureSelector(
        KNeighborsRegressor(n_neighbors = n_neighbors),
        n_features_to_select = n_features_to_select,
        direction = 'backward',
    )
    selector.fit(X, y)

    # get_support() yields one boolean per column of X
    return [name for name, chosen in zip(list(X.columns), selector.get_support()) if chosen]

In [None]:
# df_knn = pd.read_csv(os.path.join(path,"Data\Stock1\\" + "UBLBPASTNEXTgr500112.csv"))
# df1 = pre_process_data(df_knn, 60)
# df1 = data_with_necessary_columns(df1)
# df1 = remove_columns_startswithnext_UBLB(df1)
# column = "LowerBandInNext1Months"
# df1 = remove_ycolumn_columns(df1, column)
# print(len(df1.columns))

# X = df1[df1.columns[:-1]]
# Y = df1[column]


# fs = forward_selection_SFS(3, (df1.shape[1] // 3), X, Y)
# print(fs, len(fs))


In [None]:
# bs = backward_selection_SFS(3, (df1.shape[1] // 3), X, Y)
# print(bs, len(bs))

In [None]:
def forward_selection(data, target, significance_level=0.05):
    """
    Greedy forward feature selection driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with intercept)
    per remaining candidate and adds the candidate with the smallest p-value,
    as long as that p-value is below ``significance_level``.

    Parameters
    ----------
    data : dataframe
        candidate feature columns.
    target : array-like
        dependent variable passed to ``sm.OLS``.
    significance_level : float
        p-value a candidate must beat to be added.

    Returns
    -------
    best_features : list
        selected column names in the order they were added.
    """
    candidate_features = data.columns.tolist()
    best_features = []
    # Stop once every candidate has been selected; otherwise the p-value test
    # below ends the loop.
    while len(best_features) < len(candidate_features):
        # Keep the original column order so ties resolve deterministically
        # (a set difference would give an arbitrary order).
        remaining_features = [f for f in candidate_features if f not in best_features]
        # Explicit float dtype: an index-only Series is object-typed on
        # pandas 2.x, which breaks or slows min()/idxmin().
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            design = sm.add_constant(data[best_features + [new_column]]).astype(float)
            new_pval[new_column] = sm.OLS(target, design).fit().pvalues[new_column]
        # An all-NaN round yields NaN here; the comparison is False and we stop.
        if new_pval.min() < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [None]:
def backward_elimination(data, target,significance_level = 0.05):
    """
    Backward feature elimination driven by OLS p-values.

    Starting from all columns of ``data``, repeatedly fits an OLS model (with
    intercept) and removes the feature with the largest p-value until every
    remaining feature has a p-value below ``significance_level``.

    Parameters
    ----------
    data : dataframe
        candidate feature columns.
    target : array-like
        dependent variable passed to ``sm.OLS``.
    significance_level : float
        features with a p-value at or above this level are eliminated.

    Returns
    -------
    features : list
        names of the surviving columns.
    """
    features = data.columns.tolist()
    while(len(features)>0):
        features_with_constant = sm.add_constant(data[features]).astype(float)
        # Drop the intercept by label rather than by position: add_constant
        # does not add a "const" column when the data already contains a
        # constant column, in which case slicing off the first p-value would
        # silently exempt a real feature from elimination.
        p_values = sm.OLS(target, features_with_constant).fit().pvalues.drop(labels="const", errors="ignore")
        max_p_value = p_values.max()
        if(max_p_value >= significance_level):
            features.remove(p_values.idxmax())
        else:
            break
    return features

# KNN Model

In [None]:
def best_parameters(X,Y):
    """
    Two-stage grid search for KNeighborsRegressor hyper-parameters.

    Stage 1 scans ``n_neighbors`` coarsely (1, 6, 11, ... 101) on all of
    ``X``/``Y``. Stage 2 scans the ten integers around the stage-1 winner on a
    random 70% training split. Both stages also search ``weights`` and
    ``metric``.

    NOTE(review): stage 1 is fitted on the full data, including the rows that
    ``k_nearest_neighbours`` later holds out as its test set, so the reported
    test scores are optimistic. Confirm whether this is acceptable.

    Returns
    -------
    dict
        ``best_params_`` of the stage-2 search
        (keys: 'metric', 'n_neighbors', 'weights').
    """
    search_space = {'weights':['uniform', 'distance'], 'metric':['euclidean', 'manhattan']}

    coarse = GridSearchCV(KNeighborsRegressor(), dict(search_space, n_neighbors=np.arange(1,105,5)))
    coarse.fit(X,Y)
    k = coarse.best_params_['n_neighbors']

    # n_neighbors must be >= 1. Without the clamp a small k (e.g. 1) puts zero
    # and negative values into the grid, and every one of those fits fails.
    fine_neighbors = np.arange(max(1, k-5), k+5)
    fine = GridSearchCV(KNeighborsRegressor(), dict(search_space, n_neighbors=fine_neighbors))
    X_train, _, y_train, _ = train_test_split(X, Y, test_size=0.3, random_state=0)
    fine.fit(X_train,y_train)
    return fine.best_params_

In [None]:
def k_nearest_neighbours(X,Y, method, value, name, column, output_dir=None):
    """
    Tune, fit and evaluate a KNeighborsRegressor, and save its predictions.

    Hyper-parameters come from ``best_parameters(X, Y)``. The first 30% of the
    rows form the test set and the remaining 70% the training set (positional
    split, no shuffling). No feature scaling is applied.

    The predicted and actual test values are written to
    ``<output_dir>/<code>_KNN_Regression_FI_<method>_<column>_<value>.csv``
    where ``<code>`` is ``name[14:20]``.

    Parameters
    ----------
    X : dataframe
        feature columns.
    Y : series
        target values, aligned with ``X``.
    method : string
        feature-selection label used in the output file name.
    value : any
        extra label appended (via ``str``) to the output file name.
    name : string
        source file name; characters 14-19 are used as the company code.
    column : string
        name of the target column, used in the output file name.
    output_dir : string, optional
        directory for the predictions CSV. Defaults to
        ``<current working directory>/Data/NextUBLB_Results`` and is created
        if it does not exist.

    Returns
    -------
    dict
        'RMSE', 'MAE', 'MSE', the tuned hyper-parameters, and 'r2_score'
        (in that insertion order).
    """
    params = best_parameters(X,Y)

    # Positional hold-out: the leading 30% of rows are the test set.
    split_at = int(X.shape[0] * 0.3)
    X_test, X_train = X[ : split_at], X[split_at : ]
    y_test, y_train = Y[ : split_at], Y[split_at : ]

    knn = KNeighborsRegressor(**params)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)

    pred_actual = pd.DataFrame(list(zip(y_pred, y_test)),
                   columns =['Predicted Values', 'Actual Values'])

    # Resolve the output location relative to the working directory instead of
    # a user-specific absolute path, so the notebook runs on other machines.
    if output_dir is None:
        output_dir = os.path.join(os.getcwd(), "Data", "NextUBLB_Results")
    os.makedirs(output_dir, exist_ok=True)
    file_name = name[14 : 20] + "_KNN_Regression_" + "FI_" + method + "_" + column + "_" + str(value) + ".csv"
    pred_actual.to_csv(os.path.join(output_dir, file_name), index = False)

    myres =  {"RMSE":rmse,"MAE":mae,"MSE":mse}
    myres.update(params)
    myres.update({"r2_score" : r2})

    return myres

# Finding results from each set of important features

In [None]:
def fit_KNN(df, column, method, value, name, results):
    """
    Fit and evaluate a KNN model on ``df`` and record the outcome.

    Every column of ``df`` except the last is used as a feature; ``column`` is
    the target. The metrics are stored in ``results`` under the key
    ``"KNN_Regression_FI_<method>_<value>"`` and a row is added to the
    "KNN_Regression" pretty table.

    Parameters
    ----------
    df : dataframe
        feature columns followed by the target column as the last column.
    column : string
        name of the target column.
    method : string
        feature-selection label.
    value : any
        extra label; converted with ``str`` wherever it is concatenated.
    name : string
        source file name, forwarded to the model and table helpers.
    results : dict
        mutated in place with the metrics of this run.
    """
    print("KNN Model fitted using columns obtained from feature importance using " + method + " : ")
    X = df[df.columns[:-1]]
    Y = df[column]

    model_result = k_nearest_neighbours(X,Y, method, value, name, column)

    print("R2_Score ----> ", model_result["r2_score"])

    result_key = "KNN_Regression_FI_" + method + "_" + str(value)
    recorded_fields = ('r2_score', 'RMSE', 'MAE', 'MSE', 'n_neighbors', 'weights', 'metric')
    results[result_key] = {field: model_result[field] for field in recorded_fields}

    # str(value) keeps this consistent with the result key above and avoids a
    # TypeError when a non-string label (e.g. an int) is passed.
    create_pretty_table(name , "KNN_Regression", method + " " + str(value), model_result, column)

In [None]:
def get_results_from_FI_ForwardSelection(df1, name, column, results):
    """
    Select a third of the columns by sequential forward selection (3-NN),
    then fit and record a KNN model on that subset.

    ``df1`` must have the target ``column`` as its last column; ``results`` is
    filled in by ``fit_KNN``. Nothing is fitted when no feature is selected.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using Forward Selection Method")
    print(banner)

    feature_frame = df1[df1.columns[:-1]]
    target_series = df1[column]
    n_to_select = df1.shape[1] // 3
    forward_features = forward_selection_SFS(3, n_to_select, feature_frame, target_series)

    print("Features obtained from Forward Selection method : ") 
    print("--------------------------------------")
    print(forward_features)

    if forward_features:
        # target goes last, as fit_KNN expects
        df_fs = df1[forward_features + [column]]
        fit_KNN(df_fs, column, "ForwardSelection", '', name, results)
    print(banner)

In [None]:
def get_results_from_FI_BackwardElimination(df1, name, column, results):
    """
    Keep a third of the columns by sequential backward selection (3-NN),
    then fit and record a KNN model on that subset.

    ``df1`` must have the target ``column`` as its last column; ``results`` is
    filled in by ``fit_KNN``. Nothing is fitted when no feature is retained.
    """
    banner = "*****************************************************************************************"
    print("Features Importance using Backward Elimination Method")
    print(banner)

    feature_frame = df1[df1.columns[:-1]]
    target_series = df1[column]
    n_to_select = df1.shape[1] // 3
    backward_features = backward_selection_SFS(3, n_to_select, feature_frame, target_series)

    print("Features obtained from Backward Elimination method : ") 
    print("--------------------------------------")
    print(backward_features)

    if backward_features:
        # target goes last, as fit_KNN expects
        df_be = df1[backward_features + [column]]
        fit_KNN(df_be, column, "BackwardElimination", '', name, results)
    print(banner)

In [None]:
def get_results_from_FI_AllFeatures(df1, name, column, results):
    """
    Fit and record a KNN model that uses every feature column of ``df1``.

    ``df1`` must have the target ``column`` as its last column; ``results`` is
    filled in by ``fit_KNN``. Nothing is fitted when ``df1`` has no feature
    columns.
    """
    banner = "*****************************************************************************************"
    print("All Features are considered : ")
    print(banner)

    all_features = list(df1[df1.columns[:-1]].columns)

    print("All Features are --->>") 
    print("--------------------------------------")
    print(all_features)

    if all_features:
        # target goes last, as fit_KNN expects
        df_all = df1[all_features + [column]]
        fit_KNN(df_all, column, "AllFeaturesConsideration", '', name, results)
    print(banner)

In [None]:
def get_results_from_each_set(data, name, final_df, column):
    """
    Run every feature-set experiment for one target column and report the best.

    The data is pre-processed (columns with more than 60% nulls dropped),
    reduced to the known modelling columns, stripped of columns starting with
    "next", and reordered so that ``column`` is last. KNN models are then
    fitted on all features and on the forward-selected features; the run with
    the highest r2_score is appended to ``final_df``.

    Parameters
    ----------
    data : dataframe
        raw data for one company (must contain 'Unixtime' and 'Date').
    name : string
        source file name; ``name[14:20]`` is the company code looked up in the
        module-level ``companies`` dict.
    final_df : dataframe
        summary frame to extend; it is not modified in place.
    column : string
        name of the target column.

    Returns
    -------
    dataframe
        ``final_df`` with one additional row describing the best run.
    """
    df = pre_process_data(data, 60)

    df1 = data_with_necessary_columns(df)
    df1 = remove_columns_startswithnext_UBLB(df1)
    df1 = remove_ycolumn_columns(df1, column)

    results = {}

    get_results_from_FI_AllFeatures(df1, name, column, results)
    get_results_from_FI_ForwardSelection(df1, name, column, results)
#     get_results_from_FI_BackwardElimination(df1, name, column, results)

    # Ascending sort by r2_score: the last entry is the best run.
    best_method, best_metrics = sorted(results.items(), key=lambda item: item[1]['r2_score'])[-1]
    company_code = name[14 : 20]
    max_row = {'Company' : company_code + "-" + companies[company_code], 'Y-Column' : column, 'Model' : 'KNN-Regression',
               'Method' : best_method,
               'RMSE' : best_metrics['RMSE'], 'MAE' : best_metrics['MAE'], 'MSE' : best_metrics['MSE'],
               'n_neighbors' : best_metrics['n_neighbors'], 'weights' : best_metrics['weights'],
               'metric' : best_metrics['metric'], 'r2_score' : best_metrics['r2_score']}
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    final_df = pd.concat([final_df, pd.DataFrame([max_row])], ignore_index = True)
    print("Maximum correct direction values are obtained for {} with r2_score of {}.".format(best_method, best_metrics['r2_score']))
    return final_df

# Process of getting results

In [None]:
def create_pretty_table(name, model, method, result, column):
    """
    Add one result row to the module-level pretty table of ``model``.

    The row starts with "<code>-<company name>" (``name[14:20]`` looked up in
    the module-level ``companies`` dict), ``column`` and ``method``. The
    remaining cells are taken from ``result`` by matching the table's field
    names, so each value lands under its own header regardless of the key
    order of ``result`` (GridSearchCV returns its best parameters sorted
    alphabetically, which differs from the header order). Non-string values
    are rounded to 6 decimals.

    If ``result`` does not contain every remaining field name, the values are
    used in their dict order as a fallback.
    """
    table = tables[model]
    company_code = name[14 : 20]

    # Field names after the three leading label columns.
    metric_fields = table.field_names[3:]
    if all(field in result for field in metric_fields):
        raw_values = [result[field] for field in metric_fields]
    else:
        raw_values = list(result.values())

    values = [company_code + "-" + companies[company_code], column, method]
    values += [v if isinstance(v, str) else round(v, 6) for v in raw_values]
    table.add_row(values)
    table.title = model

In [None]:
# Header row assigned to each per-model PrettyTable report further down.
columns = ['Company','Y-Column','Method', 'RMSE','MAE','MSE','n_neighbors','weights','metric', 'r2_score']

In [None]:
# Map security code (as a string) -> company name, read from Equity.csv under
# the notebook's working directory (`path`) rather than a user-specific
# absolute path.
df_equity = pd.read_csv(os.path.join(path, "Data", "Equity.csv"))
security_numbers = df_equity["Security Code"].tolist()
security_names = df_equity["Security Name"].tolist()
companies = {str(code) : company for (code, company) in zip(security_numbers, security_names)}
# Manual entries set explicitly (they add or override whatever Equity.csv provides).
companies["542602"] = "Embassy Office Parks REIT"
companies["500112"] = "State Bank of India"
# companies["542602"]

In [None]:
# One PrettyTable per model, each using the shared header row in `columns`.
models = ["KNN_Regression"]
tables = {}
for name in models:
    table = PrettyTable()
    table.field_names = columns
    tables[name] = table

In [None]:
# Column layout of the summary dataframe `final_df` created below.
final_columns = ['Company','Y-Column','Model','Method', 'RMSE','MAE','MSE','n_neighbors','weights','metric', 'r2_score']

In [None]:
# final_df = pd.read_csv("C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\UBLB_Results\\Results_KNN.csv")
# final_df

In [None]:
# Start from an empty summary frame; the driver loop below adds result rows to it.
final_df = pd.DataFrame(columns = final_columns)
final_df

In [None]:
%%time

# Driver: for every selected company file, evaluate KNN models for each
# LowerBand/UpperBand "Next N Months" target and collect the best run per target.

# Company codes to process; files for any other code are skipped.
comp = ["500023", "500027", "500028", "500112"]

y_columns = ["LowerBandInNext1Months", "LowerBandInNext3Months", "LowerBandInNext6Months",
            "LowerBandInNext9Months", "LowerBandInNext12Months", "LowerBandInNext24Months",
             "UpperBandInNext1Months", "UpperBandInNext3Months", "UpperBandInNext6Months",
             "UpperBandInNext9Months", "UpperBandInNext12Months", "UpperBandInNext24Months"]

# Build paths with os.path.join so the notebook is not tied to Windows separators.
stock_dir = os.path.join(path, "Data", "Stock1")
result_frames = []

for filename in os.listdir(stock_dir):
    if not filename.startswith("UBLBPASTNEXTgr"):
        continue

    # The helpers read the company code from characters 14-19 of the file name.
    name = filename
    stock = name[14 : 20]
    print(stock)
    if(stock not in comp):
        print("continue")
        continue

    # Read and parse the file once per company instead of once per target column.
    df_knn = pd.read_csv(os.path.join(stock_dir, filename))
    df_knn["Date"] = pd.to_datetime(df_knn["Date"])
    head_date = df_knn.loc[0, "Date"]
    tail_date = df_knn.loc[df_knn.shape[0] - 1, "Date"]

    fd_df = pd.DataFrame(columns = final_df.columns)

    for col in y_columns:
        # Horizon in months, taken from the column name; longer numbers are
        # tested first so that "12"/"24" are not mistaken for shorter horizons.
        value = next((months for months in (24, 12, 9, 6, 3) if str(months) in col), 1)

        # Trim `value` months (30 days each) from both ends of the date range.
        horizon = timedelta(days = value * 30)
        end_date = head_date - horizon
        start_date = tail_date + horizon
        mask = (df_knn["Date"] >= start_date) & (df_knn["Date"] <= end_date)
        # Explicit copy so downstream cleaning can never touch df_knn.
        df_kn = df_knn.loc[mask].copy()

        print("For stock : ", stock, "and Y-Column is :", col)
        print("#################################################################################################################")
        result_frames.append(get_results_from_each_set(df_kn, name, fd_df, col))
        print("#################################################################################################################")

# DataFrame.append was removed in pandas 2.0; concatenate all collected rows once.
final_df = pd.concat([final_df] + result_frames, ignore_index = True)
final_df = final_df.sort_values(by = ['Company', 'Y-Column','r2_score'], ascending = [True, True, False])
final_df.to_csv(os.path.join(path, "Data", "NextUBLB_Results", "Results_KNN_30.csv"), index = False)

In [None]:
# Print the accumulated per-model result tables.
for name,table in tables.items():
    print(table)

In [None]:
# Display the summary dataframe produced by the driver loop.
final_df

In [None]:
# final_df = final_df.sort_values(by = ['Company', 'Y-Column','r2_score'], ascending = [True, True, False])
# final_df.to_csv('C:\\Users\\venu\\Desktop\\Stock Market Analysis\\Data\\NextUBLB_Results\\Results_KNN_30.csv', index = False) 