# Modeling 
Here, I have tried to use LR to predict Low and high values for a day. Features are one minute stock values from 9:15 to 11:00. 
Right now, we will make a model for only one stock.

In [14]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

import pandas as pd
import numpy as np
import math
import pickle

In [2]:
stock_file_path = r"U:\Study\Durham_AI_Course\Projects\AI Algorithms\dataset\FullDataCsv\TATASTEEL__EQ__NSE__NSE__MINUTE.csv"

stock_df = pd.read_csv(stock_file_path)
stock_df.head()

Unnamed: 0,timestamp,open,high,low,close,volume
0,2017-01-02 09:15:00+05:30,392.05,392.15,390.9,390.9,39465.0
1,2017-01-02 09:16:00+05:30,391.0,391.35,390.5,390.9,19199.0
2,2017-01-02 09:17:00+05:30,390.65,390.9,390.1,390.1,19773.0
3,2017-01-02 09:18:00+05:30,390.0,390.0,389.3,389.75,17499.0
4,2017-01-02 09:19:00+05:30,389.75,390.0,389.55,389.8,13701.0


In [3]:
def get_one_minute_data(file_path, train_steps_size=105, test_steps_size=355):
    stock_df = pd.read_csv(file_path)

    stock_df['timestamp'] = pd.to_datetime(stock_df['timestamp'])
    stock_df_groups = stock_df.groupby(pd.DatetimeIndex(stock_df['timestamp']).normalize())

    one_minute_data = []
    low_high_next_numbers = []

    count = 0

    previous_data = []
    
    for i, item in enumerate(stock_df_groups):
        name, group = item
        
        if int(group.isnull().sum().sum()) > 0:
            count += 1
            continue

        if len(group['close']) < test_steps_size:
            continue
            
        numbers = list(group['close'].iloc[:train_steps_size].values)
        
        scaler = MinMaxScaler()
        new_data = scaler.fit_transform(np.reshape(np.asarray(numbers),(-1,1)).astype(np.float32))
        one_minute_data.append(new_data[:,0])

        next_numbers = group['close'].iloc[train_steps_size:test_steps_size].values.astype(np.float32)

        high = max(next_numbers)
        low = min(next_numbers)
        
        new_low = scaler.transform([[low]]).astype(np.float32)[0]
        new_high = scaler.transform([[high]]).astype(np.float32)[0]

        low_high_next_numbers.append((new_low, new_high, scaler.transform([next_numbers])[0]))

    one_minute_data = np.asarray(one_minute_data).astype(np.float32)
    low_high_next_numbers = np.asarray(low_high_next_numbers)
    return one_minute_data, low_high_next_numbers

# Train a Linear Regression Model

In [4]:
train_steps_size = 105
features = 1
batch_size = 100

In [5]:
one_minute_data, low_high_next_numbers = get_one_minute_data(stock_file_path, train_steps_size=train_steps_size) 

In [6]:
train_x, test_x, train_y, test_y = train_test_split(one_minute_data, low_high_next_numbers, random_state=1)

train_low_values = train_y[:,0].astype(np.float32)
train_high_values = train_y[:,1].astype(np.float32)

test_low_values = test_y[:,0].astype(np.float32)
test_high_values = test_y[:,1].astype(np.float32)

In [7]:
train_x.shape

(662, 105)

In [8]:
test_x.shape

(221, 105)

In [9]:
train_low_values.shape

(662,)

In [10]:
train_high_values.shape

(662,)

In [11]:
low_model = LinearRegression()
low_model.fit(train_x, train_low_values)

pred_low = low_model.predict(test_x)
test_score = math.sqrt(mean_squared_error(test_low_values, pred_low))
print("Test Score: %.2f RMSE" % test_score)

Test Score: 0.69 RMSE


In [12]:
high_model = LinearRegression()
high_model.fit(train_x, train_high_values)

pred_high = high_model.predict(test_x)
test_score = math.sqrt(mean_squared_error(test_high_values, pred_high))
print("Test Score: %.2f RMSE" % test_score)

Test Score: 0.69 RMSE


In [13]:
profit=0.5
stoploss=-0.5
test_next_numbers = low_high_next_numbers[:, 2]

test_samples_size = len(test_x)

pred_y_low = low_model.predict(test_x)
pred_y_high = high_model.predict(test_x)

count = 0
success = 0
fail = 0

for i in range(test_samples_size):
    true_low = test_low_values[i]
    true_high = test_high_values[i]

    predicted_low = pred_y_low[i]
    predicted_high = pred_y_low[i]

    buy_value = predicted_low

    if true_low <= buy_value <= true_high:
        count += 1

        sell_value = buy_value + (buy_value * profit) / 100
        stop_loss_value = buy_value + (buy_value * stoploss) / 100

        bought = False

        for value in test_next_numbers[i]:
            if bought:
                if value <= stop_loss_value:
                    fail += 1
                    break
                elif value >= sell_value:
                    success += 1
                    break
            else:
                if value <= buy_value:
                    bought = True

print(test_samples_size)
print("Possible Trades:", count)
print("Successful Trades:", success)
print("Fail Trades:", fail)

win_ratio = round(success / (success + fail), 4)

print("Win Ratio:", win_ratio)

221
Possible Trades: 90
Successful Trades: 13
Fail Trades: 32
Win Ratio: 0.2889


In [15]:
filename = 'lr_low_model.md'
pickle.dump(low_model, open(filename, 'wb'))
filename = 'lr_high_model.md'
pickle.dump(high_model, open(filename, 'wb'))