# Predicting Brazilian Stock Market

## Libraries Used

In [1]:
from datetime import datetime as dt
import pandas as pd
from pandas_datareader import data as pdr
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Reading the Dataframe

In [2]:
BRstocks = pd.read_csv('C:/Users/pablo/Alura/Portifolio/BRstocks.csv')
BRstocks.index = BRstocks['Date']
BRstocks = BRstocks.drop(['Date'], axis=1)
BRstocks

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-10-28,19.760000,18.920000,19.299999,19.200001,6342600.0,AALR3
2016-10-31,19.200001,17.510000,19.190001,18.059999,2523300.0,AALR3
2016-11-01,18.379999,17.160000,18.059999,17.900000,996200.0,AALR3
2016-11-03,18.240000,17.309999,18.000000,17.990000,621000.0,AALR3
2016-11-04,18.120001,17.709999,17.950001,17.750000,389800.0,AALR3
...,...,...,...,...,...,...
2021-11-29,21.400000,20.850000,21.200001,20.920000,1620800.0,YDUQ3
2021-11-30,22.280001,20.660000,21.400000,21.830000,4438500.0,YDUQ3
2021-12-01,23.040001,21.830000,21.959999,21.889999,3950300.0,YDUQ3
2021-12-02,23.260000,21.709999,22.219999,23.160000,3929400.0,YDUQ3


## Upadating the dataframe

### Selecting the time

In [3]:
end = dt.now()
start = BRstocks.index.max()
start = dt.strptime(start, '%Y-%m-%d')
start

datetime.datetime(2021, 12, 3, 0, 0)

In [4]:
stocks_names = pd.read_excel('../Dados/lala.xlsx')
stock_list = stocks_names['Papel']
names = stock_list.dropna().apply(lambda ticker: ticker + '.SA')

names

0      AALR3.SA
1      ABCB4.SA
2      ABEV3.SA
3      ADHM3.SA
4      AERI3.SA
         ...   
473    WHRL4.SA
474    WIZS3.SA
475    WLMM3.SA
476    WLMM4.SA
477    YDUQ3.SA
Name: Papel, Length: 474, dtype: object

### Actually updating the BRstocks

In [5]:
for name in names:
    api_answer = pdr.get_data_yahoo(name, start, end)
    api_answer['Stock'] = name[0:-3]
    api_answer = api_answer.drop(['Adj Close'], axis=1)
    BRstocks = BRstocks.append(api_answer, False)

In [6]:
BRstocks

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-10-28,19.760000,18.920000,19.299999,19.200001,6342600.0,AALR3
2016-10-31,19.200001,17.510000,19.190001,18.059999,2523300.0,AALR3
2016-11-01,18.379999,17.160000,18.059999,17.900000,996200.0,AALR3
2016-11-03,18.240000,17.309999,18.000000,17.990000,621000.0,AALR3
2016-11-04,18.120001,17.709999,17.950001,17.750000,389800.0,AALR3
...,...,...,...,...,...,...
2021-12-07 00:00:00,24.740000,23.600000,24.629999,23.690001,2497900.0,YDUQ3
2021-12-08 00:00:00,24.639999,23.520000,23.959999,24.410000,6492500.0,YDUQ3
2021-12-09 00:00:00,24.320000,23.389999,23.980000,23.510000,1720300.0,YDUQ3
2021-12-10 00:00:00,24.600000,23.750000,23.830000,24.040001,1651200.0,YDUQ3


In [7]:
BRstocks.corr()

Unnamed: 0,High,Low,Open,Close,Volume
High,1.0,0.999828,0.99986,0.99987,-0.001216
Low,0.999828,1.0,0.999976,0.999974,-0.001213
Open,0.99986,0.999976,1.0,0.999964,-0.001214
Close,0.99987,0.999974,0.999964,1.0,-0.001214
Volume,-0.001216,-0.001213,-0.001214,-0.001214,1.0


## Binary Classification prediction

### Using a sample

In [8]:
WEGE = BRstocks.loc[BRstocks['Stock'] == 'WEGE3']
WEGE = WEGE.drop(['Stock'], axis=1)
WEGE

Unnamed: 0_level_0,High,Low,Open,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-03,0.170118,0.168639,0.168639,0.168639,378560.0
2000-01-04,0.164201,0.162721,0.164201,0.162721,47320.0
2000-01-05,0.162721,0.162721,0.162721,0.162721,0.0
2000-01-06,0.147928,0.147928,0.147928,0.147928,250120.0
2000-01-07,0.149408,0.147928,0.149408,0.147928,135200.0
...,...,...,...,...,...
2021-12-07 00:00:00,34.560001,33.250000,33.320000,34.560001,12047300.0
2021-12-08 00:00:00,36.110001,34.209999,34.680000,36.040001,11021000.0
2021-12-09 00:00:00,37.110001,35.330002,35.750000,36.500000,16902100.0
2021-12-10 00:00:00,37.090000,35.650002,36.840000,35.799999,5805900.0


In [9]:
WEGE['Close2'] = WEGE.Close.shift(periods=1) - WEGE.Close
WEGE

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Close2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,0.170118,0.168639,0.168639,0.168639,378560.0,
2000-01-04,0.164201,0.162721,0.164201,0.162721,47320.0,0.005918
2000-01-05,0.162721,0.162721,0.162721,0.162721,0.0,0.000000
2000-01-06,0.147928,0.147928,0.147928,0.147928,250120.0,0.014793
2000-01-07,0.149408,0.147928,0.149408,0.147928,135200.0,0.000000
...,...,...,...,...,...,...
2021-12-07 00:00:00,34.560001,33.250000,33.320000,34.560001,12047300.0,-1.550003
2021-12-08 00:00:00,36.110001,34.209999,34.680000,36.040001,11021000.0,-1.480000
2021-12-09 00:00:00,37.110001,35.330002,35.750000,36.500000,16902100.0,-0.459999
2021-12-10 00:00:00,37.090000,35.650002,36.840000,35.799999,5805900.0,0.700001


In [10]:
WEGE.loc[WEGE['Close2'] < 0, 'Binary_Close'] = 0
WEGE.loc[WEGE['Close2'] >= 0, 'Binary_Close'] = 1
WEGE = WEGE[1:]
WEGE

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Close2,Binary_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-04,0.164201,0.162721,0.164201,0.162721,47320.0,0.005918,1.0
2000-01-05,0.162721,0.162721,0.162721,0.162721,0.0,0.000000,1.0
2000-01-06,0.147928,0.147928,0.147928,0.147928,250120.0,0.014793,1.0
2000-01-07,0.149408,0.147928,0.149408,0.147928,135200.0,0.000000,1.0
2000-01-10,0.147928,0.147928,0.147928,0.147928,0.0,0.000000,1.0
...,...,...,...,...,...,...,...
2021-12-07 00:00:00,34.560001,33.250000,33.320000,34.560001,12047300.0,-1.550003,0.0
2021-12-08 00:00:00,36.110001,34.209999,34.680000,36.040001,11021000.0,-1.480000,0.0
2021-12-09 00:00:00,37.110001,35.330002,35.750000,36.500000,16902100.0,-0.459999,0.0
2021-12-10 00:00:00,37.090000,35.650002,36.840000,35.799999,5805900.0,0.700001,1.0


In [11]:
x = WEGE[['High', 'Low', 'Open', 'Volume']]
y = WEGE['Binary_Close']
y = y.astype('int')

In [13]:
SEED = 20

train_x, test_x, train_y, test_y = train_test_split(x, y, 
                                                        random_state = SEED,
                                                        test_size = 0.25,
                                                        stratify = y)
print("Training with %d elements and Testing with %d elements" % (len(train_x)*6, len(test_x)*6))

modelo = LinearSVC(dual=False)
modelo.fit(train_x, train_y)
previsoes = modelo.predict(test_x)

accuracy = accuracy_score(test_y, previsoes) * 100
print("The accuracy %.2f%%" % accuracy)

Training with 24756 elements and Testing with 8256 elements
The accuracy 67.15%


### Trying in the intire database

In [14]:
data = BRstocks
data

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-10-28,19.760000,18.920000,19.299999,19.200001,6342600.0,AALR3
2016-10-31,19.200001,17.510000,19.190001,18.059999,2523300.0,AALR3
2016-11-01,18.379999,17.160000,18.059999,17.900000,996200.0,AALR3
2016-11-03,18.240000,17.309999,18.000000,17.990000,621000.0,AALR3
2016-11-04,18.120001,17.709999,17.950001,17.750000,389800.0,AALR3
...,...,...,...,...,...,...
2021-12-07 00:00:00,24.740000,23.600000,24.629999,23.690001,2497900.0,YDUQ3
2021-12-08 00:00:00,24.639999,23.520000,23.959999,24.410000,6492500.0,YDUQ3
2021-12-09 00:00:00,24.320000,23.389999,23.980000,23.510000,1720300.0,YDUQ3
2021-12-10 00:00:00,24.600000,23.750000,23.830000,24.040001,1651200.0,YDUQ3


In [15]:
total = pd.DataFrame()
grouped = data.groupby(data.Stock)
for s in data.Stock.unique():
    globals()[f"{s}"] = grouped.get_group(f"{s}")
    globals()[f"{s}"]['Close2'] = globals()[f"{s}"].Close.shift(periods=-1) - globals()[f"{s}"].Close
    globals()[f"{s}"]['FutureClose'] = globals()[f"{s}"].Close.shift(periods=-1)
    globals()[f"{s}"].loc[globals()[f"{s}"]['Close2'] < 0, 'Binary_Close'] = 0
    globals()[f"{s}"].loc[globals()[f"{s}"]['Close2'] >= 0, 'Binary_Close'] = 1
    globals()[f"{s}"] = globals()[f"{s}"][1:]
    total = total.append(globals()[f"{s}"], True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[f"{s}"]['Close2'] = globals()[f"{s}"].Close.shift(periods=-1) - globals()[f"{s}"].Close
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[f"{s}"]['FutureClose'] = globals()[f"{s}"].Close.shift(periods=-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_val

In [16]:
total.isna().sum()
total = total.dropna()
total

Unnamed: 0,High,Low,Open,Close,Volume,Stock,Close2,FutureClose,Binary_Close
0,19.200001,17.510000,19.190001,18.059999,2523300.0,AALR3,-0.16,17.90,0.0
1,18.379999,17.160000,18.059999,17.900000,996200.0,AALR3,0.09,17.99,1.0
2,18.240000,17.309999,18.000000,17.990000,621000.0,AALR3,-0.24,17.75,0.0
3,18.120001,17.709999,17.950001,17.750000,389800.0,AALR3,-0.10,17.65,0.0
4,18.000000,17.410000,17.980000,17.650000,513700.0,AALR3,-0.16,17.49,0.0
...,...,...,...,...,...,...,...,...,...
1667077,3.730000,3.550000,3.620000,3.550000,374300.0,WEST3,-0.10,3.45,0.0
1667078,3.720000,3.450000,3.560000,3.450000,263500.0,WEST3,0.33,3.78,1.0
1667079,3.810000,3.390000,3.440000,3.780000,1172900.0,WEST3,-0.10,3.68,0.0
1667080,3.770000,3.540000,3.770000,3.680000,454100.0,WEST3,0.04,3.72,1.0


In [17]:
total.corr()

Unnamed: 0,High,Low,Open,Close,Volume,Close2,FutureClose,Binary_Close
High,1.0,0.999831,0.99986,0.999873,-0.001216,-0.060825,0.993158,0.031987
Low,0.999831,1.0,0.999979,0.999974,-0.001213,-0.060667,0.993277,0.032003
Open,0.99986,0.999979,1.0,0.999967,-0.001214,-0.060761,0.99326,0.031982
Close,0.999873,0.999974,0.999967,1.0,-0.001214,-0.060809,0.993287,0.031976
Volume,-0.001216,-0.001213,-0.001214,-0.001214,1.0,7e-06,-0.001213,-0.005893
Close2,-0.060825,-0.060667,-0.060761,-0.060809,7e-06,1.0,0.055061,0.010892
FutureClose,0.993158,0.993277,0.99326,0.993287,-0.001213,0.055061,1.0,0.033249
Binary_Close,0.031987,0.032003,0.031982,0.031976,-0.005893,0.010892,0.033249,1.0


In [18]:
x = total[['High', 'Low', 'Open', 'Volume', 'Close']]
y = total['Binary_Close']
y = y.astype('int')

In [19]:
SEED = 20


train_x, test_x, train_y, test_y = train_test_split(x, y, 
                                                        random_state = SEED,
                                                        test_size = 0.25,
                                                        stratify = y)
print("Training with %d elements and Testing with %d elements" % (len(train_x)*6, len(test_x)*6))

model = LinearSVC(dual=False)
model.fit(train_x, train_y)
predict = model.predict(test_x)

accuracy = accuracy_score(test_y, predict) * 100
print("The accuracy is %.2f%%" % accuracy)

Training with 7499532 elements and Testing with 2499846 elements
The accuracy is 72.85%


### Simulator

In [25]:
def predictClassification(Ticket):
    
    Ticket = Ticket.dropna()
    
    x = Ticket[['High', 'Low', 'Open', 'Volume', 'Close']]
    y = Ticket['FutureClose']
    y = y.astype('int')
    
    SEED = 20
    
    train_x, test_x, train_y, test_y = train_test_split(x, y, random_state = SEED, test_size = 0.25, stratify = y)
    print("Training with %d elements and Testing with %d elements" % (len(train_x)*6, len(test_x)*6))

    classification = LinearSVC(dual=False)
    classification.fit(train_x, train_y)
    predict = classification.predict(test_x)

    accuracy = accuracy_score(test_y, predict) * 100
    print("The accuracy score is %.2f%%" % accuracy)

    High = Ticket["High"].iloc[-1]
    Low = Ticket["Low"].iloc[-1]
    Open = Ticket["Open"].iloc[-1]
    Volume = Ticket["Volume"].iloc[-1]
    Close = Ticket["Close"].iloc[-1]
    entry=[[High, Low, Open, Volume, Close]]
    
    if classification.predict(entry) == [1]:
        print('This stock will increase or no change')
    elif classification.predict(entry) == [0]:
        print('This stock will decrease')

In [26]:
predictClassification(MGLU3)

Training with 11790 elements and Testing with 3936 elements
The accuracy score is 57.16%
This stock will decrease


### Using Dummies

In [27]:
dummy_stratified = DummyClassifier(strategy='stratified')
dummy_stratified.fit(train_x, train_y)
predict = dummy_stratified.predict(test_x)

accuracy = accuracy_score(test_y, predict) * 100
print("The stratified dummy accuracy is %.2f%%" % accuracy)

The stratified dummy accuracy is 60.42%


In [29]:
dummy_mostfrequent = DummyClassifier(strategy='most_frequent')
dummy_mostfrequent.fit(train_x, train_y)
predict = dummy_mostfrequent.predict(test_x)

accuracy = accuracy_score(test_y, predict) * 100
print("The most frequent dummy accuracy is %.2f%%" % accuracy)

The most frequent dummy accuracy is 72.85%


In [30]:
dummy_uniform = DummyClassifier(strategy='uniform')
dummy_uniform.fit(train_x, train_y)
predict = dummy_uniform.predict(test_x)

accuracy = accuracy_score(test_y, predict) * 100
print("The uniform dummy accuracy is %.2f%%" % accuracy)

The uniform dummy accuracy is 50.09%


### Using the model

In [31]:
def predict(High, Low, Open, Volume, Close):
    dictonary = {'High': High, 'Low': Low, 'Open': Open, 'Volume': Volume, 'Close': Close}
    df = pd.DataFrame(data=dictonary)
    model.predict(df)
    if model.predict(df) == [1]:
        print('This stock will increase or no change')
    elif model.predict(df) == [0]:
        print('This stock will decrease')

In [32]:
predict([670], [670], [670], [4], [670])

This stock will increase or no change


## SGD Classifier

In [33]:
SEED = 20

train_x, test_x, train_y, test_y = train_test_split(x, y, random_state = SEED, test_size = 0.25, stratify=y)

SGD = SGDClassifier(max_iter=1000, tol=0.01)
SGD.fit(train_x, train_y)
prediction = SGD.predict(test_x)
score = SGD.score(train_x, train_y)

print('A Acurácia da previsão é de = {:.2f}%'.format(SGD.score(train_x, train_y)*100))

A Acurácia da previsão é de = 70.41%


In [34]:
    High = MGLU3["High"].iloc[-1]
    Low = MGLU3["Low"].iloc[-1]
    Open = MGLU3["Open"].iloc[-1]
    Volume = MGLU3["Volume"].iloc[-1]
    Close = MGLU3["Close"].iloc[-1]
    entry=[[High, Low, Open, Volume, Close]]
    
    print('the Stock tomorrow will close at R$ {}'.format(SGD.predict(entry)[0]))

the Stock tomorrow will close at R$ 1


## Linear Regression

### Simulator

In [35]:
MGLU3

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Stock,Close2,FutureClose,Binary_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-05-03,0.521562,0.50625,0.515625,0.509375,33670400.0,MGLU3,0.005625,0.515000,1.0
2011-05-04,0.515000,0.51000,0.510000,0.515000,39203200.0,MGLU3,-0.003125,0.511875,0.0
2011-05-05,0.512812,0.51000,0.512812,0.511875,35097600.0,MGLU3,-0.003750,0.508125,0.0
2011-05-06,0.511875,0.50000,0.510937,0.508125,38672000.0,MGLU3,-0.005000,0.503125,0.0
2011-05-09,0.507812,0.50125,0.506562,0.503125,667680.0,MGLU3,0.000000,0.503125,1.0
...,...,...,...,...,...,...,...,...,...
2021-12-07 00:00:00,7.720000,7.37000,7.490000,7.620000,85258900.0,MGLU3,-0.810000,6.810000,0.0
2021-12-08 00:00:00,7.570000,6.63000,7.560000,6.810000,255628900.0,MGLU3,-0.530000,6.280000,0.0
2021-12-09 00:00:00,6.760000,6.21000,6.760000,6.280000,257831300.0,MGLU3,0.090000,6.370000,1.0
2021-12-10 00:00:00,6.510000,6.01000,6.430000,6.370000,209180700.0,MGLU3,-0.320000,6.050000,0.0


In [38]:
def predictRegression(Ticket):
    
    Ticket = Ticket.dropna()
    
    x = Ticket[['High', 'Low', 'Open', 'Volume', 'Close']]
    y = Ticket['FutureClose']
    y = y.astype('int')
    
    SEED = 20
    
    train_x, test_x, train_y, test_y = train_test_split(x, y, random_state = SEED, test_size = 0.25)
    print("Training with %d elements and Testing with %d elements" % (len(train_x)*6, len(test_x)*6))

    regression = LinearRegression()
    regression.fit(train_x, train_y)
    y_predicted = regression.predict(test_x)

    print('The Accuracy is {:.2f}%'.format(metrics.r2_score(test_y, y_predicted)*100))
    print('The Accuracy is {:.2f}%'.format(regression.score(train_x, train_y)*100))


    High = Ticket["High"].iloc[-1]
    Low = Ticket["Low"].iloc[-1]
    Open = Ticket["Open"].iloc[-1]
    Volume = Ticket["Volume"].iloc[-1]
    Close = Ticket["Close"].iloc[-1]
    entry=[[High, Low, Open, Volume, Close]]
    
    print('the Stock tomorrow will close at R$ {}'.format(regression.predict(entry)[0]))

In [39]:
predictRegression(MGLU3)

Training with 11790 elements and Testing with 3936 elements
The Accuracy is 99.82%
The Accuracy is 99.75%
the Stock tomorrow will close at R$ 5.925378517364563
