In [58]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

In [59]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [60]:
# check dtypes
stock_df.dtypes

ticker                     object
date_val                   object
company_name               object
company_url                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
region                     object
country_code               object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [61]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.41,111.71,106.85,111.05,102310329.0,109.6319,602679.0,2.4352,2022-03-08
502,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.89,109.07,103.07,106.46,102557375.0,105.3382,639388.0,2.23161,2022-03-09
503,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.13,108.19,104.08,104.29,87584432.0,105.9691,542478.0,3.551281,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,443.8,453.11,438.93,450.87,2905656.0,447.8637,67082.0,1.59306,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,444.68,447.65,433.01,438.95,2686310.0,437.7568,66371.0,1.288567,2022-03-09
1008,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,439.66,440.61,415.43,416.38,4434498.0,422.5279,107234.0,5.295001,2022-03-10
1321,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,145.43,150.99,145.33,148.31,7023908.0,148.5454,108441.0,1.980334,2022-03-08
1322,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,145.22,152.89,144.4092,151.8,5302511.0,149.8916,89683.0,4.531056,2022-03-09
1323,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,154.13,155.0,145.0,145.14,4577255.0,147.8527,76301.0,5.832739,2022-03-10
1825,ALGN,over-10k,200m-1b,Technology,Tempe,AZ,SW,US,433.22,444.84,426.31,442.05,694358.0,438.9188,20325.0,2.038225,2022-03-08


In [62]:
# check dtypes
stock_df.dtypes

ticker                     object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
region                     object
country_code               object
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
date                       object
dtype: object

In [63]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,volume,volume_weight,percent_change,date
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,102310329.0,109.6319,2.4352,2022-03-08
502,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,102557375.0,105.3382,2.23161,2022-03-09
503,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,87584432.0,105.9691,3.551281,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,2905656.0,447.8637,1.59306,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,2686310.0,437.7568,1.288567,2022-03-09
1008,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,4434498.0,422.5279,5.295001,2022-03-10
1321,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,7023908.0,148.5454,1.980334,2022-03-08
1322,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,5302511.0,149.8916,4.531056,2022-03-09
1323,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,4577255.0,147.8527,5.832739,2022-03-10
1825,ALGN,over-10k,200m-1b,Technology,Tempe,AZ,SW,US,694358.0,438.9188,2.038225,2022-03-08


In [64]:
# get first stock ticker in first row
prev_ticker = stock_df.iat[0,0]
print (prev_ticker)

i = 0
new_ticker_flag = 'yes'
process_list=[]

for rec in stock_df.iterrows():
# for rec in stock_df:
    new_ticker = stock_df['ticker']
    new_ticker = new_ticker.iloc[i]
    print("previous ticker and new ticker: ", prev_ticker, "and", new_ticker)

    if (prev_ticker == new_ticker):
        if (new_ticker_flag == 'yes'):
            new_ticker_flag = 'no'
            
            begin_vw = stock_df['volume_weight']
            begin_vw = begin_vw.iloc[i]
            print("begin_vw: ", begin_vw)

        else:
            last_vw = stock_df['volume_weight']
            last_vw = last_vw.iloc[i]
            print("last_vw: ", last_vw)
    else:
            print ("new")
            vw_average = 100 - (last_vw/begin_vw) * 100
            print("vwa: ", vw_average)
            # append to the pro
            
            process_list.append(vw_average)
            
            prev_ticker = new_ticker
            new_ticker_flag == 'yes'
            begin_vw = stock_df['volume_weight']
            begin_vw = begin_vw.iloc[i]
            print("begin_vw: ", begin_vw)
            
    i=i+1
    
# account for the last record in the dataframe
vw_average = 100 - (last_vw/begin_vw) * 100
print("vwa: ", vw_average)       
process_list.append(vw_average)

#     new_ticker = stock_df['ticker']
#     new_ticker = new_ticker.head(1)
#     print(new_ticker)
#     if prev_ticker == new_ticker: 
#         print ("good")

#creating records containing the differences between the beginning and ending date volume and volume_weight for each stock
# so that we only have one record for each stock. drop volume and weight and percent change (???)

# move beginning volume

# move ending volume

# volume percent change (end/begin) * 100

# move beginning volume weight

# move ending volume weight

# volume weight percent change (end/begin) * 100

#stock_df

# add the columns to the processing dataframe (creating a new dataframe)
# this new dataframe will have ticket, employee_count, revenue, sector, city_name, 


AMD
previous ticker and new ticker:  AMD and AMD
begin_vw:  109.6319
previous ticker and new ticker:  AMD and AMD
last_vw:  105.3382
previous ticker and new ticker:  AMD and AMD
last_vw:  105.9691
previous ticker and new ticker:  AMD and ADBE
new
vwa:  3.340998377297126
begin_vw:  447.8637
previous ticker and new ticker:  ADBE and ADBE
last_vw:  437.7568
previous ticker and new ticker:  ADBE and ADBE
last_vw:  422.5279
previous ticker and new ticker:  ADBE and ABNB
new
vwa:  5.657033601964173
begin_vw:  148.5454
previous ticker and new ticker:  ABNB and ABNB
last_vw:  149.8916
previous ticker and new ticker:  ABNB and ABNB
last_vw:  147.8527
previous ticker and new ticker:  ABNB and ALGN
new
vwa:  0.46632208065682335
begin_vw:  438.9188
previous ticker and new ticker:  ALGN and ALGN
last_vw:  427.7159
previous ticker and new ticker:  ALGN and ALGN
last_vw:  409.9722
previous ticker and new ticker:  ALGN and AMZN
new
vwa:  6.594978387802016
begin_vw:  2821.5439
previous ticker and new t

In [65]:
len(process_list)

102

In [66]:
process_list

[3.340998377297126,
 5.657033601964173,
 0.46632208065682335,
 6.594978387802016,
 -4.424754830148132,
 1.3722727739390592,
 -0.7954314758188588,
 2.7002619982707046,
 4.143783600875167,
 3.5720857626620557,
 1.7768285272324107,
 3.081222557940393,
 4.454652957482594,
 4.966038309605452,
 0.6542373509377626,
 0.38662619208548676,
 -1.0484591055886199,
 2.271516584550753,
 13.844088371133594,
 2.585469740332144,
 2.6992284649147393,
 0.9108212463412997,
 2.6170625693187333,
 -1.1117883705428824,
 -0.959520527461649,
 -12.722973296310698,
 0.7354831935474238,
 1.1640173669668883,
 2.116434110870898,
 -1.3509078018181242,
 -0.9295294365413014,
 -0.006778412563562597,
 6.680558784611904,
 24.678271646897088,
 2.499841555186208,
 0.6020882554683453,
 1.7621060906746493,
 4.017258831579369,
 -0.7375653459029934,
 0.4168284410323366,
 3.8189208646973327,
 1.3603171334859923,
 0.39557408903225166,
 1.5180937354612922,
 0.489033838112519,
 0.6535197121667125,
 1.0858688404868246,
 4.85508250855

In [67]:
#combine object dataframe with process list into process_df
stock_df = stock_df.drop(['volume', 'volume_weight', 'percent_change','date'], axis=1)
stock_df.columns.tolist()
stock_df = stock_df.drop_duplicates(subset='ticker')
stock_df["volume_weight_avg"] = process_list
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,volume_weight_avg
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,3.340998
1006,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,5.657034
1321,ABNB,5k-10k,200m-1b,Technology,San Francisco,CA,W,US,0.466322
1825,ALGN,over-10k,200m-1b,Technology,Tempe,AZ,SW,US,6.594978
2329,AMZN,over-10k,over-1b,Technology,Seattle,WA,NW,US,-4.424755
2833,AMGN,over-10k,over-1b,Technology,Thousand Oaks,CA,W,US,1.372273
3336,AEP,over-10k,over-1b,Energy,Columbus,OH,MW,US,-0.795431
3840,ADI,over-10k,over-1b,Technology,Norwood,MA,SE,US,2.700262
4344,ANSS,1k-5k,200m-1b,Technology,Canonsburg,PA,SE,US,4.143784
4848,AAPL,over-10k,over-1b,Technology,Capertino,CA,W,US,3.572086


In [68]:
# unique values for each column (getting to know your data)
stock_df.nunique()

ticker               102
employee_count         4
revenue                6
sector                 9
city_name             70
state_name            28
region                11
country_code           8
volume_weight_avg    102
dtype: int64

In [69]:
stock_df.dtypes

ticker                object
employee_count        object
revenue               object
sector                object
city_name             object
state_name            object
region                object
country_code          object
volume_weight_avg    float64
dtype: object

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### NOTE: we have to keep our ticker columns (so all this must called within the gradient_boosting_decision_tree_model)

#### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- TICKER, 
- DATE
- EMPLOYEE COUNT
- REVENUE
- SECTOR
- CITY NAME
- STATE NAME
- COUNTRY CODE
- VOLUME 
- VOLUME WEIGHT 
- AVERAGE_VOLUME (calculate average using begin_date/end_date) (???)
- AVERAGE_VOLUME_WEIGHT (calculate average using begin_date/end_date) (???)
- PERCENT CHANGE (% change from close to open)

#### Target is:
- PERCENT CHANGE (and/or) Volume Weight (???)(I think the percent change matters more because percent change yields better 

#### Throw-aways for modeling:
- COMPANY NAME
- COMPANY URL
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS


In [70]:
# drop stock ticker
# filtered_df = filtered_df.drop(columns = ['city_name'])
# filtered_df.head()
stock_df.drop(columns="ticker", inplace=True)
stock_df.drop(columns="city_name", inplace=True)
stock_df.drop(columns="state_name", inplace=True)

In [71]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

['employee_count', 'revenue', 'sector', 'region', 'country_code']

In [72]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???

employee_count     4
revenue            6
sector             9
region            11
country_code       8
dtype: int64

In [73]:
# #obj_df["body_style"].astype('category').cat.codes

# stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes

# stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes

stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
#stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes
stock_df['region'] = stock_df['region'].astype('category').cat.codes

# stock_df
# # # NOTE: Scikit-learn is flexible enough to perform all of the one-hot encodings at the same time.
# # #       Remember, the only difference from our single variable examples is that we need to pass our 
# # #       categorical variable list

# # # Create a OneHotEncoder instance
# # enc = OneHotEncoder(sparse=False)

# # # Fit and transform the OneHotEncoder using the categorical variable list
# # encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_categories]))

# # # Add the encoded variable names to the dataframe
# # encode_df.columns = enc.get_feature_names(stock_categories)
# # encode_df.head()

In [36]:
# # NOTE: Scikit-learn is flexible enough to perform all of the one-hot encodings at the same time.
# #       Remember, the only difference from our single variable examples is that we need to pass our 
# #       categorical variable list

# # Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_categories]))

# # Add the encoded variable names to the dataframe
# encode_df.columns = enc.get_feature_names(stock_categories)
# encode_df.head()

# NOTE: OneHotEncoder was giving us mean_squared_error of around 32.00. So, we removed it and categorical codes instead.
#       Now the mean_squared_error is around 25.00. I have tried to remove city. This didn't work so I added region and 
#       removed city and state. I am still getting the same mean_squared_error. This didn't help. 

In [37]:
# # So what happens if I try buckets for volumne weight average?

# vwa_counts = stock_df['volume_weight_avg'].value_counts()
# vwa_counts

In [49]:
# # After examining the data, here are the categories I am going to divide the VWA into these "buckets"

# replace_below_0 = list(vwa_counts[vwa_counts < 0].index)
# below 0%
# above 0%
# above 1%
# above 2%
# above 4%
# above 10
# # Determine which values to replace if counts are less than 1000?
# replace_app_type = list(app_type_counts[app_type_counts < 1000].index)

# # Replace in dataframe
# for app in replace_app_type:
#     application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app, "Other")
    
# # Check to make sure binning was successful
# application_df.APPLICATION_TYPE.value_counts()

In [50]:
#stock_df.reset_index(drop=True,inplace=True)

In [51]:
#print(stock_df.index)
#print(encode_df.index)

In [52]:
# I AM COMING UP WITH NO ROWS HERE ???

# Now that our categorical variables have been encoded, 
# they are ready to replace our unencoded categorical 
# variables in our dataset.

# TWO STEP REPLACE: 

# Merge one-hot encoded features 
#new_stock_df = stock_df.merge(encode_df,left_index=True, right_index=True)

# Drop the original stock categories
#new_stock_df = new_stock_df.drop(columns=stock_categories)
#new_stock_df.head()


In [53]:
# create features array
X = stock_df.drop(columns=["volume_weight_avg"]).values
    
# create target
y = stock_df["volume_weight_avg"].values

In [54]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [55]:
# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=100,
n_estimators=100,
learning_rate=.001
)
regressor.fit(X_train, y_train)


GradientBoostingRegressor(learning_rate=0.001, max_depth=100)

In [56]:
# Use staged_predict() method to measures the validation error at each stage of training 
# (i.e. with one tree, with two trees…) to find the optimal number of trees.
errors = [mean_squared_error(y_test, y_pred) for y_pred in 
           regressor.staged_predict(X_test)]
print(errors)

# mean_squared_error: 
# The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, 
# it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered 
# wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). It is 
# bucketting the VWA that works. 


[1.9538607909030812, 1.951827245935553, 1.9498012312870385, 1.947935883668322, 1.9459037039008265, 1.9440740146910735, 1.9422519483168061, 1.940263287802408, 1.9384344384097618, 1.9366128381839283, 1.9347984690405273, 1.9328391521673547, 1.9308649081409608, 1.9289200622115659, 1.9271340823313738, 1.9253552440612245, 1.9234321284435545, 1.9216674454653815, 1.9199098507467949, 1.9181593265577068, 1.916415855207031, 1.9147027994406274, 1.9127992339205204, 1.9111004044311162, 1.9092107677798202, 1.90752609298634, 1.90584866875002, 1.9039890145186613, 1.9023012139554618, 1.9004796065246579, 1.898805479432483, 1.8971382133474184, 1.895303557587631, 1.8934755803463104, 1.8916793380931975, 1.8900641228524966, 1.888455978442955, 1.8868293019976943, 1.8850609005181254, 1.8832993719495037, 1.881701380924766, 1.879927331278085, 1.8781598130914028, 1.8763988103231954, 1.8748182206230366, 1.8731053196228302, 1.871537864551198, 1.8698031025344228, 1.8682486211963436, 1.8667276493579286, 1.86501250920

In [47]:
best_n_estimators = np.argmin(errors)

print(best_n_estimators)

99


In [48]:
# build and fit our model using the optimal number of trees
best_regressor = GradientBoostingRegressor(
     max_depth=10,
     n_estimators=best_n_estimators,
     learning_rate=.01
)

best_regressor.fit(X_train, y_train)

# # Sklearn provides numerous metrics to evaluate 
# # the performance of our machine learning models.
# # They categorize the each metric according 
# # to the problem domain which they’re applicable. 
# # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.

# # We use the mean absolute error 
# # which can be interpreted as 
# # the average distance from 
# # our predictions and the actual values

# # this will give you the value of the stocks for the next period of time
y_pred = best_regressor.predict(X_test)
print(y_pred)
print(X_test)
print(X_train)
print(y_train)
# # this is the how well the model performed (looking for smallest error)
mean_absolute_error(y_test, y_pred)

[0.83675655 1.05849614 2.3837838  3.357838   3.357838   3.357838
 3.357838   1.78216209 3.357838   2.72756764 1.46702691 2.09729727
 1.01082122 3.52757953 3.63315704 1.93972968 3.357838   3.98810836
 1.68830872 1.46702691 3.357838   3.357838   3.357838   0.97154693
 2.72756764 1.93972968]
[[0 5 0 7 0]
 [1 5 4 4 0]
 [1 5 0 0 0]
 [2 3 0 0 0]
 [0 5 5 0 0]
 [3 4 6 3 0]
 [1 5 5 0 0]
 [0 5 0 4 0]
 [1 5 5 0 0]
 [0 5 4 4 0]
 [2 5 2 4 0]
 [1 5 2 0 0]
 [1 5 4 7 0]
 [3 5 4 4 0]
 [3 2 0 4 0]
 [1 5 0 4 0]
 [2 5 6 0 0]
 [1 5 0 9 4]
 [1 5 3 4 0]
 [1 5 7 0 0]
 [0 0 0 0 0]
 [2 2 0 4 0]
 [1 0 4 0 0]
 [1 5 4 1 0]
 [1 5 0 1 0]
 [1 5 0 4 0]]
[[0 0 7 0 0]
 [1 5 0 0 0]
 [0 4 0 0 0]
 [1 5 2 4 0]
 [1 5 2 3 0]
 [2 3 0 0 0]
 [0 5 0 4 0]
 [1 5 0 0 0]
 [1 5 0 0 0]
 [2 0 0 0 0]
 [1 5 0 4 0]
 [1 5 8 1 0]
 [3 5 0 4 0]
 [0 5 5 7 0]
 [1 5 0 4 0]
 [1 5 0 7 0]
 [0 4 2 0 0]
 [0 5 3 0 0]
 [1 5 3 1 0]
 [0 5 0 0 0]
 [1 5 3 7 0]
 [2 5 2 4 0]
 [1 5 0 3 0]
 [1 5 0 0 0]
 [1 4 3 3 0]
 [1 5 5 1 0]
 [2 3 0 0 0]
 [1 5 0 0 0]
 [1 5 0

1.0011614963945419

In [32]:
# should we be using r2_score?
# how do you do residual plots?

In [33]:
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.017502263014279817
1.0064344548284574


In [34]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?