In [49]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

#import necessary libraries 
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

In [14]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [15]:
# check dtypes
stock_df.dtypes

ticker                     object
date_val                   object
company_name               object
company_url                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
region                     object
country_code               object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [16]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["employee_count", "revenue", "longitude", "latitude", "company_name", "company_url", "date_val"], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
501,AMD,Technology,Santa Clara,CA,W,US,108.41,111.71,106.85,111.05,102310329.0,109.6319,602679.0,2.4352,2022-03-08
502,AMD,Technology,Santa Clara,CA,W,US,108.89,109.07,103.07,106.46,102557375.0,105.3382,639388.0,2.23161,2022-03-09
503,AMD,Technology,Santa Clara,CA,W,US,108.13,108.19,104.08,104.29,87584432.0,105.9691,542478.0,3.551281,2022-03-10
1006,ADBE,Technology,San Jose,CA,W,US,443.8,453.11,438.93,450.87,2905656.0,447.8637,67082.0,1.59306,2022-03-08
1007,ADBE,Technology,San Jose,CA,W,US,444.68,447.65,433.01,438.95,2686310.0,437.7568,66371.0,1.288567,2022-03-09
1008,ADBE,Technology,San Jose,CA,W,US,439.66,440.61,415.43,416.38,4434498.0,422.5279,107234.0,5.295001,2022-03-10
1321,ABNB,Technology,San Francisco,CA,W,US,145.43,150.99,145.33,148.31,7023908.0,148.5454,108441.0,1.980334,2022-03-08
1322,ABNB,Technology,San Francisco,CA,W,US,145.22,152.89,144.4092,151.8,5302511.0,149.8916,89683.0,4.531056,2022-03-09
1323,ABNB,Technology,San Francisco,CA,W,US,154.13,155.0,145.0,145.14,4577255.0,147.8527,76301.0,5.832739,2022-03-10
1825,ALGN,Technology,Tempe,AZ,SW,US,433.22,444.84,426.31,442.05,694358.0,438.9188,20325.0,2.038225,2022-03-08


In [17]:
# check dtypes
stock_df.dtypes

ticker                     object
sector                     object
city_name                  object
state_name                 object
region                     object
country_code               object
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
date                       object
dtype: object

In [18]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df

Unnamed: 0,ticker,sector,city_name,state_name,region,country_code,volume,volume_weight,percent_change,date
501,AMD,Technology,Santa Clara,CA,W,US,102310329.0,109.6319,2.4352,2022-03-08
502,AMD,Technology,Santa Clara,CA,W,US,102557375.0,105.3382,2.23161,2022-03-09
503,AMD,Technology,Santa Clara,CA,W,US,87584432.0,105.9691,3.551281,2022-03-10
1006,ADBE,Technology,San Jose,CA,W,US,2905656.0,447.8637,1.59306,2022-03-08
1007,ADBE,Technology,San Jose,CA,W,US,2686310.0,437.7568,1.288567,2022-03-09
1008,ADBE,Technology,San Jose,CA,W,US,4434498.0,422.5279,5.295001,2022-03-10
1321,ABNB,Technology,San Francisco,CA,W,US,7023908.0,148.5454,1.980334,2022-03-08
1322,ABNB,Technology,San Francisco,CA,W,US,5302511.0,149.8916,4.531056,2022-03-09
1323,ABNB,Technology,San Francisco,CA,W,US,4577255.0,147.8527,5.832739,2022-03-10
1825,ALGN,Technology,Tempe,AZ,SW,US,694358.0,438.9188,2.038225,2022-03-08


In [44]:
# get first stock ticker in first row
prev_ticker = stock_df.iat[0,0]
print (prev_ticker)

i = 0
new_ticker_flag = 'yes'
process_list=[]
process_list_v=[]

for rec in stock_df.iterrows():
# for rec in stock_df:
    new_ticker = stock_df['ticker']
    new_ticker = new_ticker.iloc[i]
    print("previous ticker and new ticker: ", prev_ticker, "and", new_ticker)

    if (prev_ticker == new_ticker):
        if (new_ticker_flag == 'yes'):
            new_ticker_flag = 'no'
            begin_v = stock_df['volume']
            begin_vw = stock_df['volume_weight']
            begin_vw = begin_vw.iloc[i]
            begin_v = begin_v.iloc[i]
            print("begin_vw: ", begin_vw)
            print("begin_v: ", begin_v)

        else:
            last_vw = stock_df['volume_weight']
            last_v = stock_df['volume']
            last_vw = last_vw.iloc[i]
            last_v = last_v.iloc[i]
            print("last_vw: ", last_vw)
            print("last_v: ", last_v)
            
    else:
            print ("new")
            vw_average = 100 - (last_vw/begin_vw) * 100
            v_average = 100 - (last_v/begin_v) * 100
            print("vwa: ", vw_average)
            print("v: ", v_average)
            # append to the pro
            
            process_list.append(vw_average)
            process_list_v.append(v_average)
            
            prev_ticker = new_ticker
            new_ticker_flag == 'yes'
            begin_vw = stock_df['volume_weight']
            begin_v = stock_df['volume']
            begin_vw = begin_vw.iloc[i]
            begin_v = begin_v.iloc[i]
            print("begin_vw: ", begin_vw)
            print("begin_v: ", begin_v)
            
    i=i+1
    
# account for the last record in the dataframe
# vw_average = 100 - (last_vw/begin_vw) * 100
# print("vwa: ", vw_average)       
# process_list.append(vw_average)
vw_average = 100 - (last_vw/begin_vw) * 100
v_average = 100 - (last_v/begin_v) * 100
process_list.append(vw_average)
process_list_v.append(v_average)   

# v_average = 100 - (last_v/begin_v) * 100
# print("v: ", v_average)       
# process_list.append(v_average)

#     new_ticker = stock_df['ticker']
#     new_ticker = new_ticker.head(1)
#     print(new_ticker)
#     if prev_ticker == new_ticker: 
#         print ("good")

#creating records containing the differences between the beginning and ending date volume and volume_weight for each stock
# so that we only have one record for each stock. drop volume and weight and percent change (???)

# move beginning volume

# move ending volume

# volume percent change (end/begin) * 100

# move beginning volume weight

# move ending volume weight

# volume weight percent change (end/begin) * 100

#stock_df

# add the columns to the processing dataframe (creating a new dataframe)
# this new dataframe will have ticket, employee_count, revenue, sector, city_name, 


AMD
previous ticker and new ticker:  AMD and AMD
begin_vw:  109.6319
begin_v:  102310329.0
previous ticker and new ticker:  AMD and AMD
last_vw:  105.3382
last_v:  102557375.0
previous ticker and new ticker:  AMD and AMD
last_vw:  105.9691
last_v:  87584432.0
previous ticker and new ticker:  AMD and ADBE
new
vwa:  3.340998377297126
v:  14.393362961426888
begin_vw:  447.8637
begin_v:  2905656.0
previous ticker and new ticker:  ADBE and ADBE
last_vw:  437.7568
last_v:  2686310.0
previous ticker and new ticker:  ADBE and ADBE
last_vw:  422.5279
last_v:  4434498.0
previous ticker and new ticker:  ADBE and ABNB
new
vwa:  5.657033601964173
v:  -52.616070174858976
begin_vw:  148.5454
begin_v:  7023908.0
previous ticker and new ticker:  ABNB and ABNB
last_vw:  149.8916
last_v:  5302511.0
previous ticker and new ticker:  ABNB and ABNB
last_vw:  147.8527
last_v:  4577255.0
previous ticker and new ticker:  ABNB and ALGN
new
vwa:  0.46632208065682335
v:  34.8332153553264
begin_vw:  438.9188
begin_

In [45]:
len(process_list)

102

In [46]:
len(process_list_v)

102

In [47]:
# process_list

In [48]:
#combine object dataframe with process list into process_df
stock_df = stock_df.drop(['volume', 'volume_weight', 'percent_change','date'], axis=1)
stock_df.columns.tolist()
#stock_df = stock_df.drop_duplicates(subset='ticker')
stock_df["volume_weight_avg"] = process_list
stock_df

ValueError: Length of values (102) does not match length of index (306)

In [None]:
# unique values for each column (getting to know your data)
stock_df.nunique()

In [None]:
stock_df.dtypes

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### NOTE: we have to keep our ticker columns (so all this must called within the gradient_boosting_decision_tree_model)

#### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- TICKER, 
- DATE
- EMPLOYEE COUNT
- REVENUE
- SECTOR
- CITY NAME
- STATE NAME
- COUNTRY CODE
- VOLUME 
- VOLUME WEIGHT 
- AVERAGE_VOLUME (calculate average using begin_date/end_date) (???)
- AVERAGE_VOLUME_WEIGHT (calculate average using begin_date/end_date) (???)
- PERCENT CHANGE (% change from close to open)

#### Target is:
- PERCENT CHANGE (and/or) Volume Weight (???)(I think the percent change matters more because percent change yields better 

#### Throw-aways for modeling:
- COMPANY NAME
- COMPANY URL
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS


In [None]:
# drop stock ticker
# filtered_df = filtered_df.drop(columns = ['city_name'])
# filtered_df.head()
stock_df.drop(columns="ticker", inplace=True)
stock_df.drop(columns="city_name", inplace=True)
stock_df.drop(columns="state_name", inplace=True)

In [None]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

In [None]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???

In [None]:
# replace stock's employee count string with integer
stock_df.loc[(stock_df['employee_count'] == '5k-10k'), 'employee_count'] = 0
stock_df.loc[(stock_df['employee_count'] == 'over-10k'), 'employee_count'] = 1
stock_df.loc[(stock_df['employee_count'] == '1k-5k'), 'employee_count'] = 2
stock_df.loc[(stock_df['employee_count'] == '500-1k'), 'employee_count'] = 3

# replace stock's revenue string with integer
stock_df.loc[(stock_df['revenue'] == '1m-10m'), 'revenue'] = 0
stock_df.loc[(stock_df['revenue'] == '10m-50m'), 'revenue'] = 1
stock_df.loc[(stock_df['revenue'] == '50m-100m'), 'revenue'] = 2
stock_df.loc[(stock_df['revenue'] == '100m-200m'), 'revenue'] = 3
stock_df.loc[(stock_df['revenue'] == '200m-1b'), 'revenue'] = 4
stock_df.loc[(stock_df['revenue'] == 'over-1b'), 'revenue'] = 5

# replace stock's sector string with integer
stock_df.loc[(stock_df['sector'] == 'Technology'), 'sector'] = 0
stock_df.loc[(stock_df['sector'] == 'Energy'), 'sector'] = 1
stock_df.loc[(stock_df['sector'] == 'Healthcare'), 'sector'] = 2
stock_df.loc[(stock_df['sector'] == 'Consumer Discretionary'), 'sector'] = 3
stock_df.loc[(stock_df['sector'] == 'Industrials'), 'sector'] = 4
stock_df.loc[(stock_df['sector'] == 'Consumer Staples'), 'sector'] = 5
stock_df.loc[(stock_df['sector'] == 'Communication Services'), 'sector'] = 6
stock_df.loc[(stock_df['sector'] == 'Financials'), 'sector'] = 7
stock_df.loc[(stock_df['sector'] == 'Utilities'), 'sector'] = 8

# replace stock's country code string with integer (Note: China was CN and CH for some reason)
stock_df.loc[(stock_df['country_code'] == 'US'), 'country_code'] = 0
stock_df.loc[(stock_df['country_code'] == 'Netherlands'), 'country_code'] = 1
stock_df.loc[(stock_df['country_code'] == 'Australia'), 'country_code'] = 2
stock_df.loc[(stock_df['country_code'] == 'UK'), 'country_code'] = 3
stock_df.loc[(stock_df['country_code'] == 'CH'), 'country_code'] = 4
stock_df.loc[(stock_df['country_code'] == 'CN'), 'country_code'] = 4
stock_df.loc[(stock_df['country_code'] == 'CA'), 'country_code'] = 5
stock_df.loc[(stock_df['country_code'] == 'Argentina'), 'country_code'] = 6

# replace stock's region string with integer 
stock_df.loc[(stock_df['region'] == 'W'), 'region'] = 0
stock_df.loc[(stock_df['region'] == 'MW'), 'region'] = 1
stock_df.loc[(stock_df['region'] == 'SW'), 'region'] = 2
stock_df.loc[(stock_df['region'] == 'NW'), 'region'] = 3
stock_df.loc[(stock_df['region'] == 'SE'), 'region'] = 4
stock_df.loc[(stock_df['region'] == 'NL'), 'region'] = 5
stock_df.loc[(stock_df['region'] == 'AU'), 'region'] = 6
stock_df.loc[(stock_df['region'] == 'NE'), 'region'] = 7
stock_df.loc[(stock_df['region'] == 'GB'), 'region'] = 8
stock_df.loc[(stock_df['region'] == 'CH'), 'region'] = 9
stock_df.loc[(stock_df['region'] == 'CA'), 'region'] = 10


#create buckets for vwa
stock_df.loc[(stock_df['volume_weight_avg'] < 0), 'volume_weight_avg'] = 0
stock_df.loc[(stock_df['volume_weight_avg'] > 0) & (stock_df['volume_weight_avg'] <= 1), 'volume_weight_avg'] = 1
stock_df.loc[(stock_df['volume_weight_avg'] > 1) & (stock_df['volume_weight_avg'] <= 2), 'volume_weight_avg'] = 2
stock_df.loc[(stock_df['volume_weight_avg'] > 2) & (stock_df['volume_weight_avg'] <= 3), 'volume_weight_avg'] = 3
stock_df.loc[(stock_df['volume_weight_avg'] > 3) & (stock_df['volume_weight_avg'] <= 4), 'volume_weight_avg'] = 4
stock_df.loc[(stock_df['volume_weight_avg'] > 4) & (stock_df['volume_weight_avg'] <= 5), 'volume_weight_avg'] = 5
stock_df.loc[(stock_df['volume_weight_avg'] > 5) & (stock_df['volume_weight_avg'] <= 6), 'volume_weight_avg'] = 6
stock_df.loc[(stock_df['volume_weight_avg'] > 6) & (stock_df['volume_weight_avg'] <= 7), 'volume_weight_avg'] = 7
stock_df.loc[(stock_df['volume_weight_avg'] > 7) & (stock_df['volume_weight_avg'] <= 8), 'volume_weight_avg'] = 8
stock_df.loc[(stock_df['volume_weight_avg'] > 8) & (stock_df['volume_weight_avg'] <= 9), 'volume_weight_avg'] = 9
stock_df.loc[(stock_df['volume_weight_avg'] > 9) & (stock_df['volume_weight_avg'] <= 10), 'volume_weight_avg'] = 10
stock_df.loc[(stock_df['volume_weight_avg'] > 10), 'volume_weight_avg'] = 11

stock_df

In [None]:
# #obj_df["body_style"].astype('category').cat.codes

# # stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes

# # stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes

# stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
# #stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# # stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
# stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes
# stock_df['region'] = stock_df['region'].astype('category').cat.codes

# stock_df
# # # NOTE: Scikit-learn is flexible enough to perform all of the one-hot encodings at the same time.
# # #       Remember, the only difference from our single variable examples is that we need to pass our 
# # #       categorical variable list

# # # Create a OneHotEncoder instance
# # enc = OneHotEncoder(sparse=False)

# # # Fit and transform the OneHotEncoder using the categorical variable list
# # encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_categories]))

# # # Add the encoded variable names to the dataframe
# # encode_df.columns = enc.get_feature_names(stock_categories)
# # encode_df.head()

In [None]:
# # NOTE: Scikit-learn is flexible enough to perform all of the one-hot encodings at the same time.
# #       Remember, the only difference from our single variable examples is that we need to pass our 
# #       categorical variable list

# # Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_categories]))

# # Add the encoded variable names to the dataframe
# encode_df.columns = enc.get_feature_names(stock_categories)
# encode_df.head()

# NOTE: OneHotEncoder was giving us mean_squared_error of around 32.00. So, we removed it and categorical codes instead.
#       Now the mean_squared_error is around 25.00. I have tried to remove city. This didn't work so I added region and 
#       removed city and state. I am still getting the same mean_squared_error. This didn't help. 

In [None]:
# # So what happens if I try buckets for volumne weight average?

# vwa_counts = stock_df['volume_weight_avg'].value_counts()
# vwa_counts

In [None]:
# ['employee_count', 'revenue', 'sector', 'region', 'country_code']
# # using employee_count, revenue, sector, region, country_code and variable weighted average 
# # as the predictor variable and rating as the response variable
# #fit multiple linear regression model
# model = ols('rating ~ assists + rebounds', data=df).fit()

model = ols('volume_weight_avg ~ employee_count + revenue + sector + region + country_code', data = stock_df).fit()

In [None]:
#view model summary
print(model.summary())

In [None]:
# Once again we can create a residual vs. predictor plot for each 
# of the individual predictors using the plot_regress_exog() function 
# from the statsmodels library.

# For example, here’s what the residual vs. predictor plot looks like for the predictor variable assists:
#create residual vs. predictor plot for 'assists'
# fig = plt.figure(figsize=(12,8))
# fig = sm.graphics.plot_regress_exog(model, 'assists', fig=fig)

# create residual vs. predictor plot for the single feature employee_count
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(model, 'region[T.1]', fig=fig)


In [None]:
# # After examining the data, here are the categories I am going to divide the VWA into these "buckets"

# replace_below_0 = list(vwa_counts[vwa_counts < 0].index)
# below 0%
# above 0%
# above 1%
# above 2%
# above 4%
# above 10
# # Determine which values to replace if counts are less than 1000?
# replace_app_type = list(app_type_counts[app_type_counts < 1000].index)

# # Replace in dataframe
# for app in replace_app_type:
#     application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app, "Other")
    
# # Check to make sure binning was successful
# application_df.APPLICATION_TYPE.value_counts()

In [None]:
#stock_df.reset_index(drop=True,inplace=True)

In [None]:
#print(stock_df.index)
#print(encode_df.index)

In [None]:
# I AM COMING UP WITH NO ROWS HERE ???

# Now that our categorical variables have been encoded, 
# they are ready to replace our unencoded categorical 
# variables in our dataset.

# TWO STEP REPLACE: 

# Merge one-hot encoded features 
#new_stock_df = stock_df.merge(encode_df,left_index=True, right_index=True)

# Drop the original stock categories
#new_stock_df = new_stock_df.drop(columns=stock_categories)
#new_stock_df.head()


In [None]:
# create features array
X = stock_df.drop(columns=["volume_weight_avg"]).values
    
# create target
y = stock_df["volume_weight_avg"].values

In [None]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [None]:
# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=100,
n_estimators=100,
learning_rate=.001
)
regressor.fit(X_train, y_train)


In [None]:
# Use staged_predict() method to measures the validation error at each stage of training 
# (i.e. with one tree, with two trees…) to find the optimal number of trees.
errors = [mean_squared_error(y_test, y_pred) for y_pred in 
           regressor.staged_predict(X_test)]
print(errors)

# mean_squared_error: 
# The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, 
# it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered 
# wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). It is 
# bucketting the VWA that works. 


In [None]:
best_n_estimators = np.argmin(errors)

print(best_n_estimators)

In [None]:
# build and fit our model using the optimal number of trees
best_regressor = GradientBoostingRegressor(
     max_depth=10,
     n_estimators=best_n_estimators,
     learning_rate=.01
)

best_regressor.fit(X_train, y_train)

# # Sklearn provides numerous metrics to evaluate 
# # the performance of our machine learning models.
# # They categorize the each metric according 
# # to the problem domain which they’re applicable. 
# # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.

# # We use the mean absolute error 
# # which can be interpreted as 
# # the average distance from 
# # our predictions and the actual values

# # this will give you the value of the stocks for the next period of time
y_pred = best_regressor.predict(X_test)
print(y_pred)
print(X_test)
print(X_train)
print(y_train)
# # this is the how well the model performed (looking for smallest error)
mean_absolute_error(y_test, y_pred)

In [None]:
# should we be using r2_score?
# how do you do residual plots?

In [None]:
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

In [None]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?