In [1]:
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from functools import partial

import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

In [2]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"view_company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.tail()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,country_code,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
50869,ZS,2022-03-06,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,US,37.336191,-121.890583,228.37,229.97,204.36,204.37,4379337.0,210.5799,72096.0,10.509261
50870,ZS,2022-03-07,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,US,37.336191,-121.890583,203.5,203.92,190.13,198.63,4389634.0,196.9284,71180.0,2.39312
50871,ZS,2022-03-08,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,US,37.336191,-121.890583,203.84,213.57,199.12,212.35,3050554.0,209.3268,45960.0,4.174843
50872,ZS,2022-03-09,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,US,37.336191,-121.890583,212.13,213.51,204.87,208.41,2305091.0,208.7971,40754.0,1.753642
50873,ZS,2022-03-10,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,US,37.336191,-121.890583,211.02,211.67,200.5,201.14,1893573.0,202.9376,37307.0,4.682021


In [3]:
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

filtered_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop columns that are not features for modeling
filtered_df = filtered_df.drop(columns = ['date_val','company_name', 'company_url', 'latitude', 'longitude', 'open_val', 
                                          'high_val', 'low_val', 'close_val', 'number_of_transactions', 
                                          'volume', 'volume_weight'])

# filtered_df = filtered_df.drop(columns = ['date_val','company_name', 'company_url', 'latitude', 'longitude', 'open_val', 
#                                     'high_val', 'low_val', 'close_val', 'number_of_transactions'])

filtered_df

# complete next week
# get first stock ticker in first row
# prev_ticker = filtered_df.iat[0,0]
#print (prev_ticker)

# i = 0
# new_ticker_flag = 'yes'

# # for rec in filtered_df.iterrows():
# for rec in filtered_df:
#     new_ticker = filtered_df['ticker']
#     new_ticker = new_ticker.iloc[i]
#     print("previous ticker and new ticker: ", prev_ticker, "and", new_ticker)

#     if (prev_ticker == new_ticker):
#         if (new_ticker_flag == 'yes'):
#             new_ticker_flag = 'no'
#             print(new_ticker_flag)
#             begin_vw = filtered_df['volume_weight']
#             begin_vw = begin_vw.iloc[i]
#             print("begin_vw: ", begin_vw)

#         else:
#             last_vw = filtered_df['volume_weight']
#             last_vw = last_vw.iloc[i]
#             print("last_vw: ", last_vw)

            
#     else:
#             print ("new")
#             vw_average = (last_vw/begin_vw) * 100
#             print("vwa: ", vw_average)
#             prev_ticker = new_ticker
# #             new_ticker_flag == 'yes'
#             begin_vw = filtered_df['volume_weight']
#             begin_vw = begin_vw.iloc[i]
            
#     i=i+1

        
    
#     new_ticker = filtered_df['ticker']
#     new_ticker = new_ticker.head(1)
#     print(new_ticker)
#     if prev_ticker == new_ticker: 
#         print ("good")

#creating records containing the differences between the beginning and ending date volume and volume_weight for each stock
# so that we only have one record for each stock. drop volume and weight and percent change (???)

# move beginning volume

# move ending volume

# volume percent change (end/begin) * 100

# move beginning volume weight

# move ending volume weight

# volume weight percent change (end/begin) * 100

#filtered_df

# add the columns to the processing dataframe (creating a new dataframe)
# this new dataframe will have ticket, employee_count, revenue, sector, city_name, 







Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,country_code,percent_change,date
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,US,2.435200,2022-03-08
502,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,US,2.231610,2022-03-09
503,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,US,3.551281,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,San Jose,CA,US,1.593060,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,San Jose,CA,US,1.288567,2022-03-09
...,...,...,...,...,...,...,...,...,...
50368,ZM,1k-5k,100m-200m,Technology,San Jose,CA,US,2.366892,2022-03-09
50369,ZM,1k-5k,100m-200m,Technology,San Jose,CA,US,5.179745,2022-03-10
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,US,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,US,1.753642,2022-03-09


In [4]:
# unique values for each column (getting to know your data)
filtered_df.nunique()

ticker            102
employee_count      4
revenue             6
sector              9
city_name          70
state_name         28
country_code        8
percent_change    306
date                3
dtype: int64

In [5]:
filtered_df.dtypes

ticker             object
employee_count     object
revenue            object
sector             object
city_name          object
state_name         object
country_code       object
percent_change    float64
date               object
dtype: object

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### NOTE: we have to keep our ticker columns (so all this must called within the gradient_boosting_decision_tree_model)

#### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- TICKER, 
- DATE
- EMPLOYEE COUNT
- REVENUE
- SECTOR
- CITY NAME
- STATE NAME
- COUNTRY CODE
- VOLUME 
- VOLUME WEIGHT 
- AVERAGE_VOLUME (calculate average using begin_date/end_date) (???)
- AVERAGE_VOLUME_WEIGHT (calculate average using begin_date/end_date) (???)
- PERCENT CHANGE (% change from close to open)

#### Target is:
- PERCENT CHANGE (and/or) Volume Weight (???)(I think the percent change matters more because percent change yields better 

#### Throw-aways for modeling:
- COMPANY NAME
- COMPANY URL
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS


In [6]:
# drop stock ticker
filtered_df = filtered_df.drop(columns = ['city_name'])
filtered_df.head()

Unnamed: 0,ticker,employee_count,revenue,sector,state_name,country_code,percent_change,date
501,AMD,5k-10k,over-1b,Technology,CA,US,2.4352,2022-03-08
502,AMD,5k-10k,over-1b,Technology,CA,US,2.23161,2022-03-09
503,AMD,5k-10k,over-1b,Technology,CA,US,3.551281,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,CA,US,1.59306,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,CA,US,1.288567,2022-03-09


In [7]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = filtered_df.dtypes[filtered_df.dtypes == "object"].index.tolist()
stock_categories

['ticker',
 'employee_count',
 'revenue',
 'sector',
 'state_name',
 'country_code',
 'date']

In [8]:
# Checking the number of unique values in each column
filtered_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???

ticker            102
employee_count      4
revenue             6
sector              9
state_name         28
country_code        8
date                3
dtype: int64

In [11]:
#creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'team' column 
encoder_df = pd.DataFrame(encoder.fit_transform(filtered_df[['sector']]).toarray())

#merge one-hot encoded columns back with original DataFrame
final_df = filtered_df.join(encoder_df)

#view final df
pd.set_option("max_rows", None)
final_df


Unnamed: 0,ticker,employee_count,revenue,sector,state_name,country_code,percent_change,date,0,1,2,3,4,5,6,7,8
501,AMD,5k-10k,over-1b,Technology,CA,US,2.4352,2022-03-08,,,,,,,,,
502,AMD,5k-10k,over-1b,Technology,CA,US,2.23161,2022-03-09,,,,,,,,,
503,AMD,5k-10k,over-1b,Technology,CA,US,3.551281,2022-03-10,,,,,,,,,
1006,ADBE,over-10k,1m-10m,Technology,CA,US,1.59306,2022-03-08,,,,,,,,,
1007,ADBE,over-10k,1m-10m,Technology,CA,US,1.288567,2022-03-09,,,,,,,,,
1008,ADBE,over-10k,1m-10m,Technology,CA,US,5.295001,2022-03-10,,,,,,,,,
1321,ABNB,5k-10k,200m-1b,Technology,CA,US,1.980334,2022-03-08,,,,,,,,,
1322,ABNB,5k-10k,200m-1b,Technology,CA,US,4.531056,2022-03-09,,,,,,,,,
1323,ABNB,5k-10k,200m-1b,Technology,CA,US,5.832739,2022-03-10,,,,,,,,,
1825,ALGN,over-10k,200m-1b,Technology,AZ,US,2.038225,2022-03-08,,,,,,,,,


In [None]:
# NOTE: Scikit-learn is flexible enough to perform all of the one-hot encodings at the same time.
#       Remember, the only difference from our single variable examples is that we need to pass our 
#       categorical variable list

# Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)
enc=OneHotEncoder(handle_unknown='ignore')
# Fit and transform the OneHotEncoder using the categorical variable list
# encode_stock_df = pd.DataFrame(enc.fit_transform(filtered_df[stock_categories]))
encode_emp_count_df = pd.DataFrame(enc.fit_transform(filtered_df['employee_count']).toarray())
final_df = filtered_df.join(encode_emp_count_df)
final_df.head()


# Add the encoded variable names to the dataframe
# encode_stock_df.columns = enc.get_feature_names(stock_categories)
# pd.set_option('display.max_columns', None)
# encode_stock_df.head()

In [None]:
# I AM COMING UP WITH NO ROWS HERE ???

# Now that our categorical variables have been encoded, 
# they are ready to replace our unencoded categorical 
# variables in our dataset.

# TWO STEP REPLACE: 

# Merge one-hot encoded features 
# filtered_df = filtered_df.merge(encode_stock_df,left_index=True, right_index=True)
filtered_df = filtered_df.join(encode_stock_df)
filtered_df
# Drop the original application categories
# filtered_df = filtered_df.drop(columns=stock_categories)
# filtered_df.head()

In [None]:
# create features array
X = filtered_df.drop(columns=["volume_weight"]).values
    
# create target
y = filtered_df["volume_weight"].values

In [None]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=2,
n_estimators=3,
learning_rate=1.0
)
regressor.fit(X_train, y_train)

In [None]:

    
#      # Use staged_predict() method to measures the validation error at each stage of training 
#      # (i.e. with one tree, with two trees…) to find the optimal number of trees.
#      errors = [mean_squared_error(y_test, y_pred) for y_pred in 
#                regressor.staged_predict(X_test)]

#       best_n_estimators = np.argmin(errors)

#      # build and fit our model using the optimal number of trees
#      best_regressor = GradientBoostingRegressor(
#           max_depth=2,
#           n_estimators=best_n_estimators,
#           learning_rate=1.0
#       )
#       best_regressor.fit(X_train, y_train)
    
# #     # Sklearn provides numerous metrics to evaluate 
# #     # the performance of our machine learning models.
# #     # They categorize the each metric according 
# #     # to the problem domain which they’re applicable. 
# #     # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.
    
# #     # We use the mean absolute error 
# #     # which can be interpreted as 
# #     # the average distance from 
# #     # our predictions and the actual values

        # this will give you the value of the stocks for the next period of time
#       y_pred = best_regressor.predict(X_test)

        # this is the how well the model performed (looking for smallest error)
#       mean_absolute_error(y_test, y_pred)
    
    #------------------------------------------------------------------------------------
#       # you are going to have to take the metric(s) and store them into 
    
# #     # Tomas:  including the adj. R2
# #     https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score
        
# #     from sklearn.metrics import r2_score
#                                   -------------
#       r2_score(y_true, y_pred)
    
# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?
    

In [None]:


# # the other things for bucket
# # # you have to make all the columns a number

# # #prime prev_ticker with first record's ticker value in the datafrome
# prev_ticker = stock_df[ticker] # APPL

# for record in stock_df: 
#         new_ticker = stock_df[ticker] #APPL
        
#         if (new_ticker == prev_ticker):
            
#             prev_ticker = stock_df[ticker] #AMD
#             # move record to processing dataframe 
#             process_df = process_df.append(record, ignore_index=True)
#         else: 
#             # we have all records for given ticker, perform GBDT 
#             def gradient_boosting_decision_tree(processing_df):
        

In [None]:
# for col in stock_df: 
#         print(df['ticker'])

print(stock_df['ticker'].unique())

In [None]:
AMD_df = stock_df.loc[stock_df['ticker'] == 'AMD']
AMD_df.head()

In [None]:
# remove all the columns having the same informaiton in them for a single stock


In [None]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

In [None]:
# NOTE: Scikit-learn is flexible enough to perform all of the one-hot encodings at the same time.
#       Remember, the only difference from our single variable examples is that we need to pass our 
#       categorical variable list

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_stock_df = pd.DataFrame(enc.fit_transform(stock_df[stock_categories]))

# Add the encoded variable names to the dataframe
encode_stock_df.columns = enc.get_feature_names(stock_categories)
encode_stock_df.head()

In [None]:
encode_stock_df.tail()

In [None]:
# # Now that our categorical variables have been encoded, 
# # they are ready to replace our unencoded categorical 
# # variables in our dataset.

# # TWO STEP REPLACE: 

# # Merge one-hot encoded features 
# stock_df = stock_df.merge(encode_stock_df,left_index=True, right_index=True)

# # Drop the original stock categories
# stock_df = stock_df.drop(columns=stock_categories)
# stock_df.head()