In [1]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

#import necessary libraries 
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

In [2]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
#stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
4347,AAPL,2020-03-12,Apple Inc,apple.com,over-10k,over-1b,Technology,Capertino,CA,W,...,36.167269,-115.245979,66.2225,69.9800,63.2375,69.4925,368732128.0,65.7384,938015.0,4.937899
4509,AAPL,2020-11-01,Apple Inc,apple.com,over-10k,over-1b,Technology,Capertino,CA,W,...,36.167269,-115.245979,109.1100,110.6800,107.3200,108.7700,122712099.0,108.6262,865470.0,0.311612
4510,AAPL,2020-11-02,Apple Inc,apple.com,over-10k,over-1b,Technology,Capertino,CA,W,...,36.167269,-115.245979,109.6600,111.4900,108.7300,110.4400,107414082.0,110.4493,640197.0,0.711289
4511,AAPL,2020-11-03,Apple Inc,apple.com,over-10k,over-1b,Technology,Capertino,CA,W,...,36.167269,-115.245979,114.1400,115.5900,112.3500,114.9500,138217782.0,114.6156,846477.0,0.709655
4512,AAPL,2020-11-04,Apple Inc,apple.com,over-10k,over-1b,Technology,Capertino,CA,W,...,36.167269,-115.245979,117.9500,119.6200,116.8686,119.0300,126261074.0,118.4798,762125.0,0.915642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50530,ZS,2020-10-28,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,145.3800,146.5999,140.9300,142.5600,1312013.0,142.8662,16906.0,1.939744
50529,ZS,2020-10-27,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,147.4900,147.8000,142.3000,144.0600,1471772.0,144.1055,21393.0,2.325581
50528,ZS,2020-10-26,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,151.5400,154.5800,147.9510,148.6400,1596067.0,150.6398,23029.0,1.913686
50541,ZS,2020-11-12,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,140.0400,140.3168,131.6700,131.9200,2315499.0,134.6756,30284.0,5.798343


In [3]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-01'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# iteration controls
day_range_of_iter = 3

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)


#stock_df

In [4]:
# check dtypes
#stock_df.dtypes

In [5]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'city_name', 'state_name', 'number_of_transactions', 'percent_change'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
#stock_df

In [6]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)


unique number of days(number of days in df): 8
102
iteration_sets:  6
816


In [7]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
               
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 


Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-01,over-10k,W,over-1b,Technology,AAPL,79724750.0,165.8095
1,US,2022-03-01,5k-10k,W,200m-1b,Technology,ABNB,5028250.0,155.9842
2,US,2022-03-01,over-10k,W,1m-10m,Technology,ADBE,2948450.0,471.9849
3,US,2022-03-01,over-10k,SE,over-1b,Technology,ADI,2578681.0,159.8088
4,US,2022-03-01,over-10k,NE,over-1b,Consumer Discretionary,ADP,1674593.0,206.1608
5,US,2022-03-01,over-10k,W,over-1b,Healthcare,ADSK,1677230.0,215.3437
6,US,2022-03-01,over-10k,MW,over-1b,Energy,AEP,3146513.0,90.9575
7,US,2022-03-01,over-10k,SW,200m-1b,Technology,ALGN,581358.0,494.8399
8,US,2022-03-01,over-10k,W,over-1b,Technology,AMAT,7008727.0,132.4731
9,US,2022-03-01,5k-10k,W,over-1b,Technology,AMD,118054509.0,116.7418


In [8]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-03,over-10k,W,over-1b,Technology,AAPL,83819592.0,163.398
1,US,2022-03-03,5k-10k,W,200m-1b,Technology,ABNB,8397063.0,143.411
2,US,2022-03-03,over-10k,W,1m-10m,Technology,ADBE,2752177.0,452.735
3,US,2022-03-03,over-10k,SE,over-1b,Technology,ADI,2778580.0,157.0311
4,US,2022-03-03,over-10k,NE,over-1b,Consumer Discretionary,ADP,1492278.0,206.8779
5,US,2022-03-03,over-10k,W,over-1b,Healthcare,ADSK,1964126.0,208.8198
6,US,2022-03-03,over-10k,MW,over-1b,Energy,AEP,4675287.0,95.5682
7,US,2022-03-03,over-10k,SW,200m-1b,Technology,ALGN,755752.0,461.5372
8,US,2022-03-03,over-10k,W,over-1b,Technology,AMAT,7098065.0,126.1637
9,US,2022-03-03,5k-10k,W,over-1b,Technology,AMD,100671339.0,109.0619


In [9]:
vwa_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)

In [10]:
# drop fields that will not be used to represent a period of time
vwa_df.drop(columns = ['date_x', 'employee_count_y', 'region_y', 'revenue_y', 'sector_y', 'ticker_y', 'country_code_y', 'date_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,volume_y,volume_weight_y
0,US,over-10k,W,over-1b,Technology,AAPL,79724750.0,165.8095,83819592.0,163.398
1,US,5k-10k,W,200m-1b,Technology,ABNB,5028250.0,155.9842,8397063.0,143.411
2,US,over-10k,W,1m-10m,Technology,ADBE,2948450.0,471.9849,2752177.0,452.735
3,US,over-10k,SE,over-1b,Technology,ADI,2578681.0,159.8088,2778580.0,157.0311
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1674593.0,206.1608,1492278.0,206.8779
5,US,over-10k,W,over-1b,Healthcare,ADSK,1677230.0,215.3437,1964126.0,208.8198
6,US,over-10k,MW,over-1b,Energy,AEP,3146513.0,90.9575,4675287.0,95.5682
7,US,over-10k,SW,200m-1b,Technology,ALGN,581358.0,494.8399,755752.0,461.5372
8,US,over-10k,W,over-1b,Technology,AMAT,7008727.0,132.4731,7098065.0,126.1637
9,US,5k-10k,W,over-1b,Technology,AMD,118054509.0,116.7418,100671339.0,109.0619


In [11]:
vwa_df['vwa'] = 100 - vwa_df['volume_weight_y']/vwa_df['volume_weight_x'] * 100
vwa_df['vw'] = 100 - vwa_df['volume_y']/vwa_df['volume_x'] * 100

# air_quality["london_mg_per_cubic"] = air_quality["station_london"] * 1.882

In [12]:
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,volume_y,volume_weight_y,vwa,vw
0,US,over-10k,W,over-1b,Technology,AAPL,79724750.0,165.8095,83819592.0,163.398,1.45438,-5.136224
1,US,5k-10k,W,200m-1b,Technology,ABNB,5028250.0,155.9842,8397063.0,143.411,8.06056,-66.997723
2,US,over-10k,W,1m-10m,Technology,ADBE,2948450.0,471.9849,2752177.0,452.735,4.078499,6.65682
3,US,over-10k,SE,over-1b,Technology,ADI,2578681.0,159.8088,2778580.0,157.0311,1.73814,-7.751986
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1674593.0,206.1608,1492278.0,206.8779,-0.347835,10.887123
5,US,over-10k,W,over-1b,Healthcare,ADSK,1677230.0,215.3437,1964126.0,208.8198,3.029529,-17.105346
6,US,over-10k,MW,over-1b,Energy,AEP,3146513.0,90.9575,4675287.0,95.5682,-5.069071,-48.586292
7,US,over-10k,SW,200m-1b,Technology,ALGN,581358.0,494.8399,755752.0,461.5372,6.729995,-29.997695
8,US,over-10k,W,over-1b,Technology,AMAT,7008727.0,132.4731,7098065.0,126.1637,4.762778,-1.274668
9,US,5k-10k,W,over-1b,Technology,AMD,118054509.0,116.7418,100671339.0,109.0619,6.578535,14.724698


In [13]:
# # # sort dataframe by date
# # sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])

# # starting record for ending df
# x = (unique_days - iteration_sets) * unique_stocks

# max_records = unique_days * unique_stocks

# end_df = pd.DataFrame()
               
# for rec in next_date_stock_df.iterrows():
#     if x < max_records: 
#         new_end_df = next_date_stock_df.iloc[x]
#         end_df = end_df.append(new_end_df,ignore_index=False)
#     x=x+1

# end_df

In [14]:
# # sort dataframe by date
# end_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])

# # get beginning dataframe records
# j = 0 
# max_records = iteration_sets * unique_stocks

# end_df = pd.DataFrame()

# for iteration_sets in end_date_stock_df.iterrows():
#     if j < max_records: 
#         new_df = end_date_stock_df.iloc[j + day_range_of_iter - 1]
#         end_df = new_df.append(new_df,ignore_index=True)
#     j=j+1
# end_df  


In [15]:
# # date_format = "%Y-%m-%d"
# # a = datetime.strptime(begin_date, date_format)
# # b = datetime.strptime(end_date, date_format)


# # number of days in set is the same as the unique number of days!!!
# # unique days in df
# unique_days = len(pd.unique(stock_df['date']))
# print("unique number of days(number of days in df):", unique_days)

# # unique stocks in df
# unique_stocks = len(pd.unique(stock_df['ticker']))
# print(unique_stocks)

# # unique_days - day_range + 1
# # interation sets
# iteration_sets = (unique_days - day_range_of_iter + 1)
# print("iteration_sets: ", iteration_sets)

# # this lets us know the number of times we are going to have to iterate to get the entire set of VWAs we need 
# #iteration_sets = b - a + timedelta(days=1) - timedelta(days=day_range_of_iter)
# #print (iteration_sets.days) # that's it

# # total records captured
# length_of_df = len(stock_df)
# print(length_of_df)



# # start with the days we want to iterate over 
# # begin_at = 0 for 

# # iteration controls
# #day_range_of_iter = 3

# # how many iterations are there given the number of days and the range of days we want to iterate?




# #day_in_df_as_int = (days_in_df / np.timedelta64(1, 'D')).astype(int)
# #print(day_as_int)
# # divide the number of days in dataframe by the number of stocks
# #div = no_of_stocks/days_in_df
# #print(div)

# # get first stock ticker in first row
# prev_ticker = stock_df.iat[0,0]
# #print (prev_ticker)
# j = 0 
# count = 0




# new_ticker_flag = 'yes'
# process_list=[]

# #print(stock_df.iloc[0])

# # new_df = stock_df.iloc[0]
# # new_df.head()
# date_stock_df = stock_df.sort_values(by=['date'])
# date_stock_df

# j = 0 
# max_records = iteration_sets * unique_stocks 

# # WORKS
# begin_df = pd.DataFrame()
# for iteration_sets in stock_df.iterrows():
#     if j <= max_records:
#         new_df = stock_df.iloc[j]
#         begin_df = begin_df.append(new_df,ignore_index=True)
#     j=j+1
# begin_df.head(60)    

# date_stock_df = stock_df.sort_values(by=['date'])
# date_stock_df

# # j = 0 
# # max_records = iteration_sets * unique_stocks

# # end_df = pd.DataFrame()
# # for iteration_sets in stock_df.iterrows():
# #     if j <= max_records: 
# #         new_df = stock_df.iloc[j + day_range_of_iter - 1]
# #         end_df = end_df.append(new_df,ignore_index=True)
# #     j=j+1
# # end_df.head(60)    



In [16]:


# new_ticker_flag = 'yes'
# process_list=[]

# for iteration_sets in stock_df.iterrows():
#     # get first stock ticker in first row
#     prev_ticker = stock_df.iat[0,j]
#     i = 0
#     if count <= iteration_sets:
#         for rec in stock_df.iterrows():
#             new_ticker = stock_df['ticker'].iloc[i]
#             print("previous ticker and new ticker: ", prev_ticker, "and", new_ticker)

#             if (prev_ticker == new_ticker):
#                 if (new_ticker_flag == 'yes'):

#                     new_ticker_flag = 'no'
#                     begin_vw = stock_df['volume_weight'].iloc[i]

#                 else:

#                     last_vw = stock_df['volume_weight'].iloc[i]
#             else:
#                     vw_average = 100 - (last_vw/begin_vw) * 100
#                     print("vwa: ", vw_average)

#                     # append to process_list
#                     process_list.append(vw_average)

#                     prev_ticker = new_ticker
#                     new_ticker_flag == 'yes'
#                     begin_vw = stock_df['volume_weight'].iloc[i]

#             i=i+1
#         j=j+1
#         count = count + 1

In [17]:
# # get first stock ticker in first row
# prev_ticker = stock_df.iat[0,0]
# print (prev_ticker)

# i = 0
# new_ticker_flag = 'yes'
# process_list=[]

# for rec in stock_df.iterrows():
#     new_ticker = stock_df['ticker'].iloc[i]
#     print("previous ticker and new ticker: ", prev_ticker, "and", new_ticker)

#     if (prev_ticker == new_ticker):
#         if (new_ticker_flag == 'yes'):

#             new_ticker_flag = 'no'
#             begin_vw = stock_df['volume_weight'].iloc[i]
            
#         else:

#             last_vw = stock_df['volume_weight'].iloc[i]
#     else:
#             vw_average = 100 - (last_vw/begin_vw) * 100
#             print("vwa: ", vw_average)
            
#             # append to process_list
#             process_list.append(vw_average)
            
#             prev_ticker = new_ticker
#             new_ticker_flag == 'yes'
#             begin_vw = stock_df['volume_weight'].iloc[i]
            
#     i=i+1
    
# # account for the last record in the dataframe
# vw_average = 100 - (last_vw/begin_vw) * 100
# print("vwa: ", vw_average)       
# process_list.append(vw_average)

In [18]:
# len(process_list)

In [19]:
# process_list

In [20]:
#combine object dataframe with process list into process_df
stock_df = stock_df.drop(['volume', 'volume_weight', 'percent_change','date'], axis=1)
stock_df.columns.tolist()
stock_df = stock_df.drop_duplicates(subset='ticker')
stock_df["volume_weight_avg"] = process_list
stock_df

KeyError: "['percent_change'] not found in axis"

In [None]:
# unique values for each column (getting to know your data)
stock_df.nunique()

In [None]:
stock_df.dtypes

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### NOTE: we have to keep our ticker columns (so all this must called within the gradient_boosting_decision_tree_model)

#### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- TICKER, 
- DATE
- EMPLOYEE COUNT
- REVENUE
- SECTOR
- COUNTRY CODE
- VOLUME 
- VOLUME WEIGHT 
- AVERAGE_VOLUME (calculate average using begin_volumn/end_value) 
- AVERAGE_VOLUME_WEIGHT (calculate average using begin_date/end_date)
- PERCENT CHANGE (% change from close to open)

#### Target is:
- PERCENT CHANGE (and/or) Volume Weight (???)(I think the percent change matters more because percent change yields better 

#### Throw-aways for modeling:
- COMPANY NAME
- COMPANY URL
- CITY NAME
- STATE NAME
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS


In [None]:
# drop stock ticker
# filtered_df = filtered_df.drop(columns = ['city_name'])
# filtered_df.head()
stock_df.drop(columns="ticker", inplace=True)
stock_df.drop(columns="city_name", inplace=True)
stock_df.drop(columns="state_name", inplace=True)

In [None]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

In [None]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???

In [None]:
# I am catagorizing my own shiza from the tiza
# replace stock's employee count string with integer
stock_df.loc[(stock_df['employee_count'] == '5k-10k'), 'employee_count'] = 0
stock_df.loc[(stock_df['employee_count'] == 'over-10k'), 'employee_count'] = 1
stock_df.loc[(stock_df['employee_count'] == '1k-5k'), 'employee_count'] = 2
stock_df.loc[(stock_df['employee_count'] == '500-1k'), 'employee_count'] = 3

# replace stock's revenue string with integer
stock_df.loc[(stock_df['revenue'] == '1m-10m'), 'revenue'] = 0
stock_df.loc[(stock_df['revenue'] == '10m-50m'), 'revenue'] = 1
stock_df.loc[(stock_df['revenue'] == '50m-100m'), 'revenue'] = 2
stock_df.loc[(stock_df['revenue'] == '100m-200m'), 'revenue'] = 3
stock_df.loc[(stock_df['revenue'] == '200m-1b'), 'revenue'] = 4
stock_df.loc[(stock_df['revenue'] == 'over-1b'), 'revenue'] = 5

# replace stock's sector string with integer
stock_df.loc[(stock_df['sector'] == 'Technology'), 'sector'] = 0
stock_df.loc[(stock_df['sector'] == 'Energy'), 'sector'] = 1
stock_df.loc[(stock_df['sector'] == 'Healthcare'), 'sector'] = 2
stock_df.loc[(stock_df['sector'] == 'Consumer Discretionary'), 'sector'] = 3
stock_df.loc[(stock_df['sector'] == 'Industrials'), 'sector'] = 4
stock_df.loc[(stock_df['sector'] == 'Consumer Staples'), 'sector'] = 5
stock_df.loc[(stock_df['sector'] == 'Communication Services'), 'sector'] = 6
stock_df.loc[(stock_df['sector'] == 'Financials'), 'sector'] = 7
stock_df.loc[(stock_df['sector'] == 'Utilities'), 'sector'] = 8

# replace stock's country code string with integer (Note: China was CN and CH for some reason)
stock_df.loc[(stock_df['country_code'] == 'US'), 'country_code'] = 0
stock_df.loc[(stock_df['country_code'] == 'Netherlands'), 'country_code'] = 1
stock_df.loc[(stock_df['country_code'] == 'Australia'), 'country_code'] = 2
stock_df.loc[(stock_df['country_code'] == 'UK'), 'country_code'] = 3
stock_df.loc[(stock_df['country_code'] == 'CH'), 'country_code'] = 4
stock_df.loc[(stock_df['country_code'] == 'CN'), 'country_code'] = 4
stock_df.loc[(stock_df['country_code'] == 'CA'), 'country_code'] = 5
stock_df.loc[(stock_df['country_code'] == 'Argentina'), 'country_code'] = 6

# replace stock's region string with integer 
stock_df.loc[(stock_df['region'] == 'W'), 'region'] = 0
stock_df.loc[(stock_df['region'] == 'MW'), 'region'] = 1
stock_df.loc[(stock_df['region'] == 'SW'), 'region'] = 2
stock_df.loc[(stock_df['region'] == 'NW'), 'region'] = 3
stock_df.loc[(stock_df['region'] == 'SE'), 'region'] = 4
stock_df.loc[(stock_df['region'] == 'NL'), 'region'] = 5
stock_df.loc[(stock_df['region'] == 'AU'), 'region'] = 6
stock_df.loc[(stock_df['region'] == 'NE'), 'region'] = 7
stock_df.loc[(stock_df['region'] == 'GB'), 'region'] = 8
stock_df.loc[(stock_df['region'] == 'CH'), 'region'] = 9
stock_df.loc[(stock_df['region'] == 'CA'), 'region'] = 10


#create buckets for vwa
stock_df.loc[(stock_df['volume_weight_avg'] < 0), 'volume_weight_avg'] = 0
stock_df.loc[(stock_df['volume_weight_avg'] > 0) & (stock_df['volume_weight_avg'] <= 1), 'volume_weight_avg'] = 1
stock_df.loc[(stock_df['volume_weight_avg'] > 1) & (stock_df['volume_weight_avg'] <= 2), 'volume_weight_avg'] = 2
stock_df.loc[(stock_df['volume_weight_avg'] > 2) & (stock_df['volume_weight_avg'] <= 3), 'volume_weight_avg'] = 3
stock_df.loc[(stock_df['volume_weight_avg'] > 3) & (stock_df['volume_weight_avg'] <= 4), 'volume_weight_avg'] = 4
stock_df.loc[(stock_df['volume_weight_avg'] > 4) & (stock_df['volume_weight_avg'] <= 5), 'volume_weight_avg'] = 5
stock_df.loc[(stock_df['volume_weight_avg'] > 5) & (stock_df['volume_weight_avg'] <= 6), 'volume_weight_avg'] = 6
# stock_df.loc[(stock_df['volume_weight_avg'] > 6) & (stock_df['volume_weight_avg'] <= 7), 'volume_weight_avg'] = 7
# stock_df.loc[(stock_df['volume_weight_avg'] > 7) & (stock_df['volume_weight_avg'] <= 8), 'volume_weight_avg'] = 8
# stock_df.loc[(stock_df['volume_weight_avg'] > 8) & (stock_df['volume_weight_avg'] <= 9), 'volume_weight_avg'] = 9
# stock_df.loc[(stock_df['volume_weight_avg'] > 9) & (stock_df['volume_weight_avg'] <= 10), 'volume_weight_avg'] = 10
stock_df.loc[(stock_df['volume_weight_avg'] > 6), 'volume_weight_avg'] = 7

stock_df

In [None]:
# using: obj_df["body_style"].astype('category').cat.codes to define values for each feature

# stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes
# stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes
# stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
# #stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# # stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
# stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes
# stock_df['region'] = stock_df['region'].astype('category').cat.codes

# stock_df

In [None]:
# # NOTE: Scikit-learn is flexible enough to perform all of the one-hot encodings at the same time.
# #       Remember, the only difference from our single variable examples is that we need to pass our 
# #       categorical variable list

# # Create a OneHotEncoder instance
# enc = OneHotEncoder(sparse=False)

# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(enc.fit_transform(stock_df[stock_categories]))

# # Add the encoded variable names to the dataframe
# encode_df.columns = enc.get_feature_names(stock_categories)
# encode_df.head()

# NOTE: OneHotEncoder was giving us mean_squared_error of around 32.00. So, we removed it and categorical codes instead.
#       Now the mean_squared_error is around 25.00. I have tried to remove city. This didn't work so I added region and 
#       removed city and state. I am still getting the same mean_squared_error. This didn't help.

In [None]:
# Check volumne weight average buckets
vwa_counts = stock_df['volume_weight_avg'].value_counts()
vwa_counts

In [None]:
# ['employee_count', 'revenue', 'sector', 'region', 'country_code']
# # using employee_count, revenue, sector, region, country_code and variable weighted average 
# # as the predictor variable and rating as the response variable
# #fit multiple linear regression model
# model = ols('rating ~ assists + rebounds', data=df).fit()

model = ols('volume_weight_avg ~ employee_count + revenue + sector + region + country_code', data = stock_df).fit()

In [None]:
#view model summary
print(model.summary())

In [None]:
# Once again we can create a residual vs. predictor plot for each 
# of the individual predictors using the plot_regress_exog() function 
# from the statsmodels library.

# For example, here’s what the residual vs. predictor plot looks like for the predictor variable assists:
#create residual vs. predictor plot for 'assists'
# fig = plt.figure(figsize=(12,8))
# fig = sm.graphics.plot_regress_exog(model, 'assists', fig=fig)

# create residual vs. predictor plot for the single feature employee_count
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(model, 'region[T.1]', fig=fig)


In [None]:
# when you create a single record for each stocks VWA you have to reste the index OR 
# your merge/join of stock encodes and stock dataframe will not work
#stock_df.reset_index(drop=True,inplace=True)

In [None]:
#print(stock_df.index)
#print(encode_df.index)

In [None]:
# I AM COMING UP WITH NO ROWS HERE ???

# Now that our categorical variables have been encoded, 
# they are ready to replace our unencoded categorical 
# variables in our dataset.

# TWO STEP REPLACE: 

# Merge one-hot encoded features 
#new_stock_df = stock_df.merge(encode_df,left_index=True, right_index=True)

# Drop the original stock categories
#new_stock_df = new_stock_df.drop(columns=stock_categories)
#new_stock_df.head()


In [None]:
# create features array
X = stock_df.drop(columns=["volume_weight_avg"]).values
    
# create target
y = stock_df["volume_weight_avg"].values

In [None]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [None]:
# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=16,
n_estimators=100,
learning_rate=.01
)
regressor.fit(X_train, y_train)


In [None]:
# Use staged_predict() method to measures the validation error at each stage of training 
# (i.e. with one tree, with two trees…) to find the optimal number of trees.
errors = [mean_squared_error(y_test, y_pred) for y_pred in 
           regressor.staged_predict(X_test)]
print(errors)

# mean_squared_error: 
# The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, 
# it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered 
# wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). It is 
# bucketting the VWA that works. 


In [None]:
# best_n_estimators = np.argmin(errors) + 1
best_n_estimators = np.argmin(errors)

print(best_n_estimators)

In [None]:
# build and fit our model using the optimal number of trees
best_regressor = GradientBoostingRegressor(
     max_depth=10,
     n_estimators=best_n_estimators,
     learning_rate=.01
)

best_regressor.fit(X_train, y_train)

# # Sklearn provides numerous metrics to evaluate 
# # the performance of our machine learning models.
# # They categorize the each metric according 
# # to the problem domain which they’re applicable. 
# # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.

# # We use the mean absolute error 
# # which can be interpreted as 
# # the average distance from 
# # our predictions and the actual values

# # this will give you the value of the stocks for the next period of time
y_pred = best_regressor.predict(X_test)

print(X_train)
print(y_train)

print(X_test)
print(y_pred)

# # this is the how well the model performed (looking for smallest error)
mean_absolute_error(y_test, y_pred)

In [None]:
# should we be using r2_score?
# how do you do residual plots?

In [None]:
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

In [None]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?