In [2]:
# KERNEL: mlenv

# Import dependencies
import warnings
warnings.filterwarnings("ignore")

In [3]:
#dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from config import db_password

In [4]:
# GET Tabled input
# creating database engine
db_name = 'Company_Stocks_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)
# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"view_company_all_star\"", engine);
# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,country_code,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [5]:
# # Data Load (Test_File)
# stock_df = pd.read_csv("../Queries/Testing_Files/company_all_star.csv")
# stock_df.head(30)

In [6]:
stock_df.columns.tolist()

['ticker',
 'date_val',
 'company_name',
 'company_url',
 'employee_count',
 'revenue',
 'sector',
 'city_name',
 'state_name',
 'country_code',
 'latitude',
 'longitude',
 'open_val',
 'high_val',
 'low_val',
 'close_val',
 'volume',
 'volume_weight',
 'number_of_transactions',
 'percent_change']

In [7]:
# check dtypes
stock_df.dtypes

ticker                     object
date_val                   object
company_name               object
company_url                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
country_code               object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [8]:
# training model needs to be "int64" for it to be fit, converted all float64s
# stock_df[[
#     "latitude", 
#     "longitude", 
#     "open_val", 
#     "high_val", 
#     "low_val", 
#     "close_val", 
#     "volume",
#     "volume_weight",
#     "number_of_transactions",
#     "percent_change"]] = stock_df[[
#                                 "latitude", 
#                                 "longitude",
#                                 "open_val", 
#                                 "high_val", 
#                                 "low_val", 
#                                 "close_val", 
#                                 "volume",
#                                 "volume_weight",
#                                 "number_of_transactions",
#                                 "percent_change"
#                                 ]].astype("int")

In [9]:
# verify .astype() changes
stock_df.dtypes

ticker                     object
date_val                   object
company_name               object
company_url                object
employee_count             object
revenue                    object
sector                     object
city_name                  object
state_name                 object
country_code               object
latitude                  float64
longitude                 float64
open_val                  float64
high_val                  float64
low_val                   float64
close_val                 float64
volume                    float64
volume_weight             float64
number_of_transactions    float64
percent_change            float64
dtype: object

In [10]:
stock_df.head(30)

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,country_code,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723
5,AMD,2020-03-19,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,41.51,42.4825,39.58,39.61,106859502.0,41.2299,533411.0,4.57721
6,AMD,2020-03-22,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,40.62,42.32,38.95,41.64,101704663.0,40.7188,493186.0,2.511078
7,AMD,2020-03-23,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,44.04,46.81,43.99,46.22,106794151.0,45.7175,535460.0,4.950045
8,AMD,2020-03-24,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,46.79,47.875,44.425,44.63,93760389.0,46.5954,488900.0,4.616371
9,AMD,2020-03-25,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,US,37.233325,-121.684635,45.78,47.5,45.4,47.5,73915608.0,46.7201,379715.0,3.757099


In [11]:
# need help with which features are to be used and which to drop

# DROP LONG LAT

In [12]:
# Create our features 
X = stock_df.drop("volume_weight", axis=1)
X = pd.get_dummies(X)

# Target 
y = stock_df["volume_weight"]

In [13]:
# One dataset for geolocation and another for stock behaviour? run each through the model?

In [14]:
X.describe()

Unnamed: 0,latitude,longitude,open_val,high_val,low_val,close_val,volume,number_of_transactions,percent_change,ticker_AAPL,...,state_name_WA,state_name_WI,country_code_Argentina,country_code_Australia,country_code_CA,country_code_CH,country_code_CN,country_code_Netherlands,country_code_UK,country_code_US
count,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,...,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0,50874.0
mean,37.556716,-90.835472,297.455243,301.673835,293.013166,297.388236,7683480.0,74908.34,1.522338,0.009907,...,0.069348,0.009907,0.009907,0.009907,0.009907,0.019814,0.009907,0.019814,0.009907,0.910839
std,11.28196,51.996324,480.512346,486.195147,474.296604,480.188942,17281080.0,133412.1,1.581511,0.09904,...,0.254047,0.09904,0.09904,0.09904,0.09904,0.139361,0.09904,0.139361,0.09904,0.284979
min,-34.607568,-122.774024,4.38,4.6,4.11,4.44,69543.0,2794.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37.216678,-121.988571,82.93,84.12,81.543475,82.917,1261210.0,22086.0,0.486388,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,37.560034,-111.940009,155.375,158.0,152.77,155.615,2626642.0,36480.0,1.07412,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,41.875562,-77.035092,302.82,308.075,298.055,302.90125,6403628.0,68993.25,2.017073,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,52.205531,150.998317,3744.0,3773.0782,3696.7929,3731.41,401693400.0,2966979.0,40.454186,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
y.value_counts()

79.7028     2
147.4707    2
75.1563     2
47.3740     2
67.8415     2
           ..
378.2873    1
374.7339    1
365.6679    1
347.4436    1
202.9376    1
Name: volume_weight, Length: 50537, dtype: int64

In [16]:
np.unique(y)

array([   4.4255,    4.517 ,    4.6176, ..., 3720.7017, 3721.5756,
       3722.7632])

In [17]:
# train the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [18]:
# random forest classfier
# n_estimator default at 100
random_forest = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_forest.fit(X_train, y_train)

ValueError: Unknown label type: 'continuous'

In [None]:
# confusion matrix
# this matrix is gigantic
y_pred = random_forest.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
sorted(zip(X.columns, random_forest.feature_importances_), reverse=True)