![StatModels](https://www.durhamtech.edu/themes/custom/durhamtech/images/durham-tech-logo-web.svg) 

## Applications - Statistical Modeling

This lecture provides foundational knowledge and examples of machine learning modeling concepts by examining stock price data.

---

# Table of Contents

### Jupyter Overview
#### <a href='#1'>Useful Links</a>
#### <a href='#2'>Introduction to Jupyter Notebooks</a>
#### <a href='#3'>Cell Types</a>
* Markdown 
* Code
    1. Running One Cell
    2. Other Run Options

#### <a href='#4'>Tips and Tricks</a>

#### <a href='#55'>Weekly Readings/Videos</a>
#### <a href='#56'>Extra Practice</a>

## Introduction

![FunnyML](https://www.meme-arsenal.com/memes/11f11b5d16eef661677e4c9e989dd2b3.jpg) 

## Data Sources

1. Weather: https://www.weather.gov/wrh/climate?wfo=okx
2. SP 500 Components: https://datahub.io/core/s-and-p-500-companies
3. SP 500 Company Info: https://en.wikipedia.org/wiki/List_of_S%26P_500_companies
4. FRED: https://fred.stlouisfed.org/docs/api/fred/series.html
5. TD Ameritrade Data Dictionary: https://developer.tdameritrade.com/content/streaming-data#_Toc504640567



In [None]:
# https://pypi.org/project/yahoo-finance-api2/
# https://github.com/pkout/yahoo_finance_api2

# Uncomment below if you don't have yahoo finance api installed
# pip install yahoo_finance_api2

In [None]:
import requests
import pandas as pd
from pandas.io.json import json_normalize
import time
import math

from yahoo_finance_api2 import share
from yahoo_finance_api2.exceptions import YahooFinanceError

import warnings
import matplotlib.pyplot as plt
import numpy as np
import sklearn

import json
import datetime

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# API key handed as `key` to the TD Ameritrade helper calls later in this file.
# NOTE(review): the credential is hard-coded in source; consider loading it
# from an environment variable or an untracked config file.
key = 'RGOLSJPSTGVAN4NTN4DLWJE71SU7SIH0'

In [None]:
# view file contents
%ls

In [None]:
# Load the ticker list from a local CSV; presumably the S&P 500 constituents
# file named in the Data Sources section — verify. It must contain a 'Symbol'
# column, which the later merge joins on.
tickers=pd.read_csv("constituents_csv.csv")
print(len(tickers))  # number of rows loaded
tickers.head()  # preview the first rows

In [None]:
# Load per-company details from a local CSV; presumably the S&P 500 company
# info table named in the Data Sources section — verify. It must contain a
# 'Symbol' column, which the later merge joins on.
ticker_info=pd.read_csv("sp500_info.csv")
print(len(ticker_info))  # number of rows loaded
ticker_info.head()  # preview the first rows

In [None]:
# Attach the company details to the ticker list. The inner join keeps only
# symbols present in both tables, so the printed length can shrink.
tickers=pd.merge(tickers,ticker_info,on='Symbol',how='inner')
print(len(tickers))  # rows remaining after the join
tickers.head()  # preview the merged table

In [None]:
# The details were merged into `tickers`; drop the standalone frame.
del ticker_info

In [None]:
def get_td_price_hist(ticker,period,key,row_count='Blank'):
    """Fetch daily price history for one ticker from the TD Ameritrade API.

    Parameters
    ----------
    ticker : str
        Symbol inserted into the price-history endpoint URL.
    period : int
        Number of periods requested; sent together with ``periodType='year'``.
    key : str
        API key sent as the ``apikey`` query parameter.
    row_count : int or 'Blank', optional
        When it is not the sentinel ``'Blank'``, only the last ``row_count``
        rows (the most recent dates) are returned.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ascending date, with the raw ``datetime`` column
        replaced by a ``Date`` column and a ``ticker`` column added. An empty
        DataFrame is returned when the request fails or the response cannot
        be turned into a table.
    """
    # Pause before every call; presumably to stay under the API rate limit.
    time.sleep(1)
    endpoint = 'https://api.tdameritrade.com/v1/marketdata/'+ticker+'/pricehistory'

    payload = {'apikey': key,
               'periodType': 'year',
               'period': period,
               'frequencyType': 'daily'}

    try:
        content = requests.get(url=endpoint, params=payload)
    except requests.exceptions.RequestException:
        # Without a response there is nothing to parse; bail out instead of
        # falling through to an undefined `content`.
        print('API error, please review.')
        return pd.DataFrame()

    try:
        data = content.json()
        # The table is built from the value of the first key in the response.
        dictlist = [[name, value] for name, value in data.items()]
        hist_data = pd.DataFrame(dictlist[0][1])
        # The 'datetime' field is parsed as epoch milliseconds.
        hist_data['datetime'] = pd.to_datetime(hist_data['datetime'],unit='ms')
        hist_data = hist_data.sort_values(by=['datetime'],ascending=True).reset_index(drop=True)
        hist_data['Date'] = hist_data['datetime'].dt.date
        hist_data = hist_data.drop(['datetime'],axis=1)
        hist_data['ticker'] = ticker
        if row_count != 'Blank':
            return hist_data.tail(row_count)
        return hist_data
    except (ValueError, KeyError, TypeError, AttributeError, IndexError):
        # Error payloads (e.g. a plain message instead of a list of rows)
        # end up here; keep the best-effort contract of an empty frame.
        print('running except clause')
        return pd.DataFrame()
    
def get_fundamental_from_td(ticker,key):
    """Fetch the 'fundamental' projection for one symbol from TD Ameritrade.

    Parameters
    ----------
    ticker : str
        Symbol sent as the ``symbol`` query parameter.
    key : str
        API key sent as the ``apikey`` query parameter.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The first row of the transposed table built from the first value in
        the JSON response. An empty DataFrame is returned when the request
        fails or the response cannot be turned into a table.
    """
    # Pause before every call; presumably to stay under the API rate limit.
    time.sleep(1)
    endpoint = 'https://api.tdameritrade.com/v1/instruments'
    projection = 'fundamental'

    payload = {'apikey': key,
               'symbol': ticker,
               'projection': projection,
               }

    try:
        content = requests.get(url=endpoint, params=payload)
    except requests.exceptions.RequestException:
        # Without a response there is nothing to parse; bail out instead of
        # falling through to an undefined `content`.
        print('API error, please review.')
        return pd.DataFrame()

    dictlist = []
    try:
        data = content.json()
        # Keep the [key, value] pairs so the failure branch can show them.
        dictlist = [[name, value] for name, value in data.items()]
        return pd.DataFrame(dictlist[0][1]).T.reset_index(drop=True).iloc[0]
    except (ValueError, KeyError, TypeError, AttributeError, IndexError):
        print(dictlist)
        print(ticker + " not valid.")
        return pd.DataFrame()
    
def get_yahoo_history(share_name):
    """Download daily price history for a symbol through yahoo_finance_api2.

    Parameters
    ----------
    share_name : str
        Symbol to download. A trailing ``'2'`` is stripped before querying,
        but the unmodified name (upper-cased) is written to the ``ticker``
        column.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ascending date; the raw ``timestamp`` column is
        replaced by a ``Date`` column and a ``ticker`` column is added.
        An empty DataFrame is returned when the API yields no data.

    Raises
    ------
    SystemExit
        With exit code 1 when the library raises ``YahooFinanceError``.
    """
    print("Pulling history")

    # NOTE(review): the reason for the trailing-'2' convention is not visible
    # in this file; the behaviour is preserved as-is.
    query_symbol = share_name[:-1] if share_name[-1]=='2' else share_name
    my_share = share.Share(query_symbol)

    try:
        symbol_data = my_share.get_historical(share.PERIOD_TYPE_YEAR,
                                              30000,
                                              share.FREQUENCY_TYPE_DAY,
                                              1)
    except YahooFinanceError as e:
        print(e.message)
        # Same effect as sys.exit(1), without needing `sys` in scope.
        raise SystemExit(1) from e

    if not symbol_data:
        # Nothing came back; return an empty frame rather than failing on
        # the missing 'timestamp' column below.
        print("No history returned for " + share_name)
        return pd.DataFrame()

    df = pd.DataFrame(symbol_data)
    # Timestamps are epoch milliseconds; parse them directly instead of
    # trimming digits from their string form.
    df['Date'] = pd.to_datetime(df['timestamp'],unit='ms')
    df = df.sort_values(by=['Date'],ascending=True)
    df['Date'] = df['Date'].dt.date
    df = df.drop(['timestamp'],axis=1)
    df['ticker'] = share_name.upper()
    return df

In [None]:
# Symbol used by the price-history pulls that follow. The commented line is the
# alternative of picking a symbol from the merged table.
# NOTE(review): '^GSPC' is presumably Yahoo's S&P 500 index symbol — verify.
#ticker=tickers['Symbol'][1]
ticker='^GSPC'
ticker  # echo the chosen symbol

In [None]:
# Show the row at position 1 of the merged ticker table as a one-row DataFrame.
tickers.iloc[[1]]

In [None]:
# Fetch the fundamentals for 'AOS' and transpose so they display as one row.
pd.DataFrame(get_fundamental_from_td('AOS',key)).T

In [None]:
# Pull one period ('year' period type) of daily TD Ameritrade history for the
# selected symbol. The commented variant limits the result to the last 43 rows.
print(ticker)
#td_data=get_td_price_hist(ticker,1,key,43)
td_data=get_td_price_hist(ticker,1,key)
print(len(td_data))  # number of rows returned (0 when the helper failed)
td_data.tail()  # most recent rows

In [None]:
# Pull the daily Yahoo history for the selected symbol; this frame is the
# base table that the later cells enrich with features.
yahoo_data=get_yahoo_history(ticker)
print(len(yahoo_data))  # number of daily rows
yahoo_data

In [None]:
#pd.merge(yahoo_data,td_data,on=['Date'],how='outer',indicator=True)
#pd.merge(yahoo_data.tail(10),td_data.tail(10),on=['Date'],how='outer',indicator=True).describe()

In [None]:
# Load the temperature and precipitation workbooks. Presumably NYC monthly
# values from the Weather link in Data Sources, one row per year and one column
# per month — verify against the files.
temp=pd.read_excel('nyc_temp.xlsx')
prec=pd.read_excel('nyc_precip.xlsx')
temp  # preview the wide table

In [None]:
# Reshape both tables from wide to long: blank strings become NaN, 'Year'
# becomes the index, and stack() turns every remaining column into a row.
# The stacked column labels land in 'level_1', which is renamed to 'Month';
# the values go into 'prec' / 'temp'.
prec=prec.replace('', np.nan).set_index('Year').stack().reset_index(name='prec').rename(columns={"level_1": "Month"})
temp=temp.replace('', np.nan).set_index('Year').stack().reset_index(name='temp').rename(columns={"level_1": "Month"})
temp  # preview the long table

In [None]:
# https://stackoverflow.com/questions/42684530/convert-a-column-in-a-python-pandas-from-string-month-into-int
from calendar import month_abbr

# month_abbr[0] is the empty string, so a name's position in this list is
# its month number (jan -> 1, ..., dec -> 12).
lower_ma = [m.lower() for m in month_abbr]

# Convert month abbreviations to month numbers; a label that is not a month
# abbreviation makes list.index raise ValueError.
prec['Month']=prec['Month'].str.lower().map(lambda m: lower_ma.index(m)).astype('Int8')
temp['Month']=temp['Month'].str.lower().map(lambda m: lower_ma.index(m)).astype('Int8')


# Calendar keys derived from each trading date; 'month'/'year' join the
# weather tables here, and 'quarter' is kept for the later GDP join.
yahoo_data['month']=pd.to_datetime(yahoo_data.Date).dt.month
yahoo_data['year']=pd.to_datetime(yahoo_data.Date).dt.year
yahoo_data['quarter']=pd.to_datetime(yahoo_data.Date).dt.quarter
# Left joins keep every trading day even when no weather value exists; the
# right-hand key columns are dropped after each join to avoid duplicates.
yahoo_data=pd.merge(yahoo_data,prec,left_on=['month','year'],right_on=['Month','Year'],how='left')
yahoo_data=yahoo_data.drop(['Month','Year'],axis=1)
yahoo_data=pd.merge(yahoo_data,temp,left_on=['month','year'],right_on=['Month','Year'],how='left')
yahoo_data=yahoo_data.drop(['Month','Year'],axis=1)
yahoo_data

In [None]:
# List rows whose precipitation or temperature equals the string 'M'.
# NOTE(review): 'M' presumably marks a missing reading in the source data — confirm.
yahoo_data[(yahoo_data['prec']=='M')|(yahoo_data['temp']=='M')]

In [None]:
# Local CSV exports with a 'DATE' column; presumably FRED series (see Data
# Sources) — verify. DFF is joined by exact date, UNRATE by month/year.
dff=pd.read_csv('DFF.csv')

unrate=pd.read_csv('UNRATE.csv')
unrate['month']=pd.to_datetime(unrate.DATE).dt.month
unrate['year']=pd.to_datetime(unrate.DATE).dt.year
unrate.drop('DATE',axis=1,inplace=True)

# GDP is prepared with quarter/year keys, but its merge below is commented
# out, so `gdp` is currently unused.
gdp=pd.read_csv('GDPC1.csv')
gdp['quarter']=pd.to_datetime(gdp.DATE).dt.quarter
gdp['year']=pd.to_datetime(gdp.DATE).dt.year
gdp.drop('DATE',axis=1,inplace=True)

# Joining on computed date arrays (rather than column names) adds a 'key_0'
# column to the result; it is dropped below. Inner joins discard trading
# days without a matching DFF / UNRATE observation.
yahoo_data=pd.merge(yahoo_data,dff,left_on=pd.to_datetime(yahoo_data.Date),right_on=pd.to_datetime(dff.DATE),how='inner')
yahoo_data=pd.merge(yahoo_data,unrate,on=['month','year'],how='inner')
#yahoo_data=pd.merge(yahoo_data,gdp,on=['quarter','year'],how='inner')
# Remove the join keys now that the joins are done.
yahoo_data.drop(['key_0','DATE','month','quarter','year'],axis=1,inplace=True)
yahoo_data

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html
# https://stackoverflow.com/questions/61319814/moving-average-in-pandas-issue-with-first-and-last-rows

# 1 year
# Window length in rows (trading days), also reused as the forecast horizon.
days_out=252

# Rolling mean / standard deviation over the window, shifted down one row so
# each row only sees values from the rows before it.
yahoo_data['volume_moving'] = yahoo_data['volume'].rolling(days_out).mean().shift(periods=1)
yahoo_data['volume_moving_std']=yahoo_data['volume'].rolling(days_out).std().shift(periods=1)
yahoo_data['close_moving'] = yahoo_data['close'].rolling(days_out).mean().shift(periods=1)
yahoo_data['close_moving_std']=yahoo_data['close'].rolling(days_out).std().shift(periods=1)

# https://stackoverflow.com/questions/42138357/pandas-rolling-slope-calculation

def calc_slope(x):
    """Return the slope of the least-squares straight line through ``x``.

    The values are fitted against their positions 0, 1, ..., len(x) - 1, so
    the result is the average change per step.
    """
    positions = np.arange(len(x))
    # A degree-1 polyfit returns [slope, intercept].
    coefficients = np.polyfit(positions, x, 1)
    return coefficients[0]

# Trend of volume and close over the trailing window, shifted down one row so
# each row only sees values from the rows before it.
yahoo_data['volume_slope'] = yahoo_data['volume'].rolling(days_out).apply(calc_slope).shift(periods=1)
yahoo_data['close_slope'] = yahoo_data['close'].rolling(days_out).apply(calc_slope).shift(periods=1)

# Prediction target: the close `days_out` rows ahead of the current row.
yahoo_data['close_future'] = yahoo_data['close'].shift(periods=-days_out)

# Drop the raw columns not used as model inputs, then drop every row with a
# NaN (incomplete rolling window at the start, no future close at the end).
# Note that the surviving rows keep their original index labels.
yahoo_data.drop(columns=['high','low','volume','ticker','Date'],inplace=True)
yahoo_data.dropna(inplace=True)
yahoo_data

## Machine Learning

![FunnyReg](https://memegenerator.net/img/instances/49880835.jpg)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

def data_split(df,y_var,scale=False,train_frac=0.8):
    """Split a frame into ordered train/test features and targets.

    The split is positional, not random: the first ``train_frac`` of the rows
    become the training set and the remaining rows the test set, so row order
    is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column. It is not modified.
    y_var : str
        Name of the target column.
    scale : bool, optional
        When True, a StandardScaler is fitted on the training features and
        applied to both feature sets, which are then returned as NumPy arrays
        instead of DataFrames.
    train_frac : float, optional
        Fraction of rows assigned to the training set (default 0.8).

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test, colz)`` where ``colz`` holds the
        feature column names (still available when scaling discards them).
    """
    split_at = int(len(df)*train_frac)

    # Slice by position. The previous label-based lookup mis-sized the test set
    # whenever the index was not 0..n-1 (e.g. after dropna), and otherwise
    # reused the last training row as the first test row.
    # The copies keep pop() from touching the caller's frame.
    x_train = df.iloc[:split_at].copy()
    x_test = df.iloc[split_at:].copy()
    y_train = x_train.pop(y_var)
    y_test = x_test.pop(y_var)

    colz=x_train.columns

    if scale:
        # Fit on the training features only so no test statistics leak in.
        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test, colz

def regression(x_train, x_test, y_train, y_test, colz):
    """Fit an ordinary least-squares model and print a short report.

    The report lists the train/test sizes, one coefficient per name in
    ``colz``, the intercept, and the R^2 score on both sets. Nothing is
    returned.
    """
    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    print("Number of training records:", len(y_train))
    print("Number of testing records:",len(y_test))
    print("\nLinear Regression Results")

    # Pair each feature name with its fitted weight.
    print('\nCoefficients:')
    for feature_name, weight in zip(colz, model.coef_):
        print(feature_name, weight)

    print('\nIntercept:', model.intercept_)
    train_r2 = model.score(x_train, y_train)
    test_r2 = r2_score(y_test, predictions)
    print('\nLinear Regression R^2 score on training data: %.4f' % train_r2)
    print('Linear Regression R^2 score on test data: %.4f' % test_r2)
    
def random_forest(x_train, x_test, y_train, y_test, colz, cat=False, est=10):
    """Fit a random forest, print its scores and plot feature importances.

    Parameters
    ----------
    x_train, x_test : DataFrame or ndarray
        Feature sets.
    y_train, y_test : array-like
        Targets.
    colz : sequence of str
        Feature names, in column order; used to label the importance plot.
    cat : bool, optional
        True fits a RandomForestClassifier (categorical target); False fits
        a RandomForestRegressor (continuous target).
    est : int, optional
        Number of trees (``n_estimators``).

    Notes
    -----
    The printed "accuracy" is the estimator's ``score``: mean accuracy for
    the classifier, R^2 for the regressor. Nothing is returned.
    """
    model_cls = RandomForestClassifier if cat else RandomForestRegressor
    model = model_cls(n_estimators=est)

    model.fit(x_train, y_train)
    train_acc = model.score(x_train, y_train)
    test_acc = model.score(x_test, y_test)

    print('Random Forest Results:')

    print('Training acuracy= ',train_acc)
    print('Test accuracy= ',test_acc)

    # Take names from `colz`, not x_train.columns: the scaled split hands in
    # plain arrays, which have no column labels.
    features = list(colz)
    importances = model.feature_importances_
    indices = np.argsort(importances)  # ascending, so the top bar is the largest

    plt.subplots(figsize=(15, 11))
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()

def cart(x_train, x_test, y_train, y_test, colz, cat=False):
    """Fit a single decision tree and print its train/test scores.

    ``cat=True`` uses a DecisionTreeClassifier, otherwise a
    DecisionTreeRegressor; both are seeded with ``random_state=12``.
    ``colz`` is accepted for signature parity with the other model helpers
    and is not used. Nothing is returned.
    """
    tree_cls = DecisionTreeClassifier if cat else DecisionTreeRegressor
    tree = tree_cls(random_state=12).fit(x_train, y_train)

    scores = (tree.score(x_train, y_train), tree.score(x_test, y_test))

    # Predictions are computed but not part of the printed report.
    y_pred = tree.predict(x_test)

    print('CART Results:')

    print('CART training acuracy= ',scores[0])
    print('CART test accuracy= ',scores[1])
    
def lasso(x_train, x_test, y_train, y_test, colz):
    """Fit a Lasso regression (alpha=0.25) and print its train/test scores.

    ``colz`` is accepted for signature parity with the other model helpers
    and is not used. Nothing is returned.
    """
    model = linear_model.Lasso(alpha=0.25).fit(x_train, y_train)

    # Predictions are computed but not part of the printed report.
    y_pred = model.predict(x_test)

    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)

    print('Lasso Regression Results:')
    print('Training acuracy =',train_score)
    print('Test accuracy =',test_score)

In [None]:
# Ordered, unscaled train/test split with 'close_future' as the target.
x_train, x_test, y_train, y_test, colz = data_split(yahoo_data,'close_future')

In [None]:
# Defaults apply: regressor (cat=False) with 10 trees.
random_forest(x_train, x_test, y_train, y_test, colz)

In [None]:
# Linear-regression baseline on the same split.
regression(x_train, x_test, y_train, y_test, colz)

In [None]:
# Lasso regression on the same split.
lasso(x_train, x_test, y_train, y_test, colz)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# https://towardsdatascience.com/what-the-hell-is-perceptron-626217814f53
def perceptron(x_train, x_test, y_train, y_test, colz):
    """Fit a Perceptron (max_iter=13) and print its train/test accuracy.

    ``colz`` is accepted for signature parity with the other model helpers
    and is not used. Nothing is returned.
    """
    clf = Perceptron(max_iter=13).fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    test_score = clf.score(x_test, y_test)
    print ('perceptron training acuracy= ',train_score)
    print('perceptron test accuracy= ',test_score)
    
def logreg(x_train, x_test, y_train, y_test, colz):
    """Fit a default LogisticRegression and print its train/test accuracy.

    ``colz`` is accepted for signature parity with the other model helpers
    and is not used. Nothing is returned.
    """
    clf = LogisticRegression().fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    test_score = clf.score(x_test, y_test)
    print ('logreg training acuracy= ',train_score)
    print('logreg test accuracy= ',test_score)
    
def svm(x_train, x_test, y_train, y_test, colz):
    """Fit a default SVC and print its train/test accuracy.

    ``colz`` is accepted for signature parity with the other model helpers
    and is not used. Nothing is returned.
    """
    print("SVM results:")

    clf = SVC().fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    test_score = clf.score(x_test, y_test)
    print ('SVM training acuracy= ',train_score)
    print('SVM test accuracy= ',test_score)
    
def knn(x_train, x_test, y_train, y_test, colz,neighbors=3):
    """Fit a k-nearest-neighbours classifier and print train/test accuracy.

    ``neighbors`` is passed as ``n_neighbors``. ``colz`` is accepted for
    signature parity with the other model helpers and is not used. Nothing
    is returned.
    """
    clf = KNeighborsClassifier(n_neighbors=neighbors).fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    test_score = clf.score(x_test, y_test)
    print ('KNN training acuracy= ',train_score)
    print('KNN test accuracy= ',test_score)

In [None]:
# https://www.investopedia.com/ask/answers/042415/what-average-annual-return-sp-500.asp
# Binary label: 1 when the return from `close` to `close_future` exceeds 8%,
# else 0. The 8% threshold presumably reflects the average annual S&P 500
# return discussed at the link above — verify.
yahoo_data['best']=np.where(((yahoo_data.close_future-yahoo_data.close)/yahoo_data.close) > .08, 1, 0)
#yahoo_data['close_moving']=pd.cut(yahoo_data['close_moving'],3,labels=[0,1,2])
# Remove the two columns the label was computed from so they cannot be used
# as features.
yahoo_data.drop(columns=['close','close_future'],inplace=True)
yahoo_data

In [None]:
# Class balance of the binary label.
yahoo_data['best'].value_counts()

In [None]:
# Ordered, unscaled train/test split with the binary 'best' label as the target.
x_train, x_test, y_train, y_test, colz = data_split(yahoo_data,'best')

In [None]:
# Perceptron on the 'best' label split.
perceptron(x_train, x_test, y_train, y_test, colz)

In [None]:
# Logistic regression on the 'best' label split.
logreg(x_train, x_test, y_train, y_test, colz)

In [None]:
# The trailing True sets cat=True, selecting the decision-tree classifier.
cart(x_train, x_test, y_train, y_test, colz, True)

In [None]:
# Support-vector classifier on the 'best' label split.
svm(x_train, x_test, y_train, y_test, colz)

In [None]:
# The trailing 4 overrides the default of 3 neighbours.
knn(x_train, x_test, y_train, y_test, colz,4)

In [None]:
# Positional extras: cat=True (classifier) and est=5 (five trees).
random_forest(x_train, x_test, y_train, y_test, colz, True, 5)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Initialize model constructor
def neural_netter(x_train, x_test, y_train, y_test, colz):
    """Train a small dense network classifier and report test metrics.

    The network is 500 -> 250 ReLU units followed by a softmax layer with
    one unit per class. It trains for 20 epochs (batch size 30, 20% of the
    training data held out for validation), then prints test accuracy, a
    2x2 confusion matrix and per-class precision/recall, and plots the
    accuracy and loss curves.

    Parameters
    ----------
    x_train, x_test : array-like, 2-D
        Feature matrices (rows are samples, columns are features).
    y_train, y_test : array-like
        Integer class labels; the report assumes the two classes 0 and 1.
    colz : sequence of str
        Accepted for signature parity with the other model helpers; unused.
    """
    inp_sh=np.array(x_train).shape[1]
    # One output unit per class instead of a fixed 250-wide softmax.
    # sparse_categorical_crossentropy expects labels 0..k-1, so k = max + 1.
    n_classes = max(2, int(np.max(y_train)) + 1)
    print("Neural Network results:")

    model = Sequential()
    model.add(Dense(500, activation='relu',
                    input_shape=(inp_sh,)))
    model.add(Dense(250, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    NO_EPOCHS = 20
    history = model.fit(x_train, y_train,
                        batch_size=30,
                        epochs=NO_EPOCHS,
                        validation_split=0.2)

    yhat_probs = model.predict(x_test, verbose=0)
    # predict_classes() is gone from recent Keras releases; the arg-max over
    # the softmax output is the equivalent class prediction.
    yhat_classes = np.argmax(yhat_probs, axis=1)
    print("Test accuracy:",model.evaluate(x_test,y_test,verbose=0)[1])
    # Pin the label order so the matrix is always 2x2 and matches the
    # hard-coded row/column names, even if a class is absent from the data.
    ConfusionMatrix=pd.DataFrame(confusion_matrix(y_test, yhat_classes, labels=[0, 1]),
                                 columns=['Predicted 0','Predicted 1'],
                                 index=['Actual 0','Actual 1'])
    print ('Confusion matrix of test data is: \n',ConfusionMatrix)
    print("Average precision for the 2 classes is - ", precision_score(y_test, yhat_classes, average = None) )
    print("Average recall for the 2 classes is - ", recall_score(y_test, yhat_classes, average = None) )

    def plot_loss_acc(hist):
        """Plot per-epoch train/validation accuracy and loss curves."""
        # The history key for the 'accuracy' metric is 'accuracy' in newer
        # Keras versions and 'acc' in older ones; accept either.
        acc_key = 'accuracy' if 'accuracy' in hist.history else 'acc'

        # The leading None shifts the curves so the x-axis starts at epoch 1.
        f, ax = plt.subplots()
        ax.plot([None] + hist.history[acc_key], 'o-')
        ax.plot([None] + hist.history['val_' + acc_key], 'x-')
        # Plot legend and use the best location automatically: loc = 0.
        ax.legend(['Train acc', 'Validation acc'], loc = 0)
        ax.set_title('Training/Validation acc per Epoch')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Acc')
        plt.show()

        f, ax = plt.subplots()
        ax.plot([None] + hist.history['loss'], 'o-',c='r')
        ax.plot([None] + hist.history['val_loss'], 'x-',c='g')
        # Plot legend and use the best location automatically: loc = 0.
        ax.legend(['Train loss', 'Validation loss'], loc = 0)
        ax.set_title('Training/Validation loss per Epoch')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss')
        plt.show()

    plot_loss_acc(history)

In [None]:
# Same split on the 'best' label, but with scale=True: the features are
# standardised and come back as NumPy arrays.
x_train, x_test, y_train, y_test, colz = data_split(yahoo_data,'best',True)

In [None]:
# Train and evaluate the dense network on the scaled split.
neural_netter(x_train, x_test, y_train, y_test, colz)

## -------------PRACTICE-------------
1.

<a id='55'></a>
# Weekly Readings/Videos

https://blog.trinket.io/why-python/
    
https://towardsdatascience.com/top-16-python-applications-in-real-world-a0404111ac23

<a id='56'></a>
# Extra Practice