# Machine Learning Final Project

Gina Nguyen CS 6375

## Introduction:
This stock analysis algorithm aims to approximate a stock's "sentiment" through its Google search popularity. The popularity of a company in Google searches can be found using Google Trends, which is accessed through the pytrends package. Furthermore, the Google Trends data ranges from 0 to 100, where the higher the number is, the more popular the search is.

In this project, I examine three company stocks: Johnson & Johnson (JNJ), Apple (AAPL), and Vox Royalty Corp (VOXR). Each stock is analyzed with one regression model (a linear regression) and four different classifiers (SVM, naive Bayes, a neural network, and AdaBoost). The goal of the regression is to predict the stock price, while the goal of the classifiers is to predict whether the price will go up, go down, or stay the same.

Notes for this project:
- The algorithm looks at the past 90 days.
- JNJ and AAPL were chosen because they belong to well-known companies in two different fields, while VOXR was chosen at random from the following website (http://randomstocks.buckmaster.ca/)
- How I imported financial information from yahoo finance (yfinance package): 
    - https://analyzingalpha.com/yfinance-python
    - https://pypi.org/project/yfinance/
- How I imported the change in Google searches from Google Trends (pytrends package): 
    - https://towardsdatascience.com/telling-stories-with-google-trends-using-pytrends-in-python-a11e5b8a177
    - https://pypi.org/project/pytrends/
- The inspiration for combining the stock history with google trends:
    - https://medium.com/analytics-vidhya/how-to-pd-merge-two-data-frames-on-a-common-date-column-e7808d7ccaee
- What the financial history and Google Trends data look like for JNJ:
    - https://www.nasdaq.com/market-activity/stocks/jnj/historical
    - https://trends.google.com/trends/explore?date=today%201-m&geo=US&q=%2Fm%2F07zl74p&hl=en

In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime
from pytrends import dailydata
from sklearn.model_selection import train_test_split

class Stock:
    """Pair a ticker's Yahoo Finance price history with its Google Trends interest.

    Creating an instance calls pytrends and yfinance (both need network
    access) and inner-joins the two results on their date index.
    """

    def __init__(self, name, curr_date, past_months):
        """Download and merge the trend and price data for one ticker.

        Args:
            name: Ticker symbol. It is used both as the Google Trends search
                term and as the yfinance symbol.
            curr_date: End of the window; must expose ``.year`` and ``.month``
                (the notebook passes a ``datetime``).
            past_months: Start of the window; same requirements as
                ``curr_date``.
        """
        self.name = name #Name of the stock
        self.curr_date = curr_date
        self.past_months = past_months

        #Getting the google trend data (only the year and month of each bound are passed on)
        trends = dailydata.get_daily_data(self.name, self.past_months.year, self.past_months.month, self.curr_date.year,self.curr_date.month)
        #Note that for this project, we will be using the 3rd column, a.k.a. the "monthly", as the trend data
        trends_monthly = trends[self.name + '_monthly']
        self.trend = pd.DataFrame(trends_monthly)

        #Getting the financial history
        self.finance = yf.download(self.name, start=self.past_months, end=self.curr_date)
        #NOTE(review): newer yfinance releases appear to return two-level
        #(field, ticker) columns even for a single symbol - confirm against the
        #installed version. Keeping only the field level lets the rest of the
        #notebook address columns as 'Close', 'Open', ...; flat columns are untouched.
        if isinstance(self.finance.columns, pd.MultiIndex):
            self.finance.columns = self.finance.columns.get_level_values(0)

        #Getting a merged dataset combining the stock history with the google trend data
        self.trend.index = pd.to_datetime(self.trend.index)
        self.finance.index = pd.to_datetime(self.finance.index)
        #Inner join: only dates present in BOTH frames survive
        self.merged = pd.merge(self.finance, self.trend, how='inner', left_index=True, right_index=True)

    def get_name(self):
        """Return the ticker symbol passed to the constructor."""
        return self.name

    def get_stock_info(self):
        """Return a new ``yf.Ticker`` object for this symbol."""
        stock_info = yf.Ticker(self.name)
        return stock_info

    def get_trend(self):
        """Return the single-column trend DataFrame."""
        return self.trend

    def recent_trend(self):
        """Return the last row of the trend frame as an array."""
        return self.get_trend().values[-1]

    def get_finan_hist(self):
        """Return the price-history DataFrame downloaded from yfinance."""
        return self.finance

    def recent_price(self):
        """Return the 'Close' value on the last row of the price history."""
        #Look the column up by name: a positional index silently returns a
        #different field whenever the column order of the download changes.
        return self.get_finan_hist()['Close'].iloc[-1]

    def get_merged(self): #returns a dataframe containing both the stock info and the google trend data
        """Return the merged frame itself (not a copy), so caller edits are shared."""
        return self.merged

def test_regression(X_train, Y_train, X_test, Y_test, model):
    """Print RMSE and R2 of a fitted regressor on the training and test splits.

    Args:
        X_train, Y_train: Features and targets the model was fitted on.
        X_test, Y_test: Held-out features and targets.
        model: Fitted estimator exposing ``predict``.

    Returns:
        None; the four metrics are written to stdout.
    """
    from sklearn.metrics import r2_score

    def root_mean_sq_err(estimates, actual):
        #Square root of the mean squared residual
        residuals = estimates - actual
        return np.sqrt((residuals ** 2).mean())

    #Training split is reported first, then the test split
    for split_name, features, truth in (("Training", X_train, Y_train),
                                        ("Test", X_test, Y_test)):
        estimates = model.predict(features)
        split_rmse = root_mean_sq_err(estimates, truth)
        split_r2 = r2_score(truth, estimates)
        print(split_name + " RMSE = " + str(split_rmse))
        print(split_name + " R2 = " + str(split_r2))

def test_classifier(X_train, Y_train, X_test, Y_test, model):
    """Print train/test accuracy of a fitted classifier and plot its training confusion matrix.

    Args:
        X_train, Y_train: Features and labels the model was fitted on.
        X_test, Y_test: Held-out features and labels.
        model: Fitted estimator exposing ``predict``.

    Returns:
        None; accuracies are printed and the confusion matrix is shown with
        matplotlib.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import ConfusionMatrixDisplay
    from sklearn.utils.multiclass import unique_labels

    #scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, so the
    #printed numbers are the same as before.
    y_pred_train = model.predict(X_train)
    accuracy = accuracy_score(Y_train, y_pred_train)
    print("Training accuracy: " + str(accuracy))
    y_pred_test = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred_test)
    print("Test accuracy: " + str(accuracy))

    #Confusion matrix of the TRAINING predictions. The label set is computed
    #explicitly so the plot axes show the real class values instead of the
    #positional indices 0..k-1 that ConfusionMatrixDisplay uses by default.
    labels = unique_labels(Y_train, y_pred_train)
    cm = confusion_matrix(Y_train, y_pred_train, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    plt.show()

def implement_SVM(stock, both):
    """Fit and report an SVC on three feature sets of ``both``.

    The feature sets are, in order: every column except 'Close'/'Change',
    the same minus the two google-trend columns, and 'ChangeTrend' alone.
    The target is always the 'Change' column.

    Args:
        stock: Object whose ``get_name()`` returns the ticker symbol.
        both: Merged DataFrame that already has 'Change' and 'ChangeTrend'.
    """
    from sklearn.svm import SVC

    ticker = stock.get_name()
    #(headline, feature selector) pairs; selectors run lazily inside the loop
    experiments = (
        ("SVM model on all features: ",
         lambda frame: frame.drop(columns=['Close', 'Change'])),
        ("SVM on all features EXCEPT the google trend data: ",
         lambda frame: frame.drop(columns=['Close', 'Change', ticker+'_monthly', 'ChangeTrend'])),
        ("SVM on ONLY the google trend data: ",
         lambda frame: frame[['ChangeTrend']]),
    )

    for run, (headline, select_features) in enumerate(experiments):
        if run > 0:
            print() #blank line between experiments
        print(ticker, headline)
        #Splitting the dataset (70/30, fixed seed)
        features = select_features(both)
        labels = both['Change']
        train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.3, random_state=42)
        #Training the model
        classifier = SVC()
        classifier.fit(train_X, train_y)
        #Viewing the accuracy of the model
        test_classifier(train_X, train_y, test_X, test_y, classifier)

def implement_NB(stock, both):
    """Fit and report a Gaussian naive Bayes model on three feature sets of ``both``.

    The feature sets are, in order: every column except 'Close'/'Change',
    the same minus the two google-trend columns, and 'ChangeTrend' alone.
    The target is always the 'Change' column.

    Args:
        stock: Object whose ``get_name()`` returns the ticker symbol.
        both: Merged DataFrame that already has 'Change' and 'ChangeTrend'.
    """
    from sklearn.naive_bayes import GaussianNB

    def fit_and_report(features):
        #Split 70/30 with a fixed seed, train, then print/plot the scores
        target = both['Change']
        train_X, test_X, train_y, test_y = train_test_split(features, target, test_size=0.3, random_state=42)
        nb = GaussianNB()
        nb.fit(train_X, train_y)
        test_classifier(train_X, train_y, test_X, test_y, nb)

    #All features
    print(stock.get_name(), "Naive Bayes model on all features: ")
    fit_and_report(both.drop(columns=['Close', 'Change']))

    #All features except the google trends
    print()
    print(stock.get_name(), "Naive Bayes model on all features EXCEPT the google trend data: ")
    fit_and_report(both.drop(columns=['Close', 'Change', stock.get_name()+'_monthly', 'ChangeTrend']))

    #Only the google trend change
    print()
    print(stock.get_name(), "Naive Bayes model on ONLY the google trend data: ")
    fit_and_report(both[['ChangeTrend']])

def implement_NN(stock, both):
    """Fit and report an MLP classifier on three feature sets of ``both``.

    The feature sets are, in order: every column except 'Close'/'Change',
    the same minus the two google-trend columns, and 'ChangeTrend' alone.
    The target is always the 'Change' column.

    Args:
        stock: Object whose ``get_name()`` returns the ticker symbol.
        both: Merged DataFrame that already has 'Change' and 'ChangeTrend'.
    """
    from sklearn.neural_network import MLPClassifier

    ticker = stock.get_name()
    #(headline, feature selector) pairs; selectors run lazily inside the loop
    experiments = (
        ("Neural Network model on all features: ",
         lambda frame: frame.drop(columns=['Close', 'Change'])),
        ("Neural Network model on all features EXCEPT the google trend data: ",
         lambda frame: frame.drop(columns=['Close', 'Change', ticker+'_monthly', 'ChangeTrend'])),
        ("Neural Network model on ONLY the google trend data: ",
         lambda frame: frame[['ChangeTrend']]),
    )

    for run, (headline, select_features) in enumerate(experiments):
        if run > 0:
            print() #blank line between experiments
        print(ticker, headline)
        #Splitting the dataset (70/30, fixed seed)
        X = select_features(both)
        Y = both['Change']
        X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)
        #Training the model. The MLP's weight initialisation and batch
        #shuffling are random; seeding it (same seed as the split) makes the
        #three-way comparison repeatable from run to run.
        neural_model = MLPClassifier(random_state=42)
        neural_model.fit(X_train, Y_train)
        #Viewing the accuracy of the model
        test_classifier(X_train, Y_train, X_test, Y_test, neural_model)

def implement_ADA(stock, both):
    """Fit and report an AdaBoost classifier on three feature sets of ``both``.

    The feature sets are, in order: every column except 'Close'/'Change',
    the same minus the two google-trend columns, and 'ChangeTrend' alone.
    The target is always the 'Change' column.

    Args:
        stock: Object whose ``get_name()`` returns the ticker symbol.
        both: Merged DataFrame that already has 'Change' and 'ChangeTrend'.
    """
    from sklearn.ensemble import AdaBoostClassifier

    symbol = stock.get_name()
    #(headline, 'drop' or 'keep', column list) for each experiment
    plans = [
        ("AdaBoost model on all features: ", 'drop',
         ['Close', 'Change']),
        ("AdaBoost on all features EXCEPT the google trend data: ", 'drop',
         ['Close', 'Change', symbol+'_monthly', 'ChangeTrend']),
        ("AdaBoost on ONLY the google trend data: ", 'keep',
         ['ChangeTrend']),
    ]

    for position, (headline, mode, cols) in enumerate(plans):
        if position:
            print() #blank line between experiments
        print(symbol, headline)
        #Splitting the dataset (70/30, fixed seed)
        if mode == 'drop':
            inputs = both.drop(columns=cols)
        else:
            inputs = both[cols]
        outputs = both['Change']
        in_train, in_test, out_train, out_test = train_test_split(inputs, outputs, test_size=0.3, random_state=42)
        #Training the model
        booster = AdaBoostClassifier()
        booster.fit(in_train, out_train)
        #Viewing the accuracy of the model
        test_classifier(in_train, out_train, in_test, out_test, booster)

date_range = 90 #tells us how many days ago you want to look at data
curr_date = datetime.now() #gets the current date
#Step back whole calendar days with timedelta instead of hand-computed seconds
#through timestamp()/fromtimestamp(): that round trip goes via local time and
#can land an hour off when a daylight-saving change falls inside the window.
from datetime import timedelta
past_months = curr_date - timedelta(days=date_range) #gets the date of [date_range] days ago

## Looking at Johnson and Johnson Stock

In [None]:
#Testing on Johnson & Johnson, whose stock name is JNJ:
#Constructing a Stock downloads the google trend and price data and merges them on date
jnj = Stock("JNJ", curr_date, past_months)
#recent_trend() returns the last row of the trend frame, so it prints as an array
print("Most recent google trend data for JNJ: ", jnj.recent_trend())
print("Most recent stock price for JNJ: ", jnj.recent_price())

#Viewing JNJ stock as a correlation matrix
#NOTE: get_merged() returns the frame held by jnj itself (no copy), so columns added to both_jnj later also change jnj's merged data
both_jnj = jnj.get_merged()
#Pairwise correlations of every merged column, rounded to 2 decimals for the annotations
correlation_matrix = both_jnj.corr().round(2)
sns.heatmap(data=correlation_matrix, annot=True)

### Testing Regression Models

Looking at the correlation matrix above, you can see that the Google trend (a.k.a. "JNJ_monthly") has a low correlation with the other features. Comparing this with the results of the linear regression below, the Google trend's low correlation seems to slightly lower the regression accuracy.

Why does this make sense? Because stock prices do not change as quickly as Google Trends searches and, importantly, stock prices tend to depend on prior prices. Because of this, we look at classification models as well as regression models.

In [24]:
#Linear Regression Model looking at all features
print("JNJ Linear Regression model on all features: ")
#Splitting the dataset
#Features are every merged column except the target 'Close'.
#NOTE(review): if this cell is re-run after the 'Change'/'ChangeTrend' columns have been added to both_jnj, those columns become features too.
X = both_jnj.drop(columns=['Close'])
Y = both_jnj['Close']

#70/30 random split with a fixed seed so both models below see the same rows
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)
#Training the model
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
#Viewing the accuracy of the model
test_regression(X_train, Y_train, X_test, Y_test, lin_model)

#Linear Regression Model based all features except the google trends:
print()
print("JNJ Linear Regression model on all features EXCEPT the google trend data: ")
#Splitting the dataset
X = both_jnj.drop(columns=['Close', 'JNJ_monthly'])
Y = both_jnj['Close']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)
#Training the model
#NOTE: despite its name, lin_model_trend is the model trained WITHOUT the google trend column
lin_model_trend = LinearRegression()
lin_model_trend.fit(X_train, Y_train)
#Viewing the accuracy of the model
test_regression(X_train, Y_train, X_test, Y_test, lin_model_trend)


JNJ Linear Regression model on all features: 
Training RMSE = 0.23608052897459428
Training R2 = 0.9976929445745066
Test RMSE = 0.33963443064389437
Test R2 = 0.9951067418474061

JNJ Linear Regression model on all features EXCEPT the google trend data: 
Training RMSE = 0.23721078591007025
Training R2 = 0.9976708012204196
Test RMSE = 0.337519179806825
Test R2 = 0.9951675026890513


### Testing on Classifier Models:

Before we can use a classifier model, we need to determine what we are classifying. In this case, I would like to guess if a stock price is going to increase or decrease. Because of this, we are going to compare the "current" stock price with the previous one, where an increase is classified as 1, a decrease is classified as -1, and no change in price is classified as 0. Furthermore, the Google trend data is converted to "a change in Google trends" in the same way.

When looking at the results below, you will find that looking at only the Google trend data gives the best prediction of whether a stock has a price increase/decrease. However, you will also notice that the accuracy is much lower than that of the regression. This is presumably because predicting whether a stock price is going to increase/decrease is a difficult task. Only having a stock's financial history and its Google Trends results is NOT enough to tell the changes in price. However, since my predictions with Google Trends are more than 50% accurate, I will mentally take this as a win.

In [None]:
#Finding if the stock prices are increasing or decreasing and adding it to a column called "Change"
#Each row is compared with the row directly before it: 1 = increased, -1 = decreased, 0 = no change.
#The first row has no predecessor (its diff is NaN), so it is labelled 0.
#The previous loop started at the second row AND seeded "previous" with that same row,
#so the second row was never compared with the first and was always labelled 0.
both_jnj['Change'] = np.sign(both_jnj['Close'].diff()).fillna(0).astype(int)

#Finding if the google trend is increasing or decreasing and adding it to a column called "ChangeTrend"
#Same encoding as above, applied to the google trend column.
both_jnj['ChangeTrend'] = np.sign(both_jnj['JNJ_monthly'].diff()).fillna(0).astype(int)

In [None]:
#SVM
#Runs the SVC on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_SVM(jnj, both_jnj)

In [None]:
#Naive Bayes
#Runs Gaussian naive Bayes on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_NB(jnj, both_jnj)

In [None]:
#Neural Network
#This cell previously called implement_NB, which just repeated the naive Bayes run above.
implement_NN(jnj, both_jnj)

In [None]:
#AdaBoost
#Runs AdaBoost on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_ADA(jnj, both_jnj) #I was smart this time and wrote a function

## Looking at Apple Stock
So far, we've looked at the Johnson and Johnson stock (JNJ), which belongs to a healthcare and pharmaceuticals company. In contrast, Apple stock (AAPL) belongs to a tech company.

In [None]:
#Testing on Apple, whose stock name is AAPL
#Constructing a Stock downloads the google trend and price data and merges them on date
aapl = Stock("AAPL", curr_date, past_months)
#recent_trend() returns the last row of the trend frame, so it prints as an array
print("Most recent google trend data for AAPL: ", aapl.recent_trend())
print("Most recent stock price for AAPL: ", aapl.recent_price())

#Viewing AAPL Stock as a correlation matrix
#NOTE: get_merged() returns the frame held by aapl itself (no copy), so columns added to both_aapl later also change aapl's merged data
both_aapl = aapl.get_merged()
#Pairwise correlations of every merged column, rounded to 2 decimals for the annotations
correlation_matrix = both_aapl.corr().round(2)
sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
#Linear Regression Model looking at all features
print("AAPL Linear Regression model on all features: ")
#Splitting the dataset
#Features are every merged column except the target 'Close'.
#NOTE(review): if this cell is re-run after the 'Change'/'ChangeTrend' columns have been added to both_aapl, those columns become features too.
X = both_aapl.drop(columns=['Close'])
Y = both_aapl['Close']

#70/30 random split with a fixed seed so both models below see the same rows
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)
#Training the model
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
#Viewing the accuracy of the model
test_regression(X_train, Y_train, X_test, Y_test, lin_model)

#Linear Regression Model based all features except the google trends:
print()
print("AAPL Linear Regression model on all features EXCEPT the google trend data: ")
#Splitting the dataset
X = both_aapl.drop(columns=['Close', 'AAPL_monthly'])
Y = both_aapl['Close']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)
#Training the model
#NOTE: despite its name, lin_model_trend is the model trained WITHOUT the google trend column
lin_model_trend = LinearRegression()
lin_model_trend.fit(X_train, Y_train)
#Viewing the accuracy of the model
test_regression(X_train, Y_train, X_test, Y_test, lin_model_trend)

In [None]:
#Finding if the stock prices are increasing or decreasing and adding it to a column called "Change"
#Each row is compared with the row directly before it: 1 = increased, -1 = decreased, 0 = no change.
#The first row has no predecessor (its diff is NaN), so it is labelled 0.
#The previous loop started at the second row AND seeded "previous" with that same row,
#so the second row was never compared with the first and was always labelled 0.
both_aapl['Change'] = np.sign(both_aapl['Close'].diff()).fillna(0).astype(int)

#Finding if the google trend is increasing or decreasing and adding it to a column called "ChangeTrend"
#Same encoding as above, applied to the google trend column.
both_aapl['ChangeTrend'] = np.sign(both_aapl['AAPL_monthly'].diff()).fillna(0).astype(int)

In [None]:
#SVM
#Runs the SVC on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_SVM(aapl, both_aapl)

In [None]:
#Naive Bayes
#Runs Gaussian naive Bayes on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_NB(aapl, both_aapl)

In [None]:
#Neural Network Model
#Runs the MLP classifier on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_NN(aapl, both_aapl)

In [None]:
#AdaBoost
#Runs AdaBoost on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_ADA(aapl, both_aapl)

## Looking at VOXR Stock (from a presumably less known company)

In [None]:
#Testing on VOXR Stock
#Constructing a Stock downloads the google trend and price data and merges them on date
voxr = Stock("VOXR", curr_date, past_months)
#recent_trend() returns the last row of the trend frame, so it prints as an array
print("Most recent google trend data for VOXR: ", voxr.recent_trend())
print("Most recent stock price for VOXR: ", voxr.recent_price())

#Viewing VOXR Stock as a correlation matrix
#NOTE: get_merged() returns the frame held by voxr itself (no copy), so columns added to both_voxr later also change voxr's merged data
both_voxr = voxr.get_merged()
#Pairwise correlations of every merged column, rounded to 2 decimals for the annotations
correlation_matrix = both_voxr.corr().round(2)
sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
#Linear Regression Model looking at all features
print("VOXR Linear Regression model on all features: ")
#Splitting the dataset
#Features are every merged column except the target 'Close'.
#NOTE(review): if this cell is re-run after the 'Change'/'ChangeTrend' columns have been added to both_voxr, those columns become features too.
X = both_voxr.drop(columns=['Close'])
Y = both_voxr['Close']

#70/30 random split with a fixed seed so both models below see the same rows
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)
#Training the model
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
#Viewing the accuracy of the model
test_regression(X_train, Y_train, X_test, Y_test, lin_model)

#Linear Regression Model based all features except the google trends:
print()
print("VOXR Linear Regression model on all features EXCEPT the google trend data: ")
#Splitting the dataset
X = both_voxr.drop(columns=['Close', 'VOXR_monthly'])
Y = both_voxr['Close']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)
#Training the model
#NOTE: despite its name, lin_model_trend is the model trained WITHOUT the google trend column
lin_model_trend = LinearRegression()
lin_model_trend.fit(X_train, Y_train)
#Viewing the accuracy of the model
test_regression(X_train, Y_train, X_test, Y_test, lin_model_trend)

In [None]:
#Finding if the stock prices are increasing or decreasing and adding it to a column called "Change"
#Each row is compared with the row directly before it: 1 = increased, -1 = decreased, 0 = no change.
#The first row has no predecessor (its diff is NaN), so it is labelled 0.
#The previous loop started at the second row AND seeded "previous" with that same row,
#so the second row was never compared with the first and was always labelled 0.
both_voxr['Change'] = np.sign(both_voxr['Close'].diff()).fillna(0).astype(int)

#Finding if the google trend is increasing or decreasing and adding it to a column called "ChangeTrend"
#Same encoding as above, applied to the google trend column.
both_voxr['ChangeTrend'] = np.sign(both_voxr['VOXR_monthly'].diff()).fillna(0).astype(int)

In [None]:
#SVM Model
#Runs the SVC on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_SVM(voxr, both_voxr)

In [None]:
#Naive Bayes Model
#Runs Gaussian naive Bayes on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_NB(voxr, both_voxr)

In [None]:
#Neural Network Model
#Runs the MLP classifier on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_NN(voxr, both_voxr)

In [None]:
#AdaBoost
#Runs AdaBoost on three feature sets: all features, all except the google trend columns, and 'ChangeTrend' only
implement_ADA(voxr, both_voxr)