In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

In [3]:
!pip install -q yfinance

In [4]:
import yfinance as yf

In [5]:
df = pd.read_csv("/content/drive/MyDrive/GOOGL.csv").drop(columns=['time','source'])
df

Unnamed: 0,ticker,headline,date
0,googl,Why Smart Investors Can’t Ignore the Allure of...,2024-04-22
1,googl,What's Going On With Nvidia and AMD Stocks On ...,2024-04-22
2,googl,Microsoft And Oracle Could Be TikTok's 'Likely...,2024-04-22
3,googl,"ETF Winners Amid Last Week's Record Loss in ""M...",2024-04-22
4,googl,RBC Capital Remains a Buy on Alphabet Class A ...,2024-04-22
...,...,...,...
445,googl,Least shorted S&P 500 stocks in May,2024-06-18
446,googl,3 AI Stocks to Buy Following the Nvidia Stock ...,2024-06-18
447,googl,3 Tech Stocks to Buy That Are Growing Dividend...,2024-06-18
448,googl,3 Bargain Stocks to Buy Now: June 2024,2024-06-18


In [6]:
df = df.groupby('date')['headline'].agg(' '.join).reset_index()
df['Date']= pd.to_datetime(df['date'])
df.drop(columns=['date'], inplace = True)

In [7]:
df

Unnamed: 0,headline,Date
0,Why Smart Investors Can’t Ignore the Allure of...,2024-04-22
1,7 Stocks With the Most Potential to Rocket on ...,2024-04-23
2,Google delays phaseout of third-party cookies ...,2024-04-24
3,Destiny Tech100: Why I Wouldn’t Touch DXYZ Sto...,2024-04-25
4,Strong Buy Rating for Alphabet Class A (GOOGL)...,2024-04-26
5,Nvidia Who? 3 AI Plays Destined to Outshine th...,2024-04-27
6,"With rate cuts pushed out, Goldman looks at qu...",2024-04-28
7,"Alphabet Stock Analysis: Buy, Sell, or Hold? W...",2024-04-29
8,JMP Securities Reaffirms Their Buy Rating on A...,2024-04-30
9,Alphabet Stock Analysis: Earnings Bolster the ...,2024-05-01


In [8]:
def process(data):
    data["Tomorrow"] = data["Close"].shift(-1)
    data["target"] = (data["Tomorrow"] > data["Close"]).astype(int)
    # data['P/L'] = data["Close"]-data['Open']
    data.drop(columns=['Volume','Adj Close'], inplace = True)
    return data

def pull_out(stock, data):
    price = pd.DataFrame(yf.download(stock, period="max"))
    # data = data[data.index>= '2023-12-31']
    price.reset_index(inplace = True)
    merged_df = pd.merge(data, price, on='Date', how='outer').dropna()
    return (merged_df)

In [9]:
import spacy
# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    # Process the text with spaCy
    doc = nlp(text)
    # Extract tokens (lemmatized tokens in this example)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return tokens


In [10]:
data = process(pull_out("GOOGL", df))
data['text'] = data['headline'].apply(tokenize_text)
news = data.copy()
news.head()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,headline,Date,Open,High,Low,Close,Tomorrow,target,text
0,Why Smart Investors Can’t Ignore the Allure of...,2024-04-22,154.309998,157.639999,154.059998,156.279999,158.259995,1,"[Smart, investor, ignore, allure, Amazon, Flyw..."
1,7 Stocks With the Most Potential to Rocket on ...,2024-04-23,156.960007,158.970001,156.279999,158.259995,159.130005,1,"[7, stock, Potential, Rocket, Q1, earning]"
2,Google delays phaseout of third-party cookies ...,2024-04-24,157.490005,159.570007,157.169998,159.130005,156.0,0,"[Google, delay, phaseout, party, cookie, Chrom..."
3,Destiny Tech100: Why I Wouldn’t Touch DXYZ Sto...,2024-04-25,151.330002,156.490005,150.869995,156.0,171.949997,1,"[Destiny, Tech100, touch, dxyz, stock, 10, Foo..."
4,Strong Buy Rating for Alphabet Class A (GOOGL)...,2024-04-26,174.369995,174.710007,169.649994,171.949997,166.149994,0,"[Strong, Buy, Rating, Alphabet, Class, GOOGL, ..."


In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
from tqdm.notebook import tqdm

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Run the polarity score on the entire dataset
res = {}
for i, row in tqdm(news.iterrows(), total=len(news)):
    text = row['headline']
    myid = row['Date']
    res[myid] = sia.polarity_scores(text)

In [13]:
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'Date'})
vaders = vaders.merge(news, how='left')

In [14]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")


def polarity_scores_roberta(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)  # Adjust max_length as per your need

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get logits and apply softmax
    logits = outputs.logits
    probabilities = softmax(logits, axis=1).flatten()  # Flatten to convert 2D array to 1D

    # Assuming logits order: [negative, positive]
    scores_dict = {
        'roberta_neg': probabilities[0],
        'roberta_pos': probabilities[1]
    }

    return scores_dict

# # Example usage
# example_text = "he is a good boy best"
# scores = polarity_scores_roberta(example_text)
# print(scores)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [16]:
res = {}
for i, row in tqdm(news.iterrows(), total=len(news)):
    # try:
        text = row['headline']
        myid = row['Date']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = vader_result_rename

  0%|          | 0/41 [00:00<?, ?it/s]

In [17]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'Date'})
results_df = results_df.merge(news, how='left').dropna()

In [18]:
results_df.tail()

Unnamed: 0,Date,vader_neg,vader_neu,vader_pos,vader_compound,headline,Open,High,Low,Close,Tomorrow,target,text
35,2024-06-11,0.044,0.813,0.143,0.9697,Buy Rating Affirmed: Alphabet’s Market Positio...,176.220001,176.839996,173.770004,176.619995,177.789993,1,"[buy, Rating, Affirmed, Alphabet, Market, Posi..."
36,2024-06-12,0.008,0.809,0.183,0.9834,Alphabet Class A (GOOGL) Receives a Buy from E...,178.25,180.410004,176.110001,177.789993,175.160004,0,"[Alphabet, Class, GOOGL, receive, buy, Evercor..."
37,2024-06-13,0.054,0.797,0.149,0.8885,Screaming Buys- 3 Tech Stocks That Are Just Be...,176.110001,176.740005,174.880005,175.160004,176.789993,1,"[scream, Buys-, 3, Tech, stock, beg, buy, Goog..."
38,2024-06-14,0.0,0.783,0.217,0.9709,3 Stocks to Catapult Your $100K Into a Cool Mi...,174.220001,177.059998,174.149994,176.789993,177.240005,1,"[3, stock, catapult, $, 100, k, Cool, Million,..."
39,2024-06-17,0.015,0.914,0.071,0.8126,The 7 Best Autonomous Driving Stocks to Buy in...,175.460007,178.360001,174.809998,177.240005,175.089996,0,"[7, Best, Autonomous, Driving, stock, buy, Jun..."


In [19]:
X = results_df.drop(columns=['Date','Close','headline','target', 'text','High','Low','Open','vader_pos'])
y=results_df['target']

In [20]:
import gensim.downloader

glove_vectors = gensim.downloader.load('word2vec-google-news-300')




In [21]:
def get_average_word2vec(tokens, model):
    valid_tokens = [token for token in tokens if token in model]
    if not valid_tokens:
        return np.zeros(model.vector_size)
    vectors = [model[token] for token in valid_tokens]
    return np.mean(vectors, axis=0)

# Apply the function to the 'tokens' column to get the embeddings
news['embedding'] = news['text'].apply(lambda x: get_average_word2vec(x, glove_vectors))

In [22]:
# Convert embeddings to separate columns
embedding_df = pd.DataFrame(news['embedding'].to_list(), index=news.index)
embedding_df.columns = [f'embedding_{i}' for i in range(embedding_df.shape[1])]

# Concatenate the embeddings with the original DataFrame
news = pd.concat([news, embedding_df], axis=1).drop(columns=['embedding'])

In [23]:
news.head()

Unnamed: 0,headline,Date,Open,High,Low,Close,Tomorrow,target,text,embedding_0,...,embedding_290,embedding_291,embedding_292,embedding_293,embedding_294,embedding_295,embedding_296,embedding_297,embedding_298,embedding_299
0,Why Smart Investors Can’t Ignore the Allure of...,2024-04-22,154.309998,157.639999,154.059998,156.279999,158.259995,1,"[Smart, investor, ignore, allure, Amazon, Flyw...",0.071254,...,0.017695,0.017291,-0.109096,0.008174,0.098929,-0.064732,-0.00737,0.000527,0.045966,-0.012761
1,7 Stocks With the Most Potential to Rocket on ...,2024-04-23,156.960007,158.970001,156.279999,158.259995,159.130005,1,"[7, stock, Potential, Rocket, Q1, earning]",0.018799,...,0.014893,0.07428,-0.30363,-0.041748,0.053497,-0.067424,0.027547,0.012329,-0.00883,-0.088867
2,Google delays phaseout of third-party cookies ...,2024-04-24,157.490005,159.570007,157.169998,159.130005,156.0,0,"[Google, delay, phaseout, party, cookie, Chrom...",0.020733,...,-0.014528,0.005207,-0.115126,-0.029414,0.01731,-0.035289,-0.017071,0.006647,0.01279,0.021226
3,Destiny Tech100: Why I Wouldn’t Touch DXYZ Sto...,2024-04-25,151.330002,156.490005,150.869995,156.0,171.949997,1,"[Destiny, Tech100, touch, dxyz, stock, 10, Foo...",0.051662,...,-0.009302,0.034577,-0.147928,-0.00476,0.067205,-0.049271,0.009123,-0.005506,0.043809,-0.059121
4,Strong Buy Rating for Alphabet Class A (GOOGL)...,2024-04-26,174.369995,174.710007,169.649994,171.949997,166.149994,0,"[Strong, Buy, Rating, Alphabet, Class, GOOGL, ...",0.038616,...,0.006563,0.027367,-0.150067,-0.014375,0.076558,-0.061882,0.005319,0.032902,0.014871,0.026588


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range = (1,4),max_features= 10000)

In [25]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.2,random_state=2)

### Import Metrics

In [26]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix # will plot the confusion matrix
import time
model_performance = pd.DataFrame(columns=['Accuracy','Recall','Precision','F1-Score','time to train','time to predict','total time'])

MultinomailNB

In [28]:
from sklearn.naive_bayes import MultinomialNB
start = time.time()
model =MultinomialNB().fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

ValueError: Negative values in data passed to MultinomialNB (input X)

In [29]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['MultiNB'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

NameError: name 'y_predictions' is not defined

KNN

In [30]:
from sklearn.neighbors import KNeighborsClassifier
start = time.time()
model = KNeighborsClassifier().fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

In [31]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['KNN'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 75.00%
Recall: 75.00%
Precision: 82.14%
F1-Score: 70.83%
time to train: 0.00 s
time to predict: 0.00 s
total: 0.01 s


SVC

In [32]:
from sklearn.svm import SVC
start = time.time()
model = SVC(kernel='sigmoid', gamma=1.0).fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

In [33]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['SVC'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 62.50%
Recall: 62.50%
Precision: 39.06%
F1-Score: 48.08%
time to train: 0.01 s
time to predict: 0.00 s
total: 0.01 s


  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression
start = time.time()
model = LogisticRegression().fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

In [35]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['Logistic'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 62.50%
Recall: 62.50%
Precision: 39.06%
F1-Score: 48.08%
time to train: 0.03 s
time to predict: 0.00 s
total: 0.03 s


  _warn_prf(average, modifier, msg_start, len(result))


Decision Trees


In [36]:
from sklearn.tree import DecisionTreeClassifier
start = time.time()
model = DecisionTreeClassifier().fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

In [37]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['Decision Tree'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 50.00%
Recall: 50.00%
Precision: 50.00%
F1-Score: 50.00%
time to train: 0.00 s
time to predict: 0.00 s
total: 0.01 s


Extra Trees

In [38]:
from sklearn.ensemble import ExtraTreesClassifier
start = time.time()
model_best = ExtraTreesClassifier(random_state=0,n_jobs=-1).fit(X_train,y_train)
end_train = time.time()
y_predictions = model_best.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

In [39]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['Extra Trees'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 75.00%
Recall: 75.00%
Precision: 82.14%
F1-Score: 70.83%
time to train: 0.18 s
time to predict: 0.04 s
total: 0.22 s


RandomForest Classifier

In [40]:
from sklearn.ensemble import RandomForestClassifier
start = time.time()
model1 = RandomForestClassifier(n_estimators = 100,n_jobs=-1,random_state=0,bootstrap=True,).fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

In [41]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['Random Forest'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 50.00%
Recall: 50.00%
Precision: 50.00%
F1-Score: 50.00%
time to train: 0.25 s
time to predict: 0.00 s
total: 0.25 s


Gradient Boosting

In [42]:
from sklearn.ensemble import GradientBoostingClassifier
start = time.time()
model = GradientBoostingClassifier().fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()

In [43]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['Gradient Boosting Classifier'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 62.50%
Recall: 62.50%
Precision: 39.06%
F1-Score: 48.08%
time to train: 0.08 s
time to predict: 0.00 s
total: 0.09 s


  _warn_prf(average, modifier, msg_start, len(result))


Neural Network MLP

In [44]:
from sklearn.neural_network import MLPClassifier
start = time.time()
model = MLPClassifier(hidden_layer_sizes = (20,20,),
                      activation='relu',
                      solver='adam',
                      batch_size=2000,
                      verbose=0).fit(X_train,y_train)
end_train = time.time()
y_predictions = model.predict(X_test) # These are the predictions from the test data.
end_predict = time.time()



In [45]:
accuracy = accuracy_score(y_test, y_predictions)
recall = recall_score(y_test, y_predictions, average='weighted')
precision = precision_score(y_test, y_predictions, average='weighted')
f1s = f1_score(y_test, y_predictions, average='weighted')

print("Accuracy: "+ "{:.2%}".format(accuracy))
print("Recall: "+ "{:.2%}".format(recall))
print("Precision: "+ "{:.2%}".format(precision))
print("F1-Score: "+ "{:.2%}".format(f1s))
print("time to train: "+ "{:.2f}".format(end_train-start)+" s")
print("time to predict: "+"{:.2f}".format(end_predict-end_train)+" s")
print("total: "+"{:.2f}".format(end_predict-start)+" s")
model_performance.loc['MLP'] = [accuracy, recall, precision, f1s,end_train-start,end_predict-end_train,end_predict-start]

Accuracy: 62.50%
Recall: 62.50%
Precision: 39.06%
F1-Score: 48.08%
time to train: 0.03 s
time to predict: 0.00 s
total: 0.03 s


  _warn_prf(average, modifier, msg_start, len(result))


Neural Networks MLP classifier

In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU
!pip install keras_metrics
from keras import metrics
import keras_metrics as km
import keras
import numpy as np
from numpy import array

Collecting keras_metrics
  Downloading keras_metrics-1.1.0-py2.py3-none-any.whl (5.6 kB)
Installing collected packages: keras_metrics
Successfully installed keras_metrics-1.1.0


In [47]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [48]:
#Build the feed forward neural network model
def build_model():
    model = Sequential()
    model.add(Dense(20, input_dim=4, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(20, activation='softmax')) #for multiclass classification
    #Compile the model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy',f1_m,precision_m, recall_m]
                 )
    return model

#institate the model
model = build_model()

#fit the model
start = time.time()
model.fit(X_train, y_train, epochs=50, batch_size=16,verbose=2)
end_train = time.time()

Epoch 1/50
2/2 - 10s - loss: 29.2018 - accuracy: 0.2812 - f1_m: 0.8360 - precision_m: 0.7188 - recall_m: 1.0000 - 10s/epoch - 5s/step
Epoch 2/50
2/2 - 0s - loss: 26.4250 - accuracy: 0.2812 - f1_m: 0.8267 - precision_m: 0.7188 - recall_m: 1.0000 - 37ms/epoch - 18ms/step
Epoch 3/50
2/2 - 0s - loss: 23.9742 - accuracy: 0.2812 - f1_m: 0.8360 - precision_m: 0.7188 - recall_m: 1.0000 - 50ms/epoch - 25ms/step
Epoch 4/50
2/2 - 0s - loss: 21.4578 - accuracy: 0.2812 - f1_m: 0.8329 - precision_m: 0.7188 - recall_m: 1.0000 - 35ms/epoch - 18ms/step
Epoch 5/50
2/2 - 0s - loss: 18.8733 - accuracy: 0.2812 - f1_m: 0.8360 - precision_m: 0.7188 - recall_m: 1.0000 - 67ms/epoch - 33ms/step
Epoch 6/50
2/2 - 0s - loss: 16.3173 - accuracy: 0.2812 - f1_m: 0.8329 - precision_m: 0.7188 - recall_m: 1.0000 - 97ms/epoch - 48ms/step
Epoch 7/50
2/2 - 0s - loss: 14.2355 - accuracy: 0.2812 - f1_m: 0.8329 - precision_m: 0.7188 - recall_m: 1.0000 - 61ms/epoch - 30ms/step
Epoch 8/50
2/2 - 0s - loss: 12.2532 - accuracy: 0.

In [49]:
#Evaluate the neural network
loss, accuracy, f1s, precision, recall = model.evaluate(X_test, y_test)
end_predict = time.time()
model_performance.loc['MLP (Keras)'] = [accuracy, accuracy, accuracy, accuracy,end_train-start,end_predict-end_train,end_predict-start]



GRU(keras)

In [50]:
def build_model():
    model = Sequential()
    model.add(GRU(20, return_sequences=True,input_shape=(1,4)))
    model.add(GRU(20, return_sequences=True))
    model.add(Dense(10, activation='softmax')) #for multiclass classification
    #Compile the model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  # metrics=['accuracy',f1_m,precision_m, recall_m]
                  metrics=['accuracy']
                 )
    return model

#The GRU input layer must be 3D.
#The meaning of the 3 input dimensions are: samples, time steps, and features.
#reshape input data
X_train_array = array(X_train) #array has been declared in the previous cell
print(len(X_train_array))
X_train_reshaped = X_train_array.reshape(X_train_array.shape[0],1,4)

#reshape output data
X_test_array=  array(X_test)
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0],1,4)


#institate the model
model = build_model()

start = time.time()
#fit the model
model.fit(X_train_reshaped, y_train, epochs=200, batch_size=16,verbose=2)
end_train = time.time()

32
Epoch 1/200
2/2 - 8s - loss: 2.1955 - accuracy: 0.7188 - 8s/epoch - 4s/step
Epoch 2/200
2/2 - 0s - loss: 2.1657 - accuracy: 0.7188 - 28ms/epoch - 14ms/step
Epoch 3/200
2/2 - 0s - loss: 2.1370 - accuracy: 0.7188 - 29ms/epoch - 14ms/step
Epoch 4/200
2/2 - 0s - loss: 2.1067 - accuracy: 0.7188 - 31ms/epoch - 16ms/step
Epoch 5/200
2/2 - 0s - loss: 2.0789 - accuracy: 0.7188 - 35ms/epoch - 18ms/step
Epoch 6/200
2/2 - 0s - loss: 2.0505 - accuracy: 0.7188 - 35ms/epoch - 18ms/step
Epoch 7/200
2/2 - 0s - loss: 2.0209 - accuracy: 0.7188 - 31ms/epoch - 16ms/step
Epoch 8/200
2/2 - 0s - loss: 1.9922 - accuracy: 0.7188 - 35ms/epoch - 17ms/step
Epoch 9/200
2/2 - 0s - loss: 1.9621 - accuracy: 0.7188 - 27ms/epoch - 14ms/step
Epoch 10/200
2/2 - 0s - loss: 1.9324 - accuracy: 0.7188 - 32ms/epoch - 16ms/step
Epoch 11/200
2/2 - 0s - loss: 1.9024 - accuracy: 0.7188 - 30ms/epoch - 15ms/step
Epoch 12/200
2/2 - 0s - loss: 1.8737 - accuracy: 0.7188 - 36ms/epoch - 18ms/step
Epoch 13/200
2/2 - 0s - loss: 1.8412 -

In [51]:
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
# loss, accuracy, f1s, precision, recall = model.evaluate(X_test_reshaped, y_test)
end_predict = time.time()
model_performance.loc['GRU (Keras)'] = [accuracy, accuracy, accuracy, accuracy, end_train-start,end_predict-end_train,end_predict-start]



LSTM(keras)

In [52]:
def build_model():
    model = Sequential()
    model.add(LSTM(20, return_sequences=True,input_shape=(1,4)))
    model.add(LSTM(20, return_sequences=True))
    model.add(Dense(10, activation='softmax')) #for multiclass classification
    #Compile the model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  # metrics=['accuracy',f1_m,precision_m, recall_m]
                  metrics=['accuracy']
                 )
    return model

#The LSTM input layer must be 3D.
#The meaning of the 3 input dimensions are: samples, time steps, and features.
#reshape input data
X_train_array = array(X_train) #array has been declared in the previous cell
print(len(X_train_array))
X_train_reshaped = X_train_array.reshape(X_train_array.shape[0],1,4)

#reshape output data
X_test_array=  array(X_test)
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0],1,4)


#institate the model
model = build_model()


#fit the model
start = time.time()
model.fit(X_train_reshaped, y_train, epochs=200, batch_size=16,verbose=2)
end_train = time.time()

32
Epoch 1/200
2/2 - 10s - loss: 2.2825 - accuracy: 0.0000e+00 - 10s/epoch - 5s/step
Epoch 2/200
2/2 - 0s - loss: 2.2638 - accuracy: 0.0000e+00 - 20ms/epoch - 10ms/step
Epoch 3/200
2/2 - 0s - loss: 2.2452 - accuracy: 0.1250 - 16ms/epoch - 8ms/step
Epoch 4/200
2/2 - 0s - loss: 2.2266 - accuracy: 0.2812 - 17ms/epoch - 9ms/step
Epoch 5/200
2/2 - 0s - loss: 2.2081 - accuracy: 0.2812 - 16ms/epoch - 8ms/step
Epoch 6/200
2/2 - 0s - loss: 2.1895 - accuracy: 0.2812 - 17ms/epoch - 8ms/step
Epoch 7/200
2/2 - 0s - loss: 2.1709 - accuracy: 0.2812 - 14ms/epoch - 7ms/step
Epoch 8/200
2/2 - 0s - loss: 2.1521 - accuracy: 0.2812 - 15ms/epoch - 8ms/step
Epoch 9/200
2/2 - 0s - loss: 2.1330 - accuracy: 0.2812 - 17ms/epoch - 8ms/step
Epoch 10/200
2/2 - 0s - loss: 2.1135 - accuracy: 0.2812 - 16ms/epoch - 8ms/step
Epoch 11/200
2/2 - 0s - loss: 2.0937 - accuracy: 0.2812 - 16ms/epoch - 8ms/step
Epoch 12/200
2/2 - 0s - loss: 2.0731 - accuracy: 0.2812 - 16ms/epoch - 8ms/step
Epoch 13/200
2/2 - 0s - loss: 2.0515 -

In [53]:
#Evaluate the neural network
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
# loss, accuracy, f1s, precision, recall = model.evaluate(X_test_reshaped, y_test)
end_predict = time.time()
model_performance.loc['LSTM (Keras)'] = [accuracy, accuracy, accuracy, accuracy,end_train-start,end_predict-end_train,end_predict-start]



In [54]:
model_performance.fillna(.90,inplace=True)
model_performance.style.background_gradient(cmap='coolwarm').format({'Accuracy': '{:.2%}',
                                                                     'Precision': '{:.2%}',
                                                                     'Recall': '{:.2%}',
                                                                     'F1-Score': '{:.2%}',
                                                                     'time to train':'{:.1f}',
                                                                     'time to predict':'{:.1f}',
                                                                     'total time':'{:.1f}',
                                                                     })

Unnamed: 0,Accuracy,Recall,Precision,F1-Score,time to train,time to predict,total time
KNN,75.00%,75.00%,82.14%,70.83%,0.0,0.0,0.0
SVC,62.50%,62.50%,39.06%,48.08%,0.0,0.0,0.0
Logistic,62.50%,62.50%,39.06%,48.08%,0.0,0.0,0.0
Decision Tree,50.00%,50.00%,50.00%,50.00%,0.0,0.0,0.0
Extra Trees,75.00%,75.00%,82.14%,70.83%,0.2,0.0,0.2
Random Forest,50.00%,50.00%,50.00%,50.00%,0.2,0.0,0.3
Gradient Boosting Classifier,62.50%,62.50%,39.06%,48.08%,0.1,0.0,0.1
MLP,62.50%,62.50%,39.06%,48.08%,0.0,0.0,0.0
MLP (Keras),62.50%,62.50%,62.50%,62.50%,13.9,1.2,15.0
GRU (Keras),62.50%,62.50%,62.50%,62.50%,26.3,3.5,29.8
