## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Importing News Articles data
news_articles = pd.read_csv('News_articles_dataset.csv', parse_dates=['Date'])
news_articles.head()

Unnamed: 0,Date,Headlines
0,2015-01-01,What Can We Expect From Apple Inc. In 2015? 1 ...
1,2015-02-01,What to expect from Apple in 2015 beyond its s...
2,2015-03-01,Mountie: An inexpensive and innovative way to ...
3,2015-04-01,"Donald Yacktman on the Sources of Moats, His C..."
4,2015-05-01,"SIM-free iPhone 6, 6 Plus reportedly debuting ..."


In [3]:
news_articles.isnull().sum()

Date         0
Headlines    0
dtype: int64

In [4]:
# Importing Historical data of the stocks
historical_data = pd.read_csv('HistoricalData_APPLE.csv', parse_dates=['Date'])
historical_data.head()

Unnamed: 0,Date,Close/Last,Volume,Open,High,Low
0,2021-06-09,$127.13,56877940,$127.21,$127.75,$126.52
1,2021-06-08,$126.74,74403770,$126.60,$128.46,$126.21
2,2021-06-07,$125.90,71057550,$126.17,$126.32,$124.83
3,2021-06-04,$125.89,75169340,$124.07,$126.16,$123.85
4,2021-06-03,$123.54,76229170,$124.68,$124.85,$123.13


## Making necessary changes to the datasets and merging them together

In [5]:
# Sorting the Historical data according to 'Date' column
historical_data.sort_values('Date',inplace=True)
historical_data.head()

Unnamed: 0,Date,Close/Last,Volume,Open,High,Low
2515,2011-06-10,$11.64,433801306,$11.81,$11.85,$11.63
2514,2011-06-13,$11.66,329376468,$11.69,$11.73,$11.61
2513,2011-06-14,$11.87,333995906,$11.79,$11.90,$11.76
2512,2011-06-15,$11.67,395841722,$11.78,$11.80,$11.60
2511,2011-06-16,$11.61,507299317,$11.68,$11.74,$11.37


In [6]:
# Reset the index
historical_data.reset_index()

Unnamed: 0,index,Date,Close/Last,Volume,Open,High,Low
0,2515,2011-06-10,$11.64,433801306,$11.81,$11.85,$11.63
1,2514,2011-06-13,$11.66,329376468,$11.69,$11.73,$11.61
2,2513,2011-06-14,$11.87,333995906,$11.79,$11.90,$11.76
3,2512,2011-06-15,$11.67,395841722,$11.78,$11.80,$11.60
4,2511,2011-06-16,$11.61,507299317,$11.68,$11.74,$11.37
...,...,...,...,...,...,...,...
2511,4,2021-06-03,$123.54,76229170,$124.68,$124.85,$123.13
2512,3,2021-06-04,$125.89,75169340,$124.07,$126.16,$123.85
2513,2,2021-06-07,$125.90,71057550,$126.17,$126.32,$124.83
2514,1,2021-06-08,$126.74,74403770,$126.60,$128.46,$126.21


In [7]:
historical_data.columns

Index(['Date', 'Close/Last', 'Volume', 'Open', 'High', 'Low'], dtype='object')

In [8]:
# Merging the 2 datasets news_articles and historical_data into one dataset
dataset = pd.merge(news_articles, historical_data, how='inner', on=['Date'])
dataset.head()

Unnamed: 0,Date,Headlines,Close/Last,Volume,Open,High,Low
0,2015-04-01,"Donald Yacktman on the Sources of Moats, His C...",$31.06,161852560,$31.21,$31.28,$30.78
1,2015-05-01,"SIM-free iPhone 6, 6 Plus reportedly debuting ...",$32.24,229396000,$31.53,$32.53,$31.33
2,2015-06-01,"Technology Transforming Cars Into ""Phones On W...",$32.63,128064080,$32.80,$32.85,$32.51
3,2015-07-01,Apple issues fourth developer beta of OS X 10....,$31.65,120827560,$31.73,$31.74,$31.50
4,2015-09-01,Samsung is reportedly making a secret new chip...,$26.93,306248680,$27.54,$27.97,$26.84


In [9]:
# Checking Datatypes of all the columns of the dataset dataframe
dataset.dtypes

Date          datetime64[ns]
Headlines             object
Close/Last            object
Volume                 int64
Open                  object
High                  object
Low                   object
dtype: object

In [10]:
# Removing $ symbol from Close, Open, High and Low columns to convert them to float values
dataset['Close/Last'] = list(map(lambda x: x[1:], dataset['Close/Last'].values))
dataset['Open'] = list(map(lambda x: x[1:], dataset['Open'].values))
dataset['High'] = list(map(lambda x: x[1:], dataset['High'].values))
dataset['Low'] = list(map(lambda x: x[1:], dataset['Low'].values))

# Converting the above columns to float dataype
dataset['Close/Last'] = dataset['Close/Last'].astype(float)
dataset['Open'] = dataset['Open'].astype(float)
dataset['High'] = dataset['High'].astype(float)
dataset['Low'] = dataset['Low'].astype(float)

In [11]:
dataset.head()

Unnamed: 0,Date,Headlines,Close/Last,Volume,Open,High,Low
0,2015-04-01,"Donald Yacktman on the Sources of Moats, His C...",31.06,161852560,31.21,31.28,30.78
1,2015-05-01,"SIM-free iPhone 6, 6 Plus reportedly debuting ...",32.24,229396000,31.53,32.53,31.33
2,2015-06-01,"Technology Transforming Cars Into ""Phones On W...",32.63,128064080,32.8,32.85,32.51
3,2015-07-01,Apple issues fourth developer beta of OS X 10....,31.65,120827560,31.73,31.74,31.5
4,2015-09-01,Samsung is reportedly making a secret new chip...,26.93,306248680,27.54,27.97,26.84


In [12]:
# Changing index column to 'Date'
dataset.set_index('Date', inplace = True)
dataset.head()

Unnamed: 0_level_0,Headlines,Close/Last,Volume,Open,High,Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-04-01,"Donald Yacktman on the Sources of Moats, His C...",31.06,161852560,31.21,31.28,30.78
2015-05-01,"SIM-free iPhone 6, 6 Plus reportedly debuting ...",32.24,229396000,31.53,32.53,31.33
2015-06-01,"Technology Transforming Cars Into ""Phones On W...",32.63,128064080,32.8,32.85,32.51
2015-07-01,Apple issues fourth developer beta of OS X 10....,31.65,120827560,31.73,31.74,31.5
2015-09-01,Samsung is reportedly making a secret new chip...,26.93,306248680,27.54,27.97,26.84


In [13]:
# Sorting the index in ascending order of date
dataset.sort_index(ascending = True, inplace = True)
dataset.head()

Unnamed: 0_level_0,Headlines,Close/Last,Volume,Open,High,Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-02,"Apple Loop: iOS 8.1.3 Angers Users, Outlook Ar...",27.33,212575080,27.85,27.86,26.84
2015-01-05,"Apple Watch interest described as &quot;tepid,...",26.56,256843520,27.07,27.16,26.35
2015-01-06,Apple&quot;s TV Service Delayed: Getting Local...,26.57,262729000,26.64,26.86,26.16
2015-01-07,Apple Inc. leaks new iPod colors in iTunes 12....,26.94,159933400,26.8,27.05,26.67
2015-01-08,Nomura Begins Coverage on Apple (AAPL) IBM cou...,27.97,236675040,27.31,28.04,27.18


In [14]:
len(dataset)

1595

In [15]:
# calculate the diff between previous and present days closed price of the stocks
dataset['close_price_diff'] = ''
for i in range(0, len(dataset) - 1):
  dataset['close_price_diff'][i+1] = dataset['Close/Last'][i+1] - dataset['Close/Last'][i]

dataset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0_level_0,Headlines,Close/Last,Volume,Open,High,Low,close_price_diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-02,"Apple Loop: iOS 8.1.3 Angers Users, Outlook Ar...",27.33,212575080,27.85,27.86,26.84,
2015-01-05,"Apple Watch interest described as &quot;tepid,...",26.56,256843520,27.07,27.16,26.35,-0.77
2015-01-06,Apple&quot;s TV Service Delayed: Getting Local...,26.57,262729000,26.64,26.86,26.16,0.01
2015-01-07,Apple Inc. leaks new iPod colors in iTunes 12....,26.94,159933400,26.80,27.05,26.67,0.37
2015-01-08,Nomura Begins Coverage on Apple (AAPL) IBM cou...,27.97,236675040,27.31,28.04,27.18,1.03
...,...,...,...,...,...,...,...
2021-05-28,Apple pushes back launch of podcast subscripti...,124.61,71311110,125.57,125.80,124.55,-0.67
2021-06-01,"TSX nears record high as cannabis, renewables ...",124.28,67637120,125.08,125.35,123.94,-0.33
2021-06-02,Exclusive Social Media App Clubhouse Rises In ...,125.06,59278860,124.28,125.24,124.05,0.78
2021-06-03,Two Patents were fulfilled last week as the Ma...,123.54,76229170,124.68,124.85,123.13,-1.52


In [16]:
dataset.dtypes

Headlines            object
Close/Last          float64
Volume                int64
Open                float64
High                float64
Low                 float64
close_price_diff     object
dtype: object

In [17]:
# Changing the datatype of close_price_diff column to float
dataset['close_price_diff'] = pd.to_numeric(dataset['close_price_diff'], errors='coerce')

In [18]:
# Impact column represent 'Profit' for value 1 and 'Loss' for value 0
dataset['Impact'] = ''
dataset['Impact'] = pd.to_numeric(dataset['Impact'], errors='coerce')
dataset['Impact'] = dataset['close_price_diff'].apply(lambda x: 1 if x > 0 else 0)

In [19]:
dataset.head()

Unnamed: 0_level_0,Headlines,Close/Last,Volume,Open,High,Low,close_price_diff,Impact
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-02,"Apple Loop: iOS 8.1.3 Angers Users, Outlook Ar...",27.33,212575080,27.85,27.86,26.84,,0
2015-01-05,"Apple Watch interest described as &quot;tepid,...",26.56,256843520,27.07,27.16,26.35,-0.77,0
2015-01-06,Apple&quot;s TV Service Delayed: Getting Local...,26.57,262729000,26.64,26.86,26.16,0.01,1
2015-01-07,Apple Inc. leaks new iPod colors in iTunes 12....,26.94,159933400,26.8,27.05,26.67,0.37,1
2015-01-08,Nomura Begins Coverage on Apple (AAPL) IBM cou...,27.97,236675040,27.31,28.04,27.18,1.03,1


In [20]:
dataset.dtypes

Headlines            object
Close/Last          float64
Volume                int64
Open                float64
High                float64
Low                 float64
close_price_diff    float64
Impact                int64
dtype: object

In [21]:
# Dropping the dataset where the closed price difference is null
dataset.drop(dataset.index[[0]], inplace=True)
dataset.head()

Unnamed: 0_level_0,Headlines,Close/Last,Volume,Open,High,Low,close_price_diff,Impact
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-05,"Apple Watch interest described as &quot;tepid,...",26.56,256843520,27.07,27.16,26.35,-0.77,0
2015-01-06,Apple&quot;s TV Service Delayed: Getting Local...,26.57,262729000,26.64,26.86,26.16,0.01,1
2015-01-07,Apple Inc. leaks new iPod colors in iTunes 12....,26.94,159933400,26.8,27.05,26.67,0.37,1
2015-01-08,Nomura Begins Coverage on Apple (AAPL) IBM cou...,27.97,236675040,27.31,28.04,27.18,1.03,1
2015-01-09,Google&quot;s (GOOGL) Self-Driving Cars to Hit...,28.0,214582920,28.17,28.31,27.55,0.03,1


## Cleaning the texts

In [22]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(dataset)):
  headlines = re.sub('[^a-zA-Z]', ' ', dataset['Headlines'][i])
  headlines = headlines.lower()
  headlines = headlines.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  headlines = [ps.stem(word) for word in headlines if not word in set(all_stopwords)]
  headlines = ' '.join(headlines)
  corpus.append(headlines)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
print(len(corpus))

1594


## Creating the Bag of Words model

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 20000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## Training the Naive Bayes model on the Training set

In [34]:
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

## Predicting the Test set results

In [35]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 1]
 [0 1]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [0 1]
 [0 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [1 1]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [36]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 63  91]
 [ 64 101]]


0.5141065830721003

## Hyperparameter Tuning

In [29]:
# Applying K-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Mean Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Mean Accuracy: 48.55 %
Standard Deviation: 5.70 %


In [30]:
# Applying GridSearchCv for best parameters
from sklearn.model_selection import GridSearchCV
parameters = {'alpha': [0.25, 0.5, 0.75, 1], 'binarize': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], 'fit_prior': [True, False]}
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 52.78 %
Best Parameters: {'alpha': 0.25, 'binarize': 1, 'fit_prior': True}


### After Hyperparamter tuning

In [37]:
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB(alpha = 0.25, binarize = 1, fit_prior = True)
classifier.fit(X_train, y_train)

BernoulliNB(alpha=0.25, binarize=1, class_prior=None, fit_prior=True)

In [39]:
y_pred = classifier.predict(X_test)

In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[77 77]
 [66 99]]


0.5517241379310345