In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import nltk
from nltk.corpus import stopwords # importing 'stopwords' to this notebook
from nltk.stem.porter import PorterStemmer ## stemming of words

import joblib

In [2]:
data= pd.read_csv('../input/stock-price-and-news-realted-to-it/AppleNewsStock.csv')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,News
0,2006-12-01,13.114285,13.19,12.871428,91.32,13.045714,198769900,WHAT'S ON TONIGHT : 8 P.M. (TLC) ASHLEY JUDD A...
1,2006-12-04,13.125714,13.15,12.928572,91.120003,13.017143,177384200,More on Housing Prices : The broadest governme...
2,2006-12-05,13.092857,13.19,12.981428,91.269997,13.038571,165709600,
3,2006-12-06,12.948571,13.055715,12.81,89.830002,12.832857,159546100,Honoring R.W. Apple in Words and Food : About ...
4,2006-12-07,12.861428,12.928572,12.414286,87.040001,12.434286,251206900,"Homebuilders, and Worries Over Jobs, Lead a De..."


# Data Preparation

In [3]:
# Assigning labels to the entries
# A row is labelled 1 when its adjusted close is greater than or equal to the
# previous row's adjusted close, and 0 otherwise. The first row has no
# predecessor, so it is excluded from the labelled data.
price_change = data["Adj Close"].diff()  # row i minus row i-1; NaN for the first row
# A NaN difference compares False against 0 and therefore maps to label 0,
# exactly as the explicit subtraction did.
label_list = (price_change.iloc[1:] >= 0).astype(int).tolist()

# Work on an explicit copy so adding the column never writes into (or warns
# about) a slice of the original `data` frame.
data1 = data.iloc[1:].copy()
data1.insert(2, "Label", label_list)

data_new = data1[["Label", "News"]]  # Keep only the label and the news text

In [4]:
display(data_new)
# Setting New Index for data: renumber the rows 0..n-1 and discard the old
# labels (which start at 1 because the first row of `data` was dropped).
# reset_index(drop=True) is the documented way to do this, instead of feeding
# a generator of integers to set_index.
data_new = data_new.reset_index(drop=True)

Unnamed: 0,Label,News
1,0,More on Housing Prices : The broadest governme...
2,1,
3,0,Honoring R.W. Apple in Words and Food : About ...
4,0,"Homebuilders, and Worries Over Jobs, Lead a De..."
5,1,"Homebuilders, and Worries Over Jobs, Lead a De..."
...,...,...
2512,0,
2513,1,Fighting iOS Calendar Spam : Unsolicited invit...
2514,0,
2515,0,


In [5]:
# Finding Missing Values
# A row counts as missing when its "News" entry is not a plain `str`.
news_column = data_new["News"]
miss_value_row_list = []

for row_number in range(len(data_new)):
    if type(news_column[row_number]) is not str:
        # Report the row and remember its number for later removal
        print('Row ' + str(row_number))
        miss_value_row_list.append(row_number)

# Collapse any repeated row numbers
miss_value_row_list = list(set(miss_value_row_list))
print('\n')
print("Row numbers with missing valus :" + str(miss_value_row_list))

Row 1
Row 6
Row 13
Row 42
Row 47
Row 57
Row 62
Row 77
Row 81
Row 83
Row 122
Row 128
Row 135
Row 141
Row 159
Row 168
Row 184
Row 213
Row 221
Row 254
Row 258
Row 265
Row 270
Row 278
Row 290
Row 297
Row 316
Row 320
Row 328
Row 351
Row 367
Row 372
Row 379
Row 393
Row 399
Row 417
Row 425
Row 434
Row 453
Row 463
Row 467
Row 472
Row 479
Row 482
Row 503
Row 511
Row 519
Row 521
Row 525
Row 531
Row 572
Row 584
Row 612
Row 618
Row 625
Row 633
Row 647
Row 652
Row 670
Row 682
Row 689
Row 696
Row 699
Row 703
Row 711
Row 731
Row 736
Row 749
Row 772
Row 775
Row 794
Row 834
Row 835
Row 857
Row 871
Row 899
Row 919
Row 942
Row 959
Row 972
Row 978
Row 987
Row 1027
Row 1035
Row 1043
Row 1047
Row 1053
Row 1065
Row 1078
Row 1088
Row 1095
Row 1107
Row 1111
Row 1121
Row 1124
Row 1130
Row 1145
Row 1151
Row 1161
Row 1168
Row 1172
Row 1179
Row 1185
Row 1195
Row 1201
Row 1208
Row 1214
Row 1222
Row 1226
Row 1231
Row 1237
Row 1243
Row 1274
Row 1296
Row 1298
Row 1301
Row 1328
Row 1380
Row 1404
Row 1430
Row 1448
Row 1

In [6]:
# Total Available Data: rows in the frame minus the rows flagged as missing
data_new.shape[0] - len(miss_value_row_list)

2322

In [7]:
# Removing rows with missing entries
# Keep only the rows whose "News" entry is a string. Filtering on the data
# itself (rather than dropping a saved list of row numbers in place) makes this
# cell idempotent: after the index is renumbered below, the saved row numbers
# would point at different, valid rows, so re-running an in-place drop would
# silently delete good data.
has_news = data_new["News"].map(lambda value: isinstance(value, str)).astype(bool)
data_new = data_new.loc[has_news]

# Setting New Index for data: renumber the remaining rows 0..n-1
data_new = data_new.reset_index(drop=True)

# NLP

In [8]:
# Cleaning a single news text (all of one day's news stacked together)
def nlp_preprocess(text):
    """Normalise one news string into a space-separated string of stemmed tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stopwords,
      4. stem each remaining token with the Porter stemmer,
      5. join the stems with single spaces.

    Parameters
    ----------
    text : str
        Raw news text.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives.

    Raises
    ------
    TypeError
        If `text` is not a string (raised by `re.sub`), e.g. a missing value.
    """
    # Build the stopword set once per call instead of rebuilding it for every
    # token; the result is the same, only the repeated work is removed.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Stopwords are filtered before stemming; one pass is sufficient because
    # the membership test is on the unstemmed token.
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [9]:
from sklearn.model_selection import train_test_split

# Target variable: the 0/1 labels as a NumPy array
Y = data_new["Label"].to_numpy()
display(Y)

# Input variable: the raw news strings as a NumPy array
txt = data_new["News"].to_numpy()

# Splitting the dataset into training and test sets (10% held out, fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(
    txt,
    Y,
    test_size=0.10,
    random_state=0,
)

array([0, 0, 0, ..., 1, 1, 1])

In [10]:
# Train data: clean every training news text
train_corpus = [nlp_preprocess(news_text) for news_text in X_train]

# Test data: clean every test news text the same way
test_corpus = [nlp_preprocess(news_text) for news_text in X_test]

In [11]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=65000)

# Train data: fit the vectorizer on the training corpus and encode it,
# then convert the result to a dense array
train_counts = cv.fit_transform(train_corpus)
trainX = train_counts.toarray()

# Test data: encode with the already-fitted vectorizer (no refit), then densify
test_counts = cv.transform(test_corpus)
testX = test_counts.toarray()

In [12]:
# From here on X_train / X_test hold the bag-of-words matrices, not raw text.
X_train, X_test = trainX, testX

In [13]:
# Saving the fitted CountVectorizer to disk
joblib.dump(value=cv, filename="Count_Vec_model.pkl")

['Count_Vec_model.pkl']

# Classification Algorithms

In [14]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

classifier = RandomForestClassifier(
    n_estimators=500,  # number of trees; try different values
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred = classifier.predict(X_test)
display(y_pred)

# Making the confusion matrix from the true and predicted test labels
cm = confusion_matrix(Y_test, y_pred)
display(cm)

# Accuracy = diagonal entries over all four entries of the 2x2 matrix
correct_count = cm[0, 0] + cm[1, 1]
total_count = correct_count + cm[0, 1] + cm[1, 0]
Accuracy = correct_count / total_count
display(Accuracy)

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0])

array([[45, 76],
       [42, 70]])

0.49356223175965663

In [15]:
# Decision Tree
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred = classifier.predict(X_test)
display(y_pred)

# Making the confusion matrix from the true and predicted test labels
cm = confusion_matrix(Y_test, y_pred)
display(cm)

# Accuracy = diagonal entries over all four entries of the 2x2 matrix
correct_count = cm[0, 0] + cm[1, 1]
total_count = correct_count + cm[0, 1] + cm[1, 0]
Accuracy = correct_count / total_count
display(Accuracy)

array([0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1])

array([[60, 61],
       [67, 45]])

0.45064377682403434

In [16]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
# Fixed seed so the run is reproducible, matching the other seeded classifiers
# in this notebook (previously this model was left unseeded).
classifier = GradientBoostingClassifier(random_state=0)
classifier = classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the confusion matrix from the true and predicted test labels
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

# Accuracy = diagonal entries over all four entries of the 2x2 matrix
Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1])

array([[43, 78],
       [38, 74]])

0.5021459227467812

In [17]:
# Kernel SVM (RBF kernel)
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred = classifier.predict(X_test)
display(y_pred)

# Making the confusion matrix from the true and predicted test labels
cm = confusion_matrix(Y_test, y_pred)
display(cm)

# Accuracy = diagonal entries over all four entries of the 2x2 matrix
correct_count = cm[0, 0] + cm[1, 1]
total_count = correct_count + cm[0, 1] + cm[1, 0]
Accuracy = correct_count / total_count
display(Accuracy)

array([0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

array([[46, 75],
       [42, 70]])

0.4978540772532189

In [18]:
# Multinomial NB
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred = classifier.predict(X_test)
display(y_pred)

# Making the confusion matrix from the true and predicted test labels
cm = confusion_matrix(Y_test, y_pred)
display(cm)

# Accuracy = diagonal entries over all four entries of the 2x2 matrix
correct_count = cm[0, 0] + cm[1, 1]
total_count = correct_count + cm[0, 1] + cm[1, 0]
Accuracy = correct_count / total_count
display(Accuracy)

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0])

array([[48, 73],
       [58, 54]])

0.43776824034334766

In [19]:
# SGDC (linear model trained with stochastic gradient descent)
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix

classifier = SGDClassifier(
    loss='modified_huber',
    random_state=0,
    shuffle=True,
).fit(X_train, Y_train)

# Predicting the test set result
y_pred = classifier.predict(X_test)
display(y_pred)

# Making the confusion matrix from the true and predicted test labels
cm = confusion_matrix(Y_test, y_pred)
display(cm)

# Accuracy = diagonal entries over all four entries of the 2x2 matrix
correct_count = cm[0, 0] + cm[1, 1]
total_count = correct_count + cm[0, 1] + cm[1, 0]
Accuracy = correct_count / total_count
display(Accuracy)

array([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0])

array([[60, 61],
       [53, 59]])

0.5107296137339056

In [20]:
# KNN
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(
    n_neighbors=7,
    metric='minkowski',
    p=2,
    leaf_size=70,  # NOTE(review): presumably unused with algorithm='brute' - confirm
    weights='distance',
    algorithm='brute',
)
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred = classifier.predict(X_test)
display(y_pred)

# Making the confusion matrix from the true and predicted test labels
cm = confusion_matrix(Y_test, y_pred)
display(cm)

# Accuracy = diagonal entries over all four entries of the 2x2 matrix
correct_count = cm[0, 0] + cm[1, 1]
total_count = correct_count + cm[0, 1] + cm[1, 0]
Accuracy = correct_count / total_count
display(Accuracy)

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

array([[107,  14],
       [ 92,  20]])

0.5450643776824035

In [21]:
# Saving the classifier for future use.
# `classifier` is whichever model was fitted most recently; when the cells are
# run top to bottom that is the KNN model from the cell above.
joblib.dump(value=classifier, filename="Apple_stock_behaviour.pkl", compress=1)

['Apple_stock_behaviour.pkl']