In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import nltk
from nltk.corpus import stopwords # importing 'stopwords' to this notebook
from nltk.stem.porter import PorterStemmer ## stemming of words

import joblib

# Data Prepration

In [2]:
# 1st Data
data1= pd.read_csv('../input/stockmarket-sentiment-dataset/stock_data.csv')
data1.head(2)

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1


In [3]:
# Changing (-1) to (0)
for i in range(len(data1)):
    if data1["Sentiment"][i]== -1:
        data1["Sentiment"][i]= 0
display(data1.head())
display(data1.tail())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


Unnamed: 0,Text,Sentiment
5786,Industry body CII said #discoms are likely to ...,0
5787,"#Gold prices slip below Rs 46,000 as #investor...",0
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1
5790,"#Sensex, #Nifty climb off day's highs, still u...",1


In [4]:
# Checking Null values
data1.isnull().any()

Text         False
Sentiment    False
dtype: bool

In [5]:
# 2nd Data
data2= pd.read_csv("../input/sentiment-analysis-for-financial-news/all-data.csv", encoding='ISO-8859-1', header=None)
data2.columns =['Sentiment' ,'Text']
data2.head(2)

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...


In [6]:
# Assigning label numbers
for i in range(len(data2)):
    if data2["Sentiment"][i]== "neutral":
        data2["Sentiment"][i]= 2
        
    elif data2["Sentiment"][i]== "negative":
        data2["Sentiment"][i]= 0
        
    elif data2["Sentiment"][i]== "positive":
        data2["Sentiment"][i]= 1
        
display(data2.head())
display(data2.tail())
data2= data2[["Text", "Sentiment"]]

Unnamed: 0,Sentiment,Text
0,2,"According to Gran , the company has no plans t..."
1,2,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,1,With the new production plant the company woul...
4,1,According to the company 's updated strategy f...


Unnamed: 0,Sentiment,Text
4841,0,LONDON MarketWatch -- Share prices ended lower...
4842,2,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,0,Operating profit fell to EUR 35.4 mn from EUR ...
4844,0,Net sales of the Paper segment decreased to EU...
4845,0,Sales in Finland decreased by 10.5 % in Januar...


In [7]:
# Joining the 2 datasets
data= pd.concat([data1, data2], axis=0, sort=False) # Joining date_lable and tops

# Setting New Index for data_new
data= data.set_index(i for i in range(0, len(data)))

display(data.head())
display(data.tail())

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


Unnamed: 0,Text,Sentiment
10632,LONDON MarketWatch -- Share prices ended lower...,0
10633,Rinkuskiai 's beer sales fell by 6.5 per cent ...,2
10634,Operating profit fell to EUR 35.4 mn from EUR ...,0
10635,Net sales of the Paper segment decreased to EU...,0
10636,Sales in Finland decreased by 10.5 % in Januar...,0


# NLP

In [8]:
# Cleanig all the news(stacked together)
def nlp_preprocess(text):
# corpus=[]
    news= re.sub('[^a-zA-Z]', ' ', text)
    news= news.lower()
    news= news.split()
    news=[word for word in news if not word in set(stopwords.words('english'))]
    pe = PorterStemmer()
    news=[pe.stem(word) for word in news if not word in set(stopwords.words('english'))]
    news= ' '.join(news)
    # corpus.append(news)
    return news

In [9]:
# Target Variable
Y= data["Sentiment"].values
Y= Y.astype('int')
display(Y)

# Input Variable
txt= data["Text"].values

array([1, 1, 1, ..., 0, 0, 0])

# Data Splitting

In [10]:
#Spliting Dataset into Training and Test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(txt, Y, test_size=0.10, random_state=0)

In [11]:
# Train data
train_corpus= []
for i in range(len(X_train)):
    train_corpus.append(nlp_preprocess(X_train[i]))
    
# Test data
test_corpus= []
for i in range(len(X_test)):
    test_corpus.append(nlp_preprocess(X_test[i]))

In [12]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=50000)

# Train data
trainX = cv.fit_transform(train_corpus).toarray()

# Test data
testX= cv.transform(test_corpus).toarray()

In [13]:
X_train= trainX
X_test= testX

In [14]:
# Saving TF-IDF model for future
joblib.dump(cv, 'cv_model.pkl')

['cv_model.pkl']

# Classification Algorithms

In [15]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier 
classifier= RandomForestClassifier(n_estimators= 500, criterion= 'entropy', random_state= 0) 
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 1, ..., 0, 1, 1])

array([[149,  83,  27],
       [ 51, 376,  92],
       [  3,  43, 240]])

0.7966616084977238

In [16]:
# Decision Tree  
from sklearn.tree import DecisionTreeClassifier
classifier= DecisionTreeClassifier(criterion= 'entropy', random_state= 0)
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 1, ..., 0, 1, 1])

array([[150,  87,  22],
       [ 70, 373,  76],
       [ 17,  61, 208]])

0.7691176470588236

In [17]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
classifier= KNeighborsClassifier(n_neighbors= 7, metric='minkowski', p=2, leaf_size=70, weights= 'distance', algorithm= 'brute')
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 1, ..., 0, 1, 1])

array([[ 89, 165,   5],
       [ 40, 451,  28],
       [  4, 227,  55]])

0.7248322147651006

In [18]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier()
classifier = classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 0, ..., 0, 1, 1])

array([[127, 115,  17],
       [ 44, 418,  57],
       [  2, 135, 149]])

0.7741477272727273

In [19]:
# SGDC
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss='modified_huber', random_state=0, shuffle=True)
classifier = classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 1, ..., 0, 1, 0])

array([[172,  72,  15],
       [ 71, 387,  61],
       [ 22,  52, 212]])

0.7962962962962963

In [20]:
# Multinomial NB
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 1, ..., 0, 1, 1])

array([[155,  79,  25],
       [ 53, 368,  98],
       [ 10,  30, 246]])

0.7984732824427481

Multinomial NB Classifier gave the best test accuracy

In [21]:
# Saving the GradientBoostingClassifier for future
joblib.dump(classifier, "Txt_sentiment.pkl")

['Txt_sentiment.pkl']

# Predicting Class Probablities

In [22]:
# for probabilities
l= classifier.predict_proba(X_test)

In [23]:
l[1]

array([8.84129130e-05, 2.15854840e-03, 9.97753039e-01])

**Classes:-** 0,1,2 i.e (negative), (positive), (nutral)

In [24]:
l_1= classifier.predict_proba(X_test[1].reshape(1, -1))
display(l_1)

array([[8.84129130e-05, 2.15854840e-03, 9.97753039e-01]])

In [25]:
l_1==l[1]

array([[False, False,  True]])

In [26]:
k= classifier.predict_proba(X_test[1].reshape(1, -1))
k[0][0]

8.841291303043169e-05