In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import nltk
nltk.download('stopwords') #downloads the required list of words that we need to remove
from nltk.corpus import stopwords # importing 'stopwords' to this notebook
from nltk.stem import WordNetLemmatizer # For Lemmitization

import joblib

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Prepration

In [2]:
# 1st Data
data1= pd.read_csv('../input/stockmarket-sentiment-dataset/stock_data.csv')
data1.head(2)

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1


In [3]:
# Changing (-1) to (0)
for i in range(len(data1)):
    if data1["Sentiment"][i]== -1:
        data1["Sentiment"][i]= 0
display(data1.head())
display(data1.tail())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


Unnamed: 0,Text,Sentiment
5786,Industry body CII said #discoms are likely to ...,0
5787,"#Gold prices slip below Rs 46,000 as #investor...",0
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1
5790,"#Sensex, #Nifty climb off day's highs, still u...",1


In [4]:
# Checking Null values
data1.isnull().any()

Text         False
Sentiment    False
dtype: bool

In [5]:
# 2nd Data
data2= pd.read_csv("../input/sentiment-analysis-for-financial-news/all-data.csv", encoding='ISO-8859-1', header=None)
data2.columns =['Sentiment' ,'Text']
data2.head(2)

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...


In [6]:
# Assigning label numbers
for i in range(len(data2)):
    if data2["Sentiment"][i]== "neutral":
        data2["Sentiment"][i]= 2
        
    elif data2["Sentiment"][i]== "negative":
        data2["Sentiment"][i]= 0
        
    elif data2["Sentiment"][i]== "positive":
        data2["Sentiment"][i]= 1
        
display(data2.head())
display(data2.tail())
data2= data2[["Text", "Sentiment"]]

Unnamed: 0,Sentiment,Text
0,2,"According to Gran , the company has no plans t..."
1,2,Technopolis plans to develop in stages an area...
2,0,The international electronic industry company ...
3,1,With the new production plant the company woul...
4,1,According to the company 's updated strategy f...


Unnamed: 0,Sentiment,Text
4841,0,LONDON MarketWatch -- Share prices ended lower...
4842,2,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,0,Operating profit fell to EUR 35.4 mn from EUR ...
4844,0,Net sales of the Paper segment decreased to EU...
4845,0,Sales in Finland decreased by 10.5 % in Januar...


In [7]:
# Joining the 2 datasets
data= pd.concat([data1, data2], axis=0, sort=False) # Joining date_lable and tops

# Setting New Index for data_new
data= data.set_index(i for i in range(0, len(data)))

display(data.head())
display(data.tail())

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


Unnamed: 0,Text,Sentiment
10632,LONDON MarketWatch -- Share prices ended lower...,0
10633,Rinkuskiai 's beer sales fell by 6.5 per cent ...,2
10634,Operating profit fell to EUR 35.4 mn from EUR ...,0
10635,Net sales of the Paper segment decreased to EU...,0
10636,Sales in Finland decreased by 10.5 % in Januar...,0


## To make the final classifier model suitable for new dataset(DJIA data), we will need to add those dataset text while creating corpus. Else it will give the following kind of error:
 **ValueError: Number of features of the model must match the input. Model n_features is 43 and input n_features is 2250** 
 
 ## But during the sentiment training process, we dont want the DJIA data text. And hence we will remove it after creating corpus(X_selected will not contain DJIA data information) 

In [8]:
# Copying the whole preprocessing from "DJIA News based performance"

data_DJIA= pd.read_csv('../input/stocknews/Combined_News_DJIA.csv')

# Making 1 day past news uselful for next day
date_label= data_DJIA[['Date', 'Label']]
tops= data_DJIA.drop(columns= ['Date', 'Label'])

date_label= date_label.loc[1:len(date_label)] # Removing 1st Date Entry
tops=tops.loc[0:len(date_label)-1] # Removing last News Entry

date_label= date_label.set_index(i for i in range(0, len(date_label))) # Setting Index for date_label
tops= tops.set_index(i for i in range(0, len(tops))) # Setting Index for tops

data_new= pd.concat([date_label, tops], axis=1, sort=False) # Joining date_lable and tops

# Finding Missing Values
miss_value_row_list=[]
for i in tops.columns:
    for j in range(len(data_new)):
        if type(data_new[str(i)][j]) is str: # Non-missing values will have str type
            continue
        else: 
            miss_value_row_list.append(j)
            
    miss_value_row_list= list(set(miss_value_row_list)) # Removing repeating elements(row number) 

# Removing rows with missing entries
data_new.drop(miss_value_row_list, inplace = True)

# Setting New Index for data_new
data_new= data_new.set_index(i for i in range(0, len(tops) - len(miss_value_row_list))) # After 

# Converting all the encoded news text(b'') to normal str 
for i in tops.columns:
    for j in range(len(data_new)):
        if data_new[str(i)][j][0]== 'b' and (data_new[str(i)][j][1]== '"' or data_new[str(i)][j][1]=="'"): # It is encoded text if entry: b'_' or b"_"
            data_new[str(i)][j]= data_new[str(i)][j][2:-1] # Removing b'' and b""
            
# Combining all test of Top1, Top2..., Top25 in date-wise order  
top25_news= []
for i in range(len(data_new)):
    news_list=[]
    for j in tops.columns:
        news= data_new[str(j)][i]
        news_list.append(news)
    news_list= ' '.join(news_list)
    top25_news.append(news_list)
    
# Adding new column to data_new with all comments stacked tohgether for a particular date
data_new.insert(2, "Text", top25_news) # As our sentiment data contains text in column named 'Text'

# Creating a new dataset(data_final) with just the required columns
data_all_news= data_new[["Date", "Label", "Text"]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
data_all_news["Text"].shape, data['Text'].shape

((1985,), (10637,))

**Hence we need to remove the 1985 elements from array**

In [10]:
for_corpus= pd.concat([data[['Text']], data_all_news[["Text"]]], axis=0, sort=False) # Enclosing [[ ]] means we are assigning it as pd dataframe, not pd series
for_corpus= for_corpus.set_index(i for i in range(0, len(data) + len(data_all_news)))
len(for_corpus) # should be 10637 + 1985= 12622

12622

In [11]:
for_corpus

Unnamed: 0,Text
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...
2,user I'd be afraid to short AMZN - they are lo...
3,MNTA Over 12.00
4,OI Over 21.37
...,...
12617,David Cameron to Resign as PM After EU Referen...
12618,Barclays and RBS shares suspended from trading...
12619,"2,500 Scientists To Australia: If You Want To ..."
12620,Explosion At Airport In Istanbul Yemeni former...


# NLP

In [12]:
# Cleanig all the news(stacked together)
corpus=[]
for i in range(len(for_corpus)):
    news= re.sub('[^a-zA-Z]', ' ', for_corpus['Text'][i])
    news= news.lower()
    news= news.split()
    news=[word for word in news if not word in set(stopwords.words('english'))]
    lemmatizer = WordNetLemmatizer()
    news=[lemmatizer.lemmatize(word)for word in news if not word in set(stopwords.words('english'))]
    news= ' '.join(news)
    corpus.append(news)

In [13]:
# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=0.02, max_df=0.175, max_features = 50000, ngram_range = (1, 1))
X = tfidf.fit_transform(corpus).toarray()

In [14]:
len(X)

12622

In [15]:
# Selecting only the sentiment analysis data, and removing the DJIA data
'''
    A the operation done while joining two dataframes was 
    pd.concat([data[['Text']], data_all_news[["Text"]]].
    This means the first 10637 entries were from sentiment
    analyss data and the last 1985 were from DJIA data.
    Hence we will slice the X and take first 10637 entries 
    for training and testing classifier.
'''
X_selected= X[:10637]
len(X_selected)==len(data) # This will return true if the array is sliced just for sentiment data

True

In [16]:
# Saving TF-IDF model for future
joblib.dump(tfidf, 'TFIDF_model.pkl')

['TFIDF_model.pkl']

In [17]:
# Target variable
Y= data["Sentiment"].values
Y= Y.astype('int')
display(Y)

array([1, 1, 1, ..., 0, 0, 0])

# Data Splitting

In [18]:
#Spliting Dataset into Training and Test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test= train_test_split(X_selected, Y, test_size=0.10, random_state=0)

# Classification Algorithms

In [19]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier 
classifier= RandomForestClassifier(n_estimators= 500, criterion= 'entropy', random_state= 0) 
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 0, ..., 1, 1, 1])

array([[121, 107,  31],
       [ 72, 347, 100],
       [ 10,  73, 203]])

0.723338485316847

In [20]:
# Decision Tree  
from sklearn.tree import DecisionTreeClassifier
classifier= DecisionTreeClassifier(criterion= 'entropy', random_state= 0)
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 0, ..., 1, 1, 1])

array([[119, 112,  28],
       [101, 340,  78],
       [ 23,  98, 165]])

0.6830357142857143

In [21]:
# Kernel SVM
from sklearn.svm import SVC 
classifier= SVC(kernel='rbf', random_state= 0)
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 0, ..., 1, 1, 1])

array([[112, 118,  29],
       [ 55, 369,  95],
       [  6,  78, 202]])

0.735474006116208

In [22]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
classifier= KNeighborsClassifier(n_neighbors= 7, metric='minkowski', p=2, leaf_size=70, weights= 'distance', algorithm= 'brute')
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 0, ..., 0, 1, 0])

array([[119, 112,  28],
       [ 87, 355,  77],
       [ 23,  93, 170]])

0.7043090638930164

In [23]:
# Multinomial NB
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()
classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 1, ..., 1, 1, 1])

array([[ 75, 152,  32],
       [ 39, 384,  96],
       [  4, 103, 179]])

0.7061538461538461

In [24]:
# SGDC
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss='modified_huber', random_state=0, shuffle=True)
classifier = classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([0, 2, 1, ..., 1, 1, 1])

array([[ 92, 128,  39],
       [ 51, 370,  98],
       [  2,  86, 198]])

0.7207488299531981

In [25]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier()
classifier = classifier.fit(X_train, Y_train)

# Predicting the test set result
y_pred= classifier.predict(X_test)
display(y_pred)

# Making the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(Y_test, y_pred)
display(cm)

Accuracy= (cm[0][0] + cm[1][1])/(cm[0][0] + cm[1][1] + cm[0][1] + cm[1][0])
display(Accuracy)

array([1, 2, 0, ..., 1, 1, 1])

array([[109, 131,  19],
       [ 52, 406,  61],
       [  0, 158, 128]])

0.7378223495702005

 Gradient Boosting Classifier gave the best test accuracy

In [26]:
# Saving the GradientBoostingClassifier for future
joblib.dump(classifier, "txt_sentiment.pkl")

['txt_sentiment.pkl']

# Predicting Class Probablities

In [27]:
# for probabilities
l= classifier.predict_proba(X_test)

In [28]:
l[1]

array([0.10366351, 0.23244337, 0.66389312])

**Classes:-** 0,1,2 i.e (negative), (positive), (nutral)

In [29]:
l_1= classifier.predict_proba(X_test[1].reshape(1, -1))
display(l_1)

array([[0.10366351, 0.23244337, 0.66389312]])

In [30]:
l_1==l[1]

array([[ True,  True,  True]])

In [31]:
k= classifier.predict_proba(X_test[1].reshape(1, -1))
k[0][0]

0.10366351126920154