**Importing Libraries**

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, WordNetLemmatizer
import nltk
import re 
nltk.download('wordnet')
import pandas as pd
import numpy as np

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Fetch the NLTK stop-word corpus; it is read later through stopwords.words("english").
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fetch the Punkt tokenizer data.
# NOTE(review): presumably needed by word_tokenize in the cleaning step — confirm for the NLTK version in use.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Hard-coded location of the stock-market sentiment CSV.
DATA_PATH = '/content/drive/MyDrive/Kaggel Competition/Stock-Market Sentiment Dataset/stock_data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to check the loaded columns.
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


The sentiment labels are 1 and -1; we recode -1 as 0 and keep 1 unchanged, so the two classes become 1 and 0.

In [None]:
# Recode the labels: -1 becomes 0, every other value becomes 1.
sentiment = [0 if label == -1 else 1 for label in data['Sentiment']]

In [None]:
# Overwrite the original label column with the recoded 0/1 list.
data['Sentiment'] =  sentiment

In [None]:
# Lower-case the message text. The column is selected by name rather than by
# position, so a change in the CSV column order cannot lower-case the wrong column.
data['Text'] = data['Text'].str.lower()

In [None]:
## NLP processing helpers
from nltk.corpus import stopwords

# English stop words, held in a set for membership checks during cleaning.
stopwordSet = set(stopwords.words("english"))
# Lemmatizer applied to every kept token in the cleaning step.
lemma = WordNetLemmatizer()
# Porter stemmer instance; not referenced by the cells visible in this part of the notebook.
ps = PorterStemmer()

In [None]:
## Clean the text
def clean_text(raw_text):
    """Return `raw_text` as a space-joined string of lemmatized, stop-word-free tokens.

    Steps: strip URLs, replace every non-letter with a space, lower-case,
    tokenize, drop English stop words, lemmatize, and join with single spaces.
    """
    # Remove URL links. The match runs up to the next whitespace so links that
    # contain '-', '_', '?', '=' and similar characters are removed whole
    # instead of leaving fragments behind as tokens.
    text = re.sub(r"https?://\S+", ' ', raw_text)
    # Replace anything apart from letters with a space
    text = re.sub('[^a-zA-Z]', " ", text)
    text = text.lower()
    tokens = word_tokenize(text, language="english")
    return " ".join(lemma.lemmatize(word) for word in tokens if word not in stopwordSet)


# Iterate over the column values directly so the loop does not depend on the
# DataFrame index being exactly 0..len(data)-1.
text_reviews = [clean_text(raw_text) for raw_text in data['Text']]

In [None]:
# Preview the frame again. The cleaned strings are kept in the separate
# `text_reviews` list, so `data['Text']` shown here is only lower-cased.
data.head()

Unnamed: 0,Text,Sentiment
0,kickers on my watchlist xide tit soq pnk cpw b...,1
1,user: aap movie. 55% return for the fea/geed i...,1
2,user i'd be afraid to short amzn - they are lo...,1
3,mnta over 12.00,1
4,oi over 21.37,1


In [None]:
# (rows, columns) of the frame.
data.shape

(5791, 2)

# Applying Naive Bayes using Bag of Words

In [None]:
## Bag-of-words (B.O.W) features
# Vocabulary capped at 3000 terms.
# NOTE(review): the vectorizer is fitted on every message, including the rows
# that later land in the test split.
cv = CountVectorizer(max_features=3000)
bow_counts = cv.fit_transform(text_reviews)
# Dense count matrix, one row per cleaned message.
X = bow_counts.toarray()
# Recoded 0/1 labels used as the target.
y = data['Sentiment']

In [None]:
# Inspect the last five rows of the frame.
data.tail()

Unnamed: 0,Text,Sentiment
5786,industry body cii said #discoms are likely to ...,0
5787,"#gold prices slip below rs 46,000 as #investor...",0
5788,workers at bajaj auto have agreed to a 10% wag...,1
5789,"#sharemarket live: sensex off day’s high, up 6...",1
5790,"#sensex, #nifty climb off day's highs, still u...",1


In [None]:
## Hold out 20% of the rows as the test set; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
## Multinomial Naive Bayes on the bag-of-words counts
# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(X_train, y_train)
Y_pred = clf.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the held-out rows.
nb_bow_report = classification_report(y_test, Y_pred)
print(nb_bow_report)

              precision    recall  f1-score   support

           0       0.71      0.72      0.72       417
           1       0.84      0.83      0.84       742

    accuracy                           0.79      1159
   macro avg       0.78      0.78      0.78      1159
weighted avg       0.79      0.79      0.79      1159



# Hyperparameter tuning for Naive Bayes

In [None]:
# Sweep the smoothing parameter alpha over 0.0, 0.1, ..., 0.9 and keep the most
# accurate model in `classifier`.
# NOTE(review): the score used for selection is computed on the test split, so
# the best accuracy reported here is an optimistic estimate.
# NOTE(review): the grid includes alpha=0.0; the captured output contains a
# scikit-learn warning fragment about resetting alpha to a minimum value.
previous_score = 0
from sklearn import metrics

for alpha in np.arange(0, 1, 0.1):
    sub_classifier = MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train, y_train)
    y_pred = sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score > previous_score:
        # Record the best score seen so far. Without this update every model
        # beats the initial 0 and `classifier` would simply be the last alpha tried.
        previous_score = score
        classifier = sub_classifier
    print("Alpha: {}, Score : {}".format(alpha, score))

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.0, Score : 0.7437446074201898
Alpha: 0.1, Score : 0.7756686798964625
Alpha: 0.2, Score : 0.7808455565142364
Alpha: 0.30000000000000004, Score : 0.7877480586712683
Alpha: 0.4, Score : 0.7868852459016393
Alpha: 0.5, Score : 0.7929249352890423
Alpha: 0.6000000000000001, Score : 0.7929249352890423
Alpha: 0.7000000000000001, Score : 0.7929249352890423
Alpha: 0.8, Score : 0.7920621225194133
Alpha: 0.9, Score : 0.7903364969801553


# Applying Random Forest using Bag of Words

In [None]:
# Random forest (100 trees) on the bag-of-words counts. The seed is fixed so the
# reported accuracy can be reproduced, in line with the seeded train/test split.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)

In [None]:
# Share of test rows the random forest labels correctly.
rf_accuracy = accuracy_score(y_test, Y_pred)
print("accuracy: {}".format(rf_accuracy))

accuracy: 0.7886108714408974


# Applying Naive Bayes using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features built from bigrams only (ngram_range = (2,2)).
# Despite its name, `train_dataset` holds every message; it is split into
# training and test parts in the next cell.
tfvector = TfidfVectorizer(ngram_range = (2,2))
train_dataset = tfvector.fit_transform(text_reviews)
train_dataset.shape

(5791, 34264)

In [None]:
## Hold out 20% of the TF-IDF rows as the test set; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    train_dataset,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
## Multinomial Naive Bayes on the TF-IDF bigram features
# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(X_train, y_train)
Y_pred = clf.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the TF-IDF model on the held-out rows.
nb_tfidf_report = classification_report(y_test, Y_pred)
print(nb_tfidf_report)

              precision    recall  f1-score   support

           0       0.87      0.18      0.29       417
           1       0.68      0.99      0.80       742

    accuracy                           0.69      1159
   macro avg       0.77      0.58      0.55      1159
weighted avg       0.75      0.69      0.62      1159



In [None]:
# Sweep the smoothing parameter alpha over 0.0, 0.1, ..., 0.9 on the TF-IDF
# features and keep the most accurate model in `classifier`.
# NOTE(review): the score used for selection is computed on the test split, so
# the best accuracy reported here is an optimistic estimate.
previous_score = 0
from sklearn import metrics

for alpha in np.arange(0, 1, 0.1):
    sub_classifier = MultinomialNB(alpha=alpha)
    sub_classifier.fit(X_train, y_train)
    y_pred = sub_classifier.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    if score > previous_score:
        # Record the best score seen so far. Without this update every model
        # beats the initial 0 and `classifier` would simply be the last alpha tried.
        previous_score = score
        classifier = sub_classifier
    print("Alpha: {}, Score : {}".format(alpha, score))

Alpha: 0.0, Score : 0.6229508196721312
Alpha: 0.1, Score : 0.6324417601380501
Alpha: 0.2, Score : 0.6798964624676446
Alpha: 0.30000000000000004, Score : 0.7213114754098361
Alpha: 0.4, Score : 0.734253666954271
Alpha: 0.5, Score : 0.728213977566868
Alpha: 0.6000000000000001, Score : 0.72648835202761
Alpha: 0.7000000000000001, Score : 0.7161345987920621
Alpha: 0.8, Score : 0.7100949094046591
Alpha: 0.9, Score : 0.6980155306298533


  'setting alpha = %.1e' % _ALPHA_MIN)


# Embedding Layer and LSTM

In [None]:
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot
### Vocabulary size
voc_size=10000
# Encode each cleaned message as a list of hashed word indices below voc_size.
# `one_hot` relies on Python's built-in hash(), which is salted per process, so
# the word -> index mapping would change between kernel runs. The md5 variant
# of the same hashing trick gives a stable mapping.
onehot_repr = [hashing_trick(words, voc_size, hash_function='md5') for words in text_reviews]

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# Fixed sequence length fed to the embedding layers.
# NOTE(review): 1000 looks far longer than the short messages shown in the
# previews — confirm against the actual encoded lengths.
sent_length = 1000
# Pad at the front ('pre') so every encoded message has sent_length entries.
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')

In [None]:
## Split the padded sequences and the label array into training and test sets (80/20, seeded)
X_train, X_test, y_train, y_test = train_test_split(
    embedded_docs,
    data['Sentiment'].values,
    test_size=0.2,
    random_state=0,
)

In [None]:
import keras
from keras import layers

In [None]:
# Embedding -> single LSTM -> small dense head with a sigmoid output for the
# binary sentiment label.
model = keras.models.Sequential()
# The embedding table is sized from voc_size, the same range used to encode the
# text, rather than a separate hard-coded 100000 that no index can reach.
model.add(layers.Embedding(voc_size, 64, input_length=sent_length))
model.add(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): the test split doubles as the validation data during training.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f087bfdf668>

# Bidirectional LSTM

In [None]:
## Creating model
from tensorflow.keras.layers import Bidirectional,LSTM,Dropout,Dense
# Width of each word embedding vector.
embedding_vector_features=40
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
# Four stacked bidirectional LSTMs (500, 250, 100, 50 units per direction),
# each followed by 30% dropout. The first three return full sequences so the
# next recurrent layer receives one vector per time step.
model1.add(Bidirectional(LSTM(500,return_sequences=True)))
model1.add(Dropout(0.3))
model1.add(Bidirectional(LSTM(250,return_sequences=True)))
model1.add(Dropout(0.3))
model1.add(Bidirectional(LSTM(100,return_sequences=True)))
model1.add(Dropout(0.3))
model1.add(Bidirectional(LSTM(50)))
model1.add(Dropout(0.3))

# Single sigmoid unit for the binary sentiment label.
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would emit a stray "None".
model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 40)          400000    
_________________________________________________________________
bidirectional (Bidirectional (None, 1000, 1000)        2164000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000, 1000)        0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1000, 500)         2502000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 1000, 500)         0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1000, 200)         480800    
_________________________________________________________________
dropout_4 (Dropout)          (None, 1000, 200)        

In [None]:
### Train the bidirectional model for 25 epochs, with the test split passed as validation data
model1.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=25,
    batch_size=64,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f5ed47c2e10>

In [None]:
# Score the trained bidirectional model on the test split. The displayed list
# is [loss, accuracy], following the loss and metric passed to compile().
model1.evaluate(X_test, y_test)



[1.4510154724121094, 0.7342536449432373]