In [43]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Data import and preprocessing

In [44]:
TeweetData = pd.read_csv(r'./stock_data.csv')
TeweetData=TeweetData.rename(columns={"Text": "text", "Sentiment": "data_sentiment"})

In [45]:
TeweetData.head()

Unnamed: 0,text,data_sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [46]:
TeweetDataSubset = TeweetData[["text","data_sentiment"]]

In [47]:
TeweetDataSubset.head()

Unnamed: 0,text,data_sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [48]:
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raphael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{'most', 'again', 'our', 'whom', 'through', "she's", 'hers', 'same', 'during', "hasn't", "weren't", 'mightn', 'a', 'y', 'against', 'and', "don't", 'up', 'own', "it's", 'them', 'isn', 'her', 'wouldn', 'didn', 'why', 'they', "mightn't", 'below', 'more', 'your', "you'd", 'don', "wasn't", "won't", 'down', 'but', 'not', 'these', 'were', 'itself', 'has', 'won', 'there', "hadn't", 'can', "should've", 'from', 'weren', 'before', 'she', 'being', 'will', 'hadn', 'now', 'yours', 'until', 'hasn', 'or', "you've", 'of', 'that', "you're", 'd', 'was', 'his', 'we', 's', 'few', 'when', 'been', 'himself', 'haven', 'what', 'doesn', 'after', 'while', 'mustn', 'do', 'ma', 'so', 'to', 've', 'm', 'where', 'herself', 'the', "mustn't", 'some', "isn't", 'here', 'myself', 'had', "needn't", 'just', 'their', "that'll", 'as', 'shouldn', 'under', 'no', "doesn't", 'be', 'doing', 'its', 'above', 'between', 'only', 'over', 'once', 'this', 'did', 'those', 'into', 'on', "shan't", 'have', 're', 'themselves', "shouldn't", 'o

In [50]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [51]:
wordnet = WordNetLemmatizer()
def text_preproc(x):
  x = x.lower()
  x = ' '.join([word for word in x.split(' ') if word not in stop_words])
  x = x.encode('ascii', 'ignore').decode()
  x = re.sub(r'https*\S+', ' ', x)
  x = re.sub(r'@\S+', ' ', x)
  x = re.sub(r'#\S+', ' ', x)
  x = re.sub(r'\'\w+', '', x)
  x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
  x = re.sub(r'\w*\d+\w*', '', x)
  x = re.sub(r'\s{2,}', ' ', x)
  return x

In [52]:
TeweetDataSubset.head(10)

Unnamed: 0,text,data_sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
5,PGNX Over 3.04,1
6,AAP - user if so then the current downtrend wi...,-1
7,Monday's relative weakness. NYX WIN TIE TAP IC...,-1
8,GOOG - ower trend line channel test & volume s...,1
9,AAP will watch tomorrow for ONG entry.,1


In [53]:
TeweetDataSubset['clean_text'] = TeweetDataSubset.text.apply(text_preproc)

In [54]:
TeweetDataSubset.head()

Unnamed: 0,text,data_sentiment,clean_text
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers watchlist xide tit soq pnk cpw bpz aj ...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user aap movie return fea geed indicator trade...
2,user I'd be afraid to short AMZN - they are lo...,1,user i afraid short amzn looking like near mon...
3,MNTA Over 12.00,1,mnta
4,OI Over 21.37,1,oi


In [55]:
#TeweetDataSubset.data_sentiment = TeweetDataSubset.data_sentiment.map( {'positive':1 , 'negative':0} )
# TeweetDataSubset.data_sentiment = TeweetDataSubset.data_sentiment.map( {'1':1 , '-1':0} )
# TeweetDataSubset.replace({'a': None})
TeweetDataSubset=TeweetDataSubset.replace([-1],0)

# Basic ML model

In [56]:
#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(TeweetDataSubset["clean_text"],TeweetDataSubset["data_sentiment"],test_size=0.1,shuffle=True)

In [57]:
#Tf-Idf
tfidf_vectorizer = TfidfVectorizer(use_idf=True,max_features=10)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

In [58]:
X_train_vectors_tfidf

<5211x10 sparse matrix of type '<class 'numpy.float64'>'
	with 3428 stored elements in Compressed Sparse Row format>

In [59]:
y_train

96      1
2385    1
3961    0
5021    0
402     0
       ..
802     1
2658    0
5       1
1486    0
4248    0
Name: data_sentiment, Length: 5211, dtype: int64

In [60]:
lr_tfidf=MultinomialNB()
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))


              precision    recall  f1-score   support

           0       0.80      0.17      0.28       221
           1       0.66      0.97      0.78       359

    accuracy                           0.67       580
   macro avg       0.73      0.57      0.53       580
weighted avg       0.71      0.67      0.59       580

Confusion Matrix: [[ 37 184]
 [  9 350]]


# HYBRID model

In [61]:
#Adding regular expression to data
import re
from collections import Counter
Counter(" ".join(TeweetDataSubset['clean_text']).split()).most_common(100)

[('aap', 928),
 ('user', 646),
 ('short', 456),
 ('today', 343),
 ('day', 311),
 ('volume', 307),
 ('like', 278),
 ('long', 268),
 ('good', 230),
 ('stock', 227),
 ('goog', 212),
 ('watch', 208),
 ('new', 208),
 ('bac', 205),
 ('stop', 202),
 ('still', 199),
 ('nice', 196),
 ('back', 191),
 ('buy', 186),
 ('next', 183),
 ('move', 180),
 ('coronavirus', 178),
 ('market', 176),
 ('higher', 176),
 ('time', 174),
 ('see', 162),
 ('one', 157),
 ('ong', 156),
 ('week', 156),
 ('triangle', 148),
 ('trade', 143),
 ('close', 143),
 ('weekly', 137),
 ('stocks', 137),
 ('could', 134),
 ('here', 134),
 ('break', 133),
 ('looking', 131),
 ('big', 128),
 ('breakout', 127),
 ('support', 125),
 ('go', 123),
 ('sensex', 123),
 ('going', 120),
 ('i', 119),
 ('bullish', 119),
 ('nfx', 118),
 ('last', 117),
 ('looks', 115),
 ('target', 114),
 ('green', 113),
 ('points', 113),
 ('nifty', 113),
 ('highs', 112),
 ('lower', 112),
 ('price', 111),
 ('earnings', 111),
 ('markets', 111),
 ('high', 110),
 ('amzn'

In [62]:
Postive_pattern_1 = r"thanks"
Postive_pattern_2 = r"thank"
Postive_pattern_3 = r"great"
Postive_pattern_4 = r"good"
Postive_pattern_5 = r"safe"
Postive_pattern_6 = r"awesome"
Postive_pattern_7 = r"new"
Postive_pattern_8 = r"satisfied"
Postive_pattern_8 = r"high"
Postive_pattern_8 = r"fire"



Positive_Pattern_List = [Postive_pattern_1,Postive_pattern_2,Postive_pattern_3,Postive_pattern_4,
                        Postive_pattern_5,Postive_pattern_6,Postive_pattern_7,Postive_pattern_8]

In [63]:
Positive_Complex_Pattern = re.compile('|'.join(['(%s)' % i for i in Positive_Pattern_List]),re.IGNORECASE)

In [64]:
Negative_pattern_1 = r"cancelled"
Negative_pattern_2 = r"delayed"
Negative_pattern_3 = r"trying"
Negative_pattern_4 = r"please"
Negative_pattern_5 = r"wait"
Negative_pattern_6 = r"worst"
Negative_pattern_7 = r"lost"
Negative_pattern_8 = r"never"
Negative_pattern_9 = r"fraud trans"

Negative_Pattern_List = [Negative_pattern_1,Negative_pattern_2,Negative_pattern_3,Negative_pattern_4,
                        Negative_pattern_5,Negative_pattern_6,Negative_pattern_7,Negative_pattern_8]

In [65]:
Negative_Complex_Pattern = re.compile('|'.join(['(%s)' % i for i in Negative_Pattern_List]),re.IGNORECASE)

In [66]:
TeweetDataSubset["Negative_Sentiment_Flag"] = TeweetDataSubset["clean_text"].apply(lambda x:1 if(len(re.findall(Negative_Complex_Pattern,x))>0) else 0)

In [67]:
TeweetDataSubset["Positive_Sentiment_Flag"] = TeweetDataSubset["clean_text"].apply(lambda x:1 if(len(re.findall(Positive_Complex_Pattern,x))>0) else 0)

In [68]:
TeweetDataSubset.head(20)

Unnamed: 0,text,data_sentiment,clean_text,Negative_Sentiment_Flag,Positive_Sentiment_Flag
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers watchlist xide tit soq pnk cpw bpz aj ...,0,0
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user aap movie return fea geed indicator trade...,0,1
2,user I'd be afraid to short AMZN - they are lo...,1,user i afraid short amzn looking like near mon...,0,0
3,MNTA Over 12.00,1,mnta,0,0
4,OI Over 21.37,1,oi,0,0
5,PGNX Over 3.04,1,pgnx,0,0
6,AAP - user if so then the current downtrend wi...,0,aap user current downtrend break otherwise sho...,0,0
7,Monday's relative weakness. NYX WIN TIE TAP IC...,0,monday relative weakness nyx win tie tap ice i...,0,0
8,GOOG - ower trend line channel test & volume s...,1,goog ower trend line channel test volume support,0,0
9,AAP will watch tomorrow for ONG entry.,1,aap watch tomorrow ong entry,0,0


In [69]:
#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(TeweetDataSubset[["clean_text","Negative_Sentiment_Flag","Positive_Sentiment_Flag"]],TeweetDataSubset["data_sentiment"],test_size=0.1,shuffle=True)

In [70]:
#Tf-Idf
tfidf_vectorizer = TfidfVectorizer(use_idf=True,max_features = 10)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train["clean_text"])
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test["clean_text"])

In [71]:
mydf = pd.DataFrame(X_train_vectors_tfidf.toarray())
mydf_test = pd.DataFrame(X_test_vectors_tfidf.toarray())

In [72]:
flag_columns_train = X_train[["Negative_Sentiment_Flag","Positive_Sentiment_Flag"]]
flag_columns_test = X_test[["Negative_Sentiment_Flag","Positive_Sentiment_Flag"]]

In [73]:
X_train_vectors_tfidf_combined_with_flag = pd.concat([mydf.reset_index(drop=True),flag_columns_train.reset_index(drop=True)],axis=1)

In [74]:
X_test_vectors_tfidf_combined_with_flag = pd.concat([mydf_test.reset_index(drop=True),flag_columns_test.reset_index(drop=True)],axis=1)

In [75]:
X_train_vectors_tfidf_combined_with_flag.columns = X_train_vectors_tfidf_combined_with_flag.columns.astype(str)
# y_train=y_train.astype(str)

In [76]:
X_test_vectors_tfidf_combined_with_flag.columns = X_test_vectors_tfidf_combined_with_flag.columns.astype(str)

In [77]:
lr_tfidf=MultinomialNB()
lr_tfidf.fit(X_train_vectors_tfidf_combined_with_flag, y_train)

#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf_combined_with_flag)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf_combined_with_flag)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.80      0.12      0.20       205
           1       0.67      0.98      0.80       375

    accuracy                           0.68       580
   macro avg       0.74      0.55      0.50       580
weighted avg       0.72      0.68      0.59       580

Confusion Matrix: [[ 24 181]
 [  6 369]]
