In [1]:
import pandas as pd 
import nltk
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import matplotlib.pyplot as plt 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Embedding

from tensorflow.keras.models import Model

In [2]:
df_aapl = pd.read_csv("stock_datasets/df_AAPL.csv")
# df_googl = pd.read_csv("stock_datasets/df_stocktwits_googl.csv")
# df_ma = pd.read_csv("stock_datasets/df_stocktwits_ma.csv")
# df_amzn = pd.read_csv("stock_datasets/df_stocktwits_amzn.csv")
# df_jnj = pd.read_csv("stock_datasets/df_stocktwits_jnj.csv")

def clean_columns(df):
    
     # remove extra column 
    df.drop('Unnamed: 0', axis=1, inplace = True)  
    df['Text_Cleaned'].replace("[^a-zA-Z]"," ", regex=True, inplace=True) 
    df['Text_Cleaned'].replace("[\d.]", "", regex=True, inplace=True)
    df['Text_Cleaned'] = df['Text_Cleaned'].str.strip()
    return df
# df_list = [df_aapl]#, df_googl,df_ma, df_amzn, df_jnj]

# for df in df_list: 
#     df= clean_columns(df)

df_aapl = clean_columns(df_aapl)

In [3]:
df_aapl

Unnamed: 0,date_time,tweet,Text_Cleaned
0,2011-11-15,RT howardlindzon: Looks like Goldman $gs is t...,rt howardlindzon looks like goldman gs is tryi...
1,2011-11-15,$AAPL http://stks.co/17zl (Weekly Chart) Appro...,aapl
2,2011-11-15,$AAPL down -8.26% this morning? That is a real...,aapl down negative percent this morning t...
3,2011-11-15,RT Zguy: $AAPL down -8.26% this morning? That ...,rt zguy aapl down negative percent this m...
4,2011-11-15,NEW POST: FROZEN TURKEYS http://stks.co/181s $...,new post frozen turkeys
...,...,...,...
190970,2023-01-04,"Dan Niles: In summary, my 2 overarching invest...",dan niles in summary my overarching investme...
190971,2023-01-04,$AAPL,aapl
190972,2023-01-04,$AAPL $MSFT $GOOGL $AMZN\nI will buy more and ...,aapl msft googl amzn i will buy more and sleep...
190973,2023-01-04,"$AAPL bye apple, hello meta",aapl bye apple hello meta


In [4]:
# preparing vader sentiment analyser 
vader = SentimentIntensityAnalyzer()

f = lambda title: vader.polarity_scores(title)['compound']
df_aapl['compound'] = df_aapl['tweet'].apply(f)
#df_aapl['date_time'] = pd.to_datetime(df_aapl.date_time).dt.date



In [5]:
# plt.figure(figsize=(10,10))
# mean_df = df_aapl.groupby([ 'date_time']).mean().unstack()
# mean_df = mean_df.xs('compound')
# mean_df.plot(kind='bar')
# plt.show()

In [6]:
df = pd.DataFrame({"sentiment": df_aapl['compound'], 
                   "data": df_aapl['Text_Cleaned']})
df['data'] = df['data'].astype(str)
df['data'] = df['data'].str.strip()

df_train, df_test, y_train, y_test = train_test_split(df['data'],df['sentiment'],test_size=0.33,random_state=42)

print("df train shape : ", df_train.shape)
print("df test shape : ", df_test.shape)
print("y train shape : ", y_train.shape)
print("y test shape : ", y_test.shape)



df train shape :  (127953,)
df test shape :  (63022,)
y train shape :  (127953,)
y test shape :  (63022,)


In [7]:
# # Building deep learning model 

# import spacy
# nlp = spacy.load("stock_datasets/df_AAPL.csv") # if this fails then run "python -m spacy download en_core_web_lg" to download that model


max_words = 1000
tokenizer=Tokenizer(max_words)
tokenizer.fit_on_texts(df_train)
sequence_train=tokenizer.texts_to_sequences(df_train)
sequence_test=tokenizer.texts_to_sequences(df_test)


In [8]:
word2vec = tokenizer.word_index
V = len(word2vec)
print('dataset has %s number of independent tokens '%V)

data_train = pad_sequences(sequence_train)
data_train.shape

dataset has 38806 number of independent tokens 


(127953, 309)

In [9]:
T = data_train.shape[1]
data_test = pad_sequences(sequence_test, maxlen=T)
data_test.shape

(63022, 309)

In [10]:
D = 20 
i=Input((T,))
x=Embedding(V+1,D)(i)
x=Conv1D(32,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(64,3,activation='relu')(x)
x=MaxPooling1D(3)(x)
x=Conv1D(128,3,activation='relu')(x)
x=GlobalMaxPooling1D()(x)
x=Dense(5,activation='softmax')(x)
model=Model(i,x)
model.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 309)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 309, 20)           776140    
_________________________________________________________________
conv1d (Conv1D)              (None, 307, 32)           1952      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 102, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           6208      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 33, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 31, 128)           24704 

In [None]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])


data_train = np.asarray(data_train)
y_train = np.asarray(y_train)
data_test = np.asarray(data_test)
y_test = np.asarray(y_test)
cnn_senti=model.fit(data_train,y_train,validation_data=(data_test,y_test),epochs=50,batch_size=100)



Train on 127953 samples, validate on 63022 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50

In [None]:
y_pred=model.predict(data_test)
y_pred


In [None]:
y_pred=np.argmax(y_pred,axis=1)
#y_pred.astype(float)
y_pred[0]

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
import seaborn as sns

In [None]:
y_test

In [None]:
cm=confusion_matrix(y_test,y_pred)
ax=sns.heatmap(cm,annot=True,cmap='Blues',fmt=' ')
ax.set_title('Confusion Matrix')
ax.set_xlabel('y_test')
ax.set_ylabel('y_pred')

In [None]:
print(classification_report(y_test,y_pred))