In [136]:
import pandas as pd

In [137]:
se_df = pd.read_csv('Reddit_data.csv')
se_df

Unnamed: 0,upvotes,created,num_comments,Sentences
0,0,2024-09-27,3,Do you usually close a short position over the...
1,0,2024-09-27,127,"A definitive, verifiable GameStop update There..."
2,8,2024-09-27,42,Why Healthcare stocks not popular around? I am...
3,20,2024-09-27,7,East Coast port strike looms for first time si...
4,0,2024-09-27,100,Why are penny stocks a bad investment? My uncl...
...,...,...,...,...
924,13,2024-07-27,30,/r/Stocks Weekend Discussion Saturday - Jul 27...
925,10,2024-07-27,35,What are your thoughts on stocks in the bankin...
926,0,2024-07-27,4,If you think $IWM will hit at least 265 by Sep...
927,191,2024-07-27,169,Stocks that always look like they’re too expen...


In [138]:
#Sent_tokenize
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def tokenize_sentences(text):
    """Split one post's text into sentences.

    Args:
        text: The raw post text (a single string).

    Returns:
        The result of NLTK's ``sent_tokenize`` for ``text``.
    """
    return sent_tokenize(text)

se_df['sent_token'] = se_df['Sentences'].apply(tokenize_sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [139]:
#Data clean
import re
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text in an array of sentences (string)
def clean_text_array(sentences):
    """Normalise every sentence in a list.

    Each sentence is lowercased, stripped of every character that is neither a
    word character nor whitespace, and has runs of whitespace collapsed to a
    single space (leading/trailing whitespace removed).

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list with the cleaned sentences, in the same order.
    """
    def _normalise(raw):
        lowered = raw.lower()
        without_punctuation = re.sub(r'[^\w\s]', '', lowered)
        return re.sub(r'\s+', ' ', without_punctuation).strip()

    return [_normalise(item) for item in sentences]

# Apply the cleaning function to the column
se_df['cleaned_sent_token'] = se_df['sent_token'].apply(clean_text_array)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [140]:
se_df

Unnamed: 0,upvotes,created,num_comments,Sentences,sent_token,cleaned_sent_token
0,0,2024-09-27,3,Do you usually close a short position over the...,[Do you usually close a short position over th...,[do you usually close a short position over th...
1,0,2024-09-27,127,"A definitive, verifiable GameStop update There...","[A definitive, verifiable GameStop update Ther...",[a definitive verifiable gamestop update there...
2,8,2024-09-27,42,Why Healthcare stocks not popular around? I am...,"[Why Healthcare stocks not popular around?, I ...","[why healthcare stocks not popular around, i a..."
3,20,2024-09-27,7,East Coast port strike looms for first time si...,[East Coast port strike looms for first time s...,[east coast port strike looms for first time s...
4,0,2024-09-27,100,Why are penny stocks a bad investment? My uncl...,"[Why are penny stocks a bad investment?, My un...","[why are penny stocks a bad investment, my unc..."
...,...,...,...,...,...,...
924,13,2024-07-27,30,/r/Stocks Weekend Discussion Saturday - Jul 27...,[/r/Stocks Weekend Discussion Saturday - Jul 2...,[rstocks weekend discussion saturday jul 27 20...
925,10,2024-07-27,35,What are your thoughts on stocks in the bankin...,[What are your thoughts on stocks in the banki...,[what are your thoughts on stocks in the banki...
926,0,2024-07-27,4,If you think $IWM will hit at least 265 by Sep...,[If you think $IWM will hit at least 265 by Se...,[if you think iwm will hit at least 265 by sep...
927,191,2024-07-27,169,Stocks that always look like they’re too expen...,[Stocks that always look like they’re too expe...,[stocks that always look like theyre too expen...


In [141]:
se_df['cleaned_sent_token'][2]

['why healthcare stocks not popular around',
 'i am new to stock market subs',
 'what caught my attention is healthcare stocks are rarely discussed',
 'i am surprised because unlike other industries healthcare companies announce product sales and regional sales numbers healthcare companies clearly state if there is competition or not easier to forecast because both patent expiry date and pipeline are public information what i like about healthcare market at large healthcare is one of three defensive industries according to morningstar',
 'others are utilities and consumer defensive',
 'utilities almost always underperform the market',
 'for consumer defensive there are huge barriers to entry',
 'bluechips in beverages or tobacco are hard to challenge',
 'in bear market i am going to buy utilities and consumer defensive otherwise i chase healthcare for defensive stocks and stability in my portfolio',
 'the reason i go for healthcare stocks is that they possess protected downside with up

## Sentence Embedding

In [142]:
!pip install sentence_transformers



In [143]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer

# Load a pre-trained SentenceTransformer model under a dedicated name.
# The name `model` is rebound to Keras networks further down the notebook, so
# embed_sentences must not look the encoder up through `model`; otherwise
# calling it again after those cells would invoke .encode on the wrong object.
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')
model = sentence_encoder  # backward-compatible alias for the original name

def embed_sentences(sentences):
    """Encode the sentences of one post with the pre-trained sentence encoder.

    Args:
        sentences: List of (cleaned) sentence strings belonging to one post.

    Returns:
        The value returned by ``sentence_encoder.encode(sentences)``; in this
        notebook it is displayed as a 2-D array with one row per sentence.
    """
    return sentence_encoder.encode(sentences)


se_df['sentence_embeddings'] = se_df['cleaned_sent_token'].apply(embed_sentences)




In [144]:
se_df

Unnamed: 0,upvotes,created,num_comments,Sentences,sent_token,cleaned_sent_token,sentence_embeddings
0,0,2024-09-27,3,Do you usually close a short position over the...,[Do you usually close a short position over th...,[do you usually close a short position over th...,"[[0.043688126, 2.4734585e-05, 0.09048283, 0.01..."
1,0,2024-09-27,127,"A definitive, verifiable GameStop update There...","[A definitive, verifiable GameStop update Ther...",[a definitive verifiable gamestop update there...,"[[-0.06314109, -0.018172346, 0.046303708, -0.0..."
2,8,2024-09-27,42,Why Healthcare stocks not popular around? I am...,"[Why Healthcare stocks not popular around?, I ...","[why healthcare stocks not popular around, i a...","[[0.060288306, -0.021899898, 0.016439298, -0.0..."
3,20,2024-09-27,7,East Coast port strike looms for first time si...,[East Coast port strike looms for first time s...,[east coast port strike looms for first time s...,"[[0.0077211815, -0.00085839053, 0.08215974, -0..."
4,0,2024-09-27,100,Why are penny stocks a bad investment? My uncl...,"[Why are penny stocks a bad investment?, My un...","[why are penny stocks a bad investment, my unc...","[[0.0350865, -0.024686983, 0.022571307, -0.020..."
...,...,...,...,...,...,...,...
924,13,2024-07-27,30,/r/Stocks Weekend Discussion Saturday - Jul 27...,[/r/Stocks Weekend Discussion Saturday - Jul 2...,[rstocks weekend discussion saturday jul 27 20...,"[[-0.109767966, -0.070416145, 0.0069953627, 0...."
925,10,2024-07-27,35,What are your thoughts on stocks in the bankin...,[What are your thoughts on stocks in the banki...,[what are your thoughts on stocks in the banki...,"[[0.036067564, -0.0797995, -0.09842431, -0.015..."
926,0,2024-07-27,4,If you think $IWM will hit at least 265 by Sep...,[If you think $IWM will hit at least 265 by Se...,[if you think iwm will hit at least 265 by sep...,"[[-0.06439496, -0.041993573, -0.10083641, -0.0..."
927,191,2024-07-27,169,Stocks that always look like they’re too expen...,[Stocks that always look like they’re too expe...,[stocks that always look like theyre too expen...,"[[-0.0014376511, -0.0047537945, -0.014052534, ..."


In [145]:
se_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   upvotes              929 non-null    int64 
 1   created              929 non-null    object
 2   num_comments         929 non-null    int64 
 3   Sentences            929 non-null    object
 4   sent_token           929 non-null    object
 5   cleaned_sent_token   929 non-null    object
 6   sentence_embeddings  929 non-null    object
dtypes: int64(2), object(5)
memory usage: 50.9+ KB


In [146]:
# Cast every per-post embedding matrix to Python float precision (NumPy float64).
# The column itself still reports dtype `object` afterwards, because each cell
# holds a whole ndarray rather than a scalar.
se_df['sentence_embeddings'] = se_df['sentence_embeddings'].apply(lambda x: x.astype(float))

In [147]:
se_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   upvotes              929 non-null    int64 
 1   created              929 non-null    object
 2   num_comments         929 non-null    int64 
 3   Sentences            929 non-null    object
 4   sent_token           929 non-null    object
 5   cleaned_sent_token   929 non-null    object
 6   sentence_embeddings  929 non-null    object
dtypes: int64(2), object(5)
memory usage: 50.9+ KB


In [148]:
se_df['sentence_embeddings'][0]

array([[ 4.36881259e-02,  2.47345852e-05,  9.04828310e-02, ...,
        -7.76946843e-02, -1.49700999e-01,  9.11403634e-03],
       [ 5.76887131e-02,  5.33117466e-02, -1.28759677e-02, ...,
        -1.48387626e-01, -2.87377518e-02,  1.63158681e-02],
       [ 1.35886222e-02,  1.53132156e-02, -2.98344959e-02, ...,
        -4.74364944e-02,  2.84268856e-02,  3.07768621e-02],
       [ 2.17443355e-03,  1.92218721e-02,  8.70793965e-03, ...,
        -1.55109197e-01, -4.46226336e-02,  5.06980866e-02]])

In [149]:
se_df['sentence_embeddings'].dtype

dtype('O')

In [150]:
# Drop the intermediate token columns now that the embeddings are computed.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
se_df = se_df.drop(columns=['sent_token', 'cleaned_sent_token'], errors='ignore')

In [151]:
se_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   upvotes              929 non-null    int64 
 1   created              929 non-null    object
 2   num_comments         929 non-null    int64 
 3   Sentences            929 non-null    object
 4   sentence_embeddings  929 non-null    object
dtypes: int64(2), object(3)
memory usage: 36.4+ KB


In [152]:
se_df

Unnamed: 0,upvotes,created,num_comments,Sentences,sentence_embeddings
0,0,2024-09-27,3,Do you usually close a short position over the...,"[[0.043688125908374786, 2.4734585167607293e-05..."
1,0,2024-09-27,127,"A definitive, verifiable GameStop update There...","[[-0.06314109265804291, -0.01817234605550766, ..."
2,8,2024-09-27,42,Why Healthcare stocks not popular around? I am...,"[[0.06028830632567406, -0.02189989760518074, 0..."
3,20,2024-09-27,7,East Coast port strike looms for first time si...,"[[0.007721181493252516, -0.0008583905291743577..."
4,0,2024-09-27,100,Why are penny stocks a bad investment? My uncl...,"[[0.0350865013897419, -0.024686982855200768, 0..."
...,...,...,...,...,...
924,13,2024-07-27,30,/r/Stocks Weekend Discussion Saturday - Jul 27...,"[[-0.10976796597242355, -0.07041614502668381, ..."
925,10,2024-07-27,35,What are your thoughts on stocks in the bankin...,"[[0.03606756404042244, -0.07979950308799744, -..."
926,0,2024-07-27,4,If you think $IWM will hit at least 265 by Sep...,"[[-0.06439495831727982, -0.04199357330799103, ..."
927,191,2024-07-27,169,Stocks that always look like they’re too expen...,"[[-0.001437651109881699, -0.004753794521093368..."


In [153]:
import numpy as np
def average_matrices(matrices):
    """Average a stack of row vectors into a single vector.

    Args:
        matrices: Array-like whose first axis indexes the items to average
            (here: one row per sentence embedding).

    Returns:
        The mean taken along axis 0, i.e. one value per column.
    """
    column_means = np.mean(matrices, axis=0)
    return column_means


se_df['average_embeddings'] = se_df['sentence_embeddings'].apply(average_matrices)

# Display the DataFrame with averages
se_df


Unnamed: 0,upvotes,created,num_comments,Sentences,sentence_embeddings,average_embeddings
0,0,2024-09-27,3,Do you usually close a short position over the...,"[[0.043688125908374786, 2.4734585167607293e-05...","[0.02928497368702665, 0.021967892199427297, 0...."
1,0,2024-09-27,127,"A definitive, verifiable GameStop update There...","[[-0.06314109265804291, -0.01817234605550766, ...","[0.009158641700783083, -0.004708407214146122, ..."
2,8,2024-09-27,42,Why Healthcare stocks not popular around? I am...,"[[0.06028830632567406, -0.02189989760518074, 0...","[0.004602587292902171, -0.014814755879342556, ..."
3,20,2024-09-27,7,East Coast port strike looms for first time si...,"[[0.007721181493252516, -0.0008583905291743577...","[0.01642700267257169, 0.023070555289450567, 0...."
4,0,2024-09-27,100,Why are penny stocks a bad investment? My uncl...,"[[0.0350865013897419, -0.024686982855200768, 0...","[0.015566886713107428, -0.0291584354514877, -0..."
...,...,...,...,...,...,...
924,13,2024-07-27,30,/r/Stocks Weekend Discussion Saturday - Jul 27...,"[[-0.10976796597242355, -0.07041614502668381, ...","[-0.06066796214630207, -0.04605008537570635, -..."
925,10,2024-07-27,35,What are your thoughts on stocks in the bankin...,"[[0.03606756404042244, -0.07979950308799744, -...","[0.013247826447089514, -0.06604315464695294, -..."
926,0,2024-07-27,4,If you think $IWM will hit at least 265 by Sep...,"[[-0.06439495831727982, -0.04199357330799103, ...","[-0.05750201394160589, -0.00573419996847709, -..."
927,191,2024-07-27,169,Stocks that always look like they’re too expen...,"[[-0.001437651109881699, -0.004753794521093368...","[0.01370791473891586, -0.06279189959168434, -0..."


In [154]:
se_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   upvotes              929 non-null    int64 
 1   created              929 non-null    object
 2   num_comments         929 non-null    int64 
 3   Sentences            929 non-null    object
 4   sentence_embeddings  929 non-null    object
 5   average_embeddings   929 non-null    object
dtypes: int64(2), object(4)
memory usage: 43.7+ KB


In [155]:
# Intended to turn the averaged embeddings into NumPy form.
# NOTE(review): this looks like a no-op. `Series.to_numpy()` on this column gives
# a 1-D object array holding the same per-post vectors, and assigning it back
# stores those same objects in the column (the frame displayed afterwards is
# unchanged). If a 2-D float matrix is wanted for modelling, build it in a
# separate variable, e.g. with np.stack(...) — TODO confirm intent.
se_df['average_embeddings'] = se_df['average_embeddings'].to_numpy()

In [156]:
se_df

Unnamed: 0,upvotes,created,num_comments,Sentences,sentence_embeddings,average_embeddings
0,0,2024-09-27,3,Do you usually close a short position over the...,"[[0.043688125908374786, 2.4734585167607293e-05...","[0.02928497368702665, 0.021967892199427297, 0...."
1,0,2024-09-27,127,"A definitive, verifiable GameStop update There...","[[-0.06314109265804291, -0.01817234605550766, ...","[0.009158641700783083, -0.004708407214146122, ..."
2,8,2024-09-27,42,Why Healthcare stocks not popular around? I am...,"[[0.06028830632567406, -0.02189989760518074, 0...","[0.004602587292902171, -0.014814755879342556, ..."
3,20,2024-09-27,7,East Coast port strike looms for first time si...,"[[0.007721181493252516, -0.0008583905291743577...","[0.01642700267257169, 0.023070555289450567, 0...."
4,0,2024-09-27,100,Why are penny stocks a bad investment? My uncl...,"[[0.0350865013897419, -0.024686982855200768, 0...","[0.015566886713107428, -0.0291584354514877, -0..."
...,...,...,...,...,...,...
924,13,2024-07-27,30,/r/Stocks Weekend Discussion Saturday - Jul 27...,"[[-0.10976796597242355, -0.07041614502668381, ...","[-0.06066796214630207, -0.04605008537570635, -..."
925,10,2024-07-27,35,What are your thoughts on stocks in the bankin...,"[[0.03606756404042244, -0.07979950308799744, -...","[0.013247826447089514, -0.06604315464695294, -..."
926,0,2024-07-27,4,If you think $IWM will hit at least 265 by Sep...,"[[-0.06439495831727982, -0.04199357330799103, ...","[-0.05750201394160589, -0.00573419996847709, -..."
927,191,2024-07-27,169,Stocks that always look like they’re too expen...,"[[-0.001437651109881699, -0.004753794521093368...","[0.01370791473891586, -0.06279189959168434, -0..."


In [157]:
stock_df = pd.read_csv('Stock_Movement.csv')
stock_df

Unnamed: 0,Date,Movement
0,2023-09-01,0
1,2023-09-05,0
2,2023-09-06,0
3,2023-09-07,1
4,2023-09-08,1
...,...,...
264,2024-09-20,1
265,2024-09-23,1
266,2024-09-24,1
267,2024-09-25,1


In [158]:
from datetime import datetime
se_df['created'] = pd.to_datetime(se_df['created'])
stock_df['Date'] = pd.to_datetime(stock_df['Date'])

In [159]:
# Attach the movement label recorded for each post's calendar date.
# how='inner' keeps only posts whose `created` date has a matching `Date` row in
# stock_df; posts on dates absent from the stock table are dropped.
# NOTE(review): every post gets the movement of its own date — confirm that this
# same-day pairing (rather than the following trading day) is the intended target.
final_df = pd.merge(se_df, stock_df, left_on='created', right_on='Date', how='inner')

In [160]:
final_df

Unnamed: 0,upvotes,created,num_comments,Sentences,sentence_embeddings,average_embeddings,Date,Movement
0,370,2024-09-26,166,Costco Wholesale misses quarterly revenue esti...,"[[0.029619095847010612, -0.011572643183171749,...","[0.01879738090792671, -0.02343250112608075, -0...",2024-09-26,0
1,11,2024-09-26,14,"How do dividend stocks pay? Hi all, so I'm lo...","[[-0.015942120924592018, -0.016357500106096268...","[-0.022489369134692586, -0.023905693964265725,...",2024-09-26,0
2,3,2024-09-26,6,What happens to my shares after a merger (CHK)...,"[[-0.02195814624428749, -0.0158707145601511, 0...","[-0.014684620406478643, -0.0044510758831165734...",2024-09-26,0
3,0,2024-09-26,14,Buying stocks solely on the basis that they co...,"[[-0.025980478152632713, -0.031116964295506477...","[-0.0588269243016839, -0.06123557919636369, 0....",2024-09-26,0
4,703,2024-09-26,279,Investing in Costco today is actually betting ...,"[[-0.04935167357325554, 0.007634250912815332, ...","[0.003422296441082532, -0.022397756469824042, ...",2024-09-26,0
...,...,...,...,...,...,...,...,...
736,10,2024-07-29,17,Electric grid infrastructure Anyone doing rese...,"[[-0.06172319874167442, 0.0020807073451578617,...","[-0.03779914956539869, -0.00716650215908885, 0...",2024-07-29,1
737,116,2024-07-29,92,Apple to roll out artificial intelligence feat...,"[[0.04105012118816376, -0.05603589490056038, 0...","[-0.018732450623065233, -0.035893273819237945,...",2024-07-29,1
738,24,2024-07-29,24,These are the stocks on my watchlist (7/29) Hi...,"[[-0.06807991862297058, -0.03393291309475899, ...","[-0.030035045319338045, -0.018846726004520187,...",2024-07-29,1
739,5393,2024-07-29,1322,"McDonald's earnings, revenue miss estimates as...","[[-0.008455215021967888, 0.010123365558683872,...","[-0.0005236835309511258, -0.025956832519215014...",2024-07-29,1


In [161]:
X = final_df['average_embeddings']
y = final_df['Movement']

In [162]:
# Positional 80/20 hold-out: the first 80% of rows train, the remainder test.
# Nothing is shuffled, so the split follows final_df's existing row order.
train_size = int(len(X) * 0.8)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

In [163]:
#building a CNN model
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed Python, NumPy and TensorFlow so the weight initialisation is repeatable
# when the notebook is restarted and run from the top.
tf.keras.utils.set_random_seed(42)

# Each sample is one 384-value averaged sentence embedding, presented to Conv1D
# as a length-384 sequence with a single channel.
input_shape = (384, 1)

# NOTE: this rebinds the notebook-level name `model`, which earlier referred to
# the SentenceTransformer encoder.
# The input shape is declared with an explicit Input layer: passing input_shape=
# to the first Conv1D makes Keras emit a UserWarning in Sequential models.
model = models.Sequential([
    layers.Input(shape=input_shape),
    # Conv1D with 32 filters, kernel size 3, ReLU; then downsample by 2
    layers.Conv1D(32, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    # Second convolution stage with 64 filters; then downsample by 2
    layers.Conv1D(64, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    # Flatten the feature maps to feed the fully connected head
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    # Sigmoid output for binary classification (1 or 0)
    layers.Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model architecture
model.summary()

# Training happens in the next cell; y_train holds the corresponding 1 or 0 labels.



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [164]:
import tensorflow as tf
# Convert X_train elements to tensors: each per-post vector becomes a tensor and
# the list of them is converted into a single tensor that replaces X_train.
X_train = tf.convert_to_tensor([tf.convert_to_tensor(arr) for arr in X_train])
# Fit the model for 10 epochs; validation_split=0.2 sets aside 20% of the
# training rows for the val_* metrics shown in the log.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 396ms/step - accuracy: 0.7376 - loss: 0.6438 - val_accuracy: 0.7731 - val_loss: 0.5438
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7213 - loss: 0.5955 - val_accuracy: 0.7731 - val_loss: 0.5388
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7490 - loss: 0.5619 - val_accuracy: 0.7731 - val_loss: 0.5586
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7011 - loss: 0.6077 - val_accuracy: 0.7731 - val_loss: 0.5376
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7271 - loss: 0.5907 - val_accuracy: 0.7731 - val_loss: 0.5591
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7177 - loss: 0.5885 - val_accuracy: 0.7731 - val_loss: 0.5394
Epoch 7/10
[1m15/15[0m [32m━━━

<keras.src.callbacks.history.History at 0x7e468dfe5690>

In [165]:
# Train accuracy.
# This evaluates on all of X_train, i.e. including the rows that fit() set aside
# through validation_split, so it is not purely an "in-sample" figure.

loss,accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.7386 - loss: 0.5372
Train Accuracy: 72.97%


In [166]:
import tensorflow as tf

# Convert X_test elements to tensors, mirroring the conversion applied to X_train
X_test = tf.convert_to_tensor([tf.convert_to_tensor(arr) for arr in X_test])

# Loss and accuracy on the held-out rows (the positional tail of final_df)
loss, accuracy = model.evaluate(X_test, y_test)

print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.7328 - loss: 0.5944
Test Accuracy: 66.44%


## CNN with Sentiment Analysis

In [167]:
!pip install vaderSentiment



In [168]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text, stop_word_set=None):
    """Normalise a post for sentiment scoring.

    Steps, in order: lowercase, remove http/https links, drop every character
    that is not an ASCII letter or whitespace, split on whitespace, remove stop
    words, and join the remaining words with single spaces.

    Links are removed *before* punctuation is stripped. Stripping first deletes
    the "://" the link pattern depends on, so URLs would survive as fused
    letter runs.

    Args:
        text: Raw text of one post.
        stop_word_set: Optional collection of lowercase words to remove.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    # Lowercase first so the link pattern also catches "HTTP://..." variants
    text = text.lower()

    # Remove links while their punctuation is still intact
    text = re.sub(r'http[s]?://\S+', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenise on whitespace and drop stop words
    words = [word for word in text.split() if word not in stop_word_set]

    # Join words back into a single string
    return ' '.join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [169]:
final_df['Cleaned_sentences'] = final_df['Sentences'].apply(clean_text)

In [170]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def get_compound_score(sentence):
    """Return the compound polarity score of ``sentence``.

    Uses the notebook-level ``analyzer`` and returns the ``'compound'`` entry
    of the dictionary produced by ``analyzer.polarity_scores``.
    """
    scores = analyzer.polarity_scores(sentence)
    return scores['compound']

# Score each post's cleaned text with the compound polarity helper.
# NOTE(review): `Cleaned_sentences` has already been lowercased and stripped of
# punctuation, digits and stop words by clean_text. VADER is documented as using
# capitalisation, punctuation and negation words as cues, so scoring the raw
# `Sentences` column may be more faithful — TODO confirm.
final_df['Sentiment'] = final_df['Cleaned_sentences'].apply(get_compound_score)

In [171]:
final_df

Unnamed: 0,upvotes,created,num_comments,Sentences,sentence_embeddings,average_embeddings,Date,Movement,Cleaned_sentences,Sentiment
0,370,2024-09-26,166,Costco Wholesale misses quarterly revenue esti...,"[[0.029619095847010612, -0.011572643183171749,...","[0.01879738090792671, -0.02343250112608075, -0...",2024-09-26,0,costco wholesale misses quarterly revenue esti...,-0.6369
1,11,2024-09-26,14,"How do dividend stocks pay? Hi all, so I'm lo...","[[-0.015942120924592018, -0.016357500106096268...","[-0.022489369134692586, -0.023905693964265725,...",2024-09-26,0,dividend stocks pay hi im looking move cash pa...,0.8442
2,3,2024-09-26,6,What happens to my shares after a merger (CHK)...,"[[-0.02195814624428749, -0.0158707145601511, 0...","[-0.014684620406478643, -0.0044510758831165734...",2024-09-26,0,happens shares merger chk hello relatively new...,0.9572
3,0,2024-09-26,14,Buying stocks solely on the basis that they co...,"[[-0.025980478152632713, -0.031116964295506477...","[-0.0588269243016839, -0.06123557919636369, 0....",2024-09-26,0,buying stocks solely basis could split stock w...,0.8126
4,703,2024-09-26,279,Investing in Costco today is actually betting ...,"[[-0.04935167357325554, 0.007634250912815332, ...","[0.003422296441082532, -0.022397756469824042, ...",2024-09-26,0,investing costco today actually betting costco...,0.9945
...,...,...,...,...,...,...,...,...,...,...
736,10,2024-07-29,17,Electric grid infrastructure Anyone doing rese...,"[[-0.06172319874167442, 0.0020807073451578617,...","[-0.03779914956539869, -0.00716650215908885, 0...",2024-07-29,1,electric grid infrastructure anyone research c...,0.7479
737,116,2024-07-29,92,Apple to roll out artificial intelligence feat...,"[[0.04105012118816376, -0.05603589490056038, 0...","[-0.018732450623065233, -0.035893273819237945,...",2024-07-29,1,apple roll artificial intelligence features oc...,0.9803
738,24,2024-07-29,24,These are the stocks on my watchlist (7/29) Hi...,"[[-0.06807991862297058, -0.03393291309475899, ...","[-0.030035045319338045, -0.018846726004520187,...",2024-07-29,1,stocks watchlist hi exprop shop equity trader ...,-0.7184
739,5393,2024-07-29,1322,"McDonald's earnings, revenue miss estimates as...","[[-0.008455215021967888, 0.010123365558683872,...","[-0.0005236835309511258, -0.025956832519215014...",2024-07-29,1,mcdonalds earnings revenue miss estimates cons...,0.7269


In [172]:
final_df['Sentiment'].min()

-0.9944

In [173]:
final_df['Sentiment'].max()

0.9998

In [174]:
X = final_df[['average_embeddings', 'Sentiment']]
y = final_df['Movement']

In [175]:
# Train/test split by position: leading 80% of the rows for training, trailing
# 20% for testing, in the frame's existing order (no shuffling).
train_size = int(len(X) * 0.8)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

In [176]:
#Building a CNN model
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, Input

# First pass at splitting the feature frame into the two model inputs
# (embedding vectors and sentiment score) plus the label arrays.
# NOTE(review): X1_*, X2_* and y_* are all assigned again in the later padding
# cell before the model is fit, so the values produced here are superseded.
# np.array on a column of per-row arrays presumably yields a 1-D object array
# rather than a 2-D matrix — verify.
X1_train = np.array(X_train['average_embeddings'])
X2_train = np.array(X_train['Sentiment'])
y_train = np.array(y_train)

X1_test = np.array(X_test['average_embeddings'])
X2_test = np.array(X_test['Sentiment'])
y_test = np.array(y_test)

In [177]:
input_embedding = Input(shape=(384,), name='embedding_input')
input_sentiment = Input(shape=(1,), name='sentiment_input')

In [178]:
# Seed Python, NumPy and TensorFlow before any weighted layer is created so the
# initial weights are repeatable when the notebook is restarted and run again.
tf.keras.utils.set_random_seed(42)

# CNN part for average_embeddings
embedding_reshaped = layers.Reshape((384, 1))(input_embedding)  # Reshape to (384, 1) to apply Conv1D
conv1 = layers.Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_reshaped)
pool1 = layers.MaxPooling1D(pool_size=2)(conv1)
conv2 = layers.Conv1D(filters=64, kernel_size=3, activation='relu')(pool1)
pool2 = layers.MaxPooling1D(pool_size=2)(conv2)
flatten_embedding = layers.Flatten()(pool2)

# Dense layer for Sentiment input (a single scalar per post)
dense_sentiment = layers.Dense(16, activation='relu')(input_sentiment)

# Concatenate CNN output and sentiment feature
concat = layers.Concatenate()([flatten_embedding, dense_sentiment])

# Add more dense layers after concatenation
dense1 = layers.Dense(64, activation='relu')(concat)
output = layers.Dense(1, activation='sigmoid')(dense1)  # Output layer for binary classification

# Build and compile the model.
# Inputs are ordered [embedding, sentiment]; fit/evaluate must pass data in the
# same order. This rebinds the notebook-level name `model` once more.
model = models.Model(inputs=[input_embedding, input_sentiment], outputs=output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [179]:
#Building a CNN model
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, Input

# Pad the sequences in 'average_embeddings' to a uniform length
def pad_sequences(sequences, maxlen=None):
  """Pads (and, if needed, truncates) sequences to the same length.

  Shorter sequences are right-padded with zeros. Sequences longer than
  ``maxlen`` keep only their first ``maxlen`` values; previously this case
  raised a shape-mismatch error.

  Args:
    sequences: An iterable of sequences (e.g., lists or arrays).
    maxlen: The length of every output row. If None, uses the maximum
            length in the sequences (0 when there are no sequences).

  Returns:
    A float32 NumPy array of shape (number of sequences, maxlen).
  """
  # Materialise once so any iterable can be measured and then iterated again
  sequences = list(sequences)
  if maxlen is None:
    # default=0 keeps an empty input from raising in max()
    maxlen = max((len(seq) for seq in sequences), default=0)
  padded_sequences = np.zeros((len(sequences), maxlen), dtype=np.float32)
  for i, seq in enumerate(sequences):
    kept = min(len(seq), maxlen)
    padded_sequences[i, :kept] = seq[:kept]
  return padded_sequences

# Build the final float32 inputs for the two-input model. pad_sequences turns the
# list of per-post vectors into one 2-D array, zero-padding up to the longest
# vector in the list; the sentiment scores and labels become 1-D float32 arrays.
# NOTE(review): train and test are padded independently, each to its own maximum
# length — this only lines up with the model's 384-wide input if every vector
# already has length 384; verify.
X1_train = pad_sequences(X_train['average_embeddings'].to_list()) # Pad sequences and convert to float32
X2_train = np.array(X_train['Sentiment']).astype(np.float32)  # Convert to float32
y_train = np.array(y_train).astype(np.float32) # Convert to float32


X1_test = pad_sequences(X_test['average_embeddings'].to_list()) # Pad sequences and convert to float32
X2_test = np.array(X_test['Sentiment']).astype(np.float32) # Convert to float32
y_test = np.array(y_test).astype(np.float32) # Convert to float32

In [180]:
# Train the two-input model. The input list follows the order the model was
# built with (embeddings first, sentiment second); validation_split=0.2 sets
# aside 20% of the training rows for the val_* metrics.
model.fit([X1_train, X2_train], y_train, epochs=15, batch_size=32, validation_split=0.2)

Epoch 1/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 134ms/step - accuracy: 0.6168 - loss: 0.6475 - val_accuracy: 0.7731 - val_loss: 0.5430
Epoch 2/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7408 - loss: 0.5743 - val_accuracy: 0.7731 - val_loss: 0.5436
Epoch 3/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6931 - loss: 0.6197 - val_accuracy: 0.7731 - val_loss: 0.5353
Epoch 4/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7109 - loss: 0.6072 - val_accuracy: 0.7731 - val_loss: 0.5472
Epoch 5/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7111 - loss: 0.5999 - val_accuracy: 0.7731 - val_loss: 0.5372
Epoch 6/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7392 - loss: 0.5678 - val_accuracy: 0.7731 - val_loss: 0.5554
Epoch 7/15
[1m15/15[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7e467219d720>

In [181]:
# Accuracy on the training arrays. These include the rows fit() set aside via
# validation_split, so the figure mixes fitted and held-out rows.
loss,accuracy = model.evaluate([X1_train, X2_train], y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.7847 - loss: 0.5085
Train Accuracy: 76.18%


In [182]:
# Accuracy of the two-input model on the held-out test arrays
# (embeddings and sentiment passed in the same order as for training).
loss,accuracy = model.evaluate([X1_test, X2_test], y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 125ms/step - accuracy: 0.7214 - loss: 0.6291
Test Accuracy: 65.77%
