In [24]:
import re
import pandas as pd

# Function to load and preprocess data from a file
def load_and_preprocess(file_path, encoding):
    data = []
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            # Splitting the sentence and the sentiment label
            sentence, sentiment = line.rsplit('@', 1)
            # Normalizing text: converting to lowercase and removing special characters
            sentence = re.sub(r'[^a-zA-Z0-9\s]', '', sentence.lower())
            data.append({'sentence': sentence, 'sentiment': sentiment.strip()})
    return pd.DataFrame(data)


file_path_50 = 'Sentences_50Agree.txt'
#file_path_all = 'Sentences_AllAgree.txt'

encoding_50 = 'ISO-8859-1'
#encoding_all = 'ISO-8859-1'

# Loading and preprocessing data from both files
data_50 = load_and_preprocess(file_path_50, encoding_50)
#data_all = load_and_preprocess(file_path_all, encoding_all)


print(data_50.head())
#print(data_all.head())

                                            sentence sentiment
0  according to gran  the company has no plans to...   neutral
1  technopolis plans to develop in stages an area...   neutral
2  the international electronic industry company ...  negative
3  with the new production plant the company woul...  positive
4  according to the company s updated strategy fo...  positive


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the sentences from data_66
tfidf_matrix = vectorizer.fit_transform(data_50['sentence'])

# Creating a DataFrame for the TF-IDF features
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Adding sentiment information back to the TF-IDF DataFrame
tfidf_df['sentiment'] = data_50['sentiment']

# Displaying the first few rows of the TF-IDF DataFrame
print(tfidf_df.head())

    00  000  000063  0008  001  002  0025  003  0030  004  ...  zinc  \
0  0.0  0.0     0.0   0.0  0.0  0.0   0.0  0.0   0.0  0.0  ...   0.0   
1  0.0  0.0     0.0   0.0  0.0  0.0   0.0  0.0   0.0  0.0  ...   0.0   
2  0.0  0.0     0.0   0.0  0.0  0.0   0.0  0.0   0.0  0.0  ...   0.0   
3  0.0  0.0     0.0   0.0  0.0  0.0   0.0  0.0   0.0  0.0  ...   0.0   
4  0.0  0.0     0.0   0.0  0.0  0.0   0.0  0.0   0.0  0.0  ...   0.0   

   zinclead  zip  zloty  zoltan  zone  zoo  zte   zu  sentiment  
0       0.0  0.0    0.0     0.0   0.0  0.0  0.0  0.0    neutral  
1       0.0  0.0    0.0     0.0   0.0  0.0  0.0  0.0    neutral  
2       0.0  0.0    0.0     0.0   0.0  0.0  0.0  0.0   negative  
3       0.0  0.0    0.0     0.0   0.0  0.0  0.0  0.0   positive  
4       0.0  0.0    0.0     0.0   0.0  0.0  0.0  0.0   positive  

[5 rows x 11215 columns]


In [26]:
X = tfidf_df.drop('sentiment', axis=1)
y = tfidf_df['sentiment']

# Logistic Regression (Baseline)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Assuming tfidf_df is your DataFrame from the previous TF-IDF transformation
# X will contain the TF-IDF features and y will contain the sentiment labels
X = tfidf_df.drop('sentiment', axis=1)
y = tfidf_df['sentiment']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.92      0.45      0.60       110
     neutral       0.73      0.96      0.83       571
    positive       0.80      0.47      0.60       289

    accuracy                           0.76       970
   macro avg       0.82      0.63      0.68       970
weighted avg       0.78      0.76      0.73       970

Accuracy: 0.7556701030927835


# Logistic Regression (with hyperparameter tuning)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],       # Norm used in the penalization
    'solver': ['liblinear']         # Algorithm to use in the optimization problem
}

# Initialize the Grid Search with cross-validation
grid_search = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=5, scoring='f1_macro')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearchCV
print("Best parameters:", grid_search.best_params_)

# Train the model using the best parameters
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
print(classification_report(y_test, y_pred_best))




Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
              precision    recall  f1-score   support

    negative       0.74      0.64      0.68       110
     neutral       0.80      0.87      0.83       571
    positive       0.73      0.64      0.68       289

    accuracy                           0.77       970
   macro avg       0.76      0.71      0.73       970
weighted avg       0.77      0.77      0.77       970



# Ensemble Methods: RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Train the model
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = random_forest.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred_rf))


              precision    recall  f1-score   support

    negative       0.86      0.40      0.55       110
     neutral       0.71      0.98      0.82       571
    positive       0.85      0.39      0.54       289

    accuracy                           0.74       970
   macro avg       0.81      0.59      0.64       970
weighted avg       0.77      0.74      0.71       970



# Ensemble Methods: RandomForest (with hyperparameter tuning)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters and their values for the grid search
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],      # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],      # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]         # Minimum number of samples required at each leaf node
}

# Initialize the Grid Search model
grid_search_rf = GridSearchCV(RandomForestClassifier(class_weight='balanced', random_state=42), param_grid_rf, cv=5, scoring='f1_macro', verbose=2)

# Fit the grid search to the data
grid_search_rf.fit(X_train, y_train)

# Best parameters found by GridSearchCV
print("Best parameters:", grid_search_rf.best_params_)

# Train the model using the best parameters
best_rf_model = grid_search_rf.best_estimator_

# Predict on the test set using the best model
y_pred_rf_best = best_rf_model.predict(X_test)

# Evaluate the best model
print(classification_report(y_test, y_pred_rf_best))


Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.5s
[CV] END max_depth=

# Neural Network

In [None]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Check if the GPU is available
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("GPU not found. Please select GPU as your runtime type.")

# Encode the target variable
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)  # Replace 'y' with your sentiment labels
# Convert the encoded labels to one-hot-encoding
y_cat = to_categorical(encoded_y)

# Split the data into training and test sets
X_train, X_test, y_train_cat, y_test_cat = train_test_split(X, y_cat, test_size=0.2, random_state=42)  # Replace 'X' with your features

# Convert the DataFrame to a NumPy array
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)

# Define the neural network architecture
model = Sequential()
model.add(Dense(512, input_shape=(X_train_np.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_train_cat.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_np, y_train_cat, epochs=10, batch_size=64, verbose=2)

# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test_np, y_test_cat, verbose=0)
print(f'Test Accuracy: {accuracy:.3f}')


Default GPU Device: /device:GPU:0
Epoch 1/10
53/53 - 7s - loss: 0.8617 - accuracy: 0.6190 - 7s/epoch - 127ms/step
Epoch 2/10
53/53 - 0s - loss: 0.5397 - accuracy: 0.7631 - 223ms/epoch - 4ms/step
Epoch 3/10
53/53 - 0s - loss: 0.2299 - accuracy: 0.9244 - 213ms/epoch - 4ms/step
Epoch 4/10
53/53 - 0s - loss: 0.0744 - accuracy: 0.9810 - 207ms/epoch - 4ms/step
Epoch 5/10
53/53 - 0s - loss: 0.0294 - accuracy: 0.9932 - 211ms/epoch - 4ms/step
Epoch 6/10
53/53 - 0s - loss: 0.0127 - accuracy: 0.9964 - 210ms/epoch - 4ms/step
Epoch 7/10
53/53 - 0s - loss: 0.0080 - accuracy: 0.9991 - 209ms/epoch - 4ms/step
Epoch 8/10
53/53 - 0s - loss: 0.0056 - accuracy: 0.9988 - 204ms/epoch - 4ms/step
Epoch 9/10
53/53 - 0s - loss: 0.0057 - accuracy: 0.9985 - 202ms/epoch - 4ms/step
Epoch 10/10
53/53 - 0s - loss: 0.0030 - accuracy: 0.9991 - 202ms/epoch - 4ms/step
Test Accuracy: 0.813


# NLP Approach - Using LLMs: BERT Model

In [None]:
pip install transformers



In [32]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Assuming 'data' is your DataFrame with 'sentence' and 'sentiment' columns
# Splitting the dataset into training and testing sets
train_df, test_df = train_test_split(data_50, test_size=0.2, random_state=42)

# Load BERT tokenizer and BERT model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Function to convert data to InputExamples
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    train_InputExamples = train.apply(lambda x: InputExample(guid=None,
                                                            text_a = x[DATA_COLUMN],
                                                            text_b = None,
                                                            label = x[LABEL_COLUMN]), axis = 1)
    validation_InputExamples = test.apply(lambda x: InputExample(guid=None,
                                                                text_a = x[DATA_COLUMN],
                                                                text_b = None,
                                                                label = x[LABEL_COLUMN]), axis = 1)
    return train_InputExamples, validation_InputExamples

# Function to convert InputExamples to TF Dataset
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = []

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True,
            truncation=True
        )
        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict["attention_mask"])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

# Encode label column
label_encoder = LabelEncoder()
train_df['sentiment'] = label_encoder.fit_transform(train_df['sentiment'])
test_df['sentiment'] = label_encoder.transform(test_df['sentiment'])

# Preprocess and convert data
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'sentiment'
train_InputExamples, validation_InputExamples = convert_data_to_examples(train_df, test_df, DATA_COLUMN, LABEL_COLUMN)
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

# Compile the BERT model
bert_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08),
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                   metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# Train the BERT model
bert_model.fit(train_data, epochs=2, validation_data=validation_data)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7aa21b9fa020>

In [33]:
# Save the model's weights
bert_saved_model = 'bert_model'
bert_model.save_weights(bert_saved_model)


# Run model to make predictions on new observations (using BERT)

In [34]:
import pandas as pd
import json
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

print("Starting the sentiment analysis script...")

# Load the trained BERT model
model_name = 'bert-base-uncased'
bert_model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=3)
print("Loading the trained BERT model...")
bert_model.load_weights('bert_model')  # Use the correct folder name where your model's weights are saved

# Load data from the output file
print("Loading data from the JSON file...")
with open('top_18_stocks.json', 'r') as file:  # Adjust the path if necessary
    data = json.load(file)
df = pd.DataFrame(data)

# Initialize BERT tokenizer
print("Initializing the BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained(model_name)

# Function to preprocess and predict sentiment for each summary
def predict_sentiment(texts, tokenizer, model, max_length=128):
    predictions = []
    print("Starting sentiment prediction for each text...")

    for i, text in enumerate(texts):
        print(f"Processing text {i+1}/{len(texts)}")
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            return_tensors="tf",
            truncation=True
        )
        outputs = model(inputs)
        probs = tf.nn.softmax(outputs.logits, axis=-1)
        predicted_label = tf.argmax(probs, axis=-1).numpy()[0]
        predictions.append(predicted_label)

    print("Finished predicting sentiment for all texts.")
    return predictions

# Predict sentiment for each summary in the DataFrame
print("Running predictions on summaries...")
sentiments = predict_sentiment(df['summary'].tolist(), tokenizer, bert_model)

# Map the numeric predictions back to sentiment labels
label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
df['predicted_sentiment'] = [label_map[label] for label in sentiments]

# Save the results to a new JSON file
output_file_path = 'enhanced_top_18_stocks.json'  # Adjust the path if necessary
print(f"Saving the predicted sentiments to {output_file_path}...")
df.to_json(output_file_path, orient='records', lines=True)

print("All processes complete. Data with sentiment predictions saved.")


Starting the sentiment analysis script...


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading the trained BERT model...
Loading data from the JSON file...
Initializing the BERT tokenizer...
Running predictions on summaries...
Starting sentiment prediction for each text...
Processing text 1/850
Processing text 2/850
Processing text 3/850
Processing text 4/850
Processing text 5/850
Processing text 6/850
Processing text 7/850
Processing text 8/850
Processing text 9/850
Processing text 10/850
Processing text 11/850
Processing text 12/850
Processing text 13/850
Processing text 14/850
Processing text 15/850
Processing text 16/850
Processing text 17/850
Processing text 18/850
Processing text 19/850
Processing text 20/850
Processing text 21/850
Processing text 22/850
Processing text 23/850
Processing text 24/850
Processing text 25/850
Processing text 26/850
Processing text 27/850
Processing text 28/850
Processing text 29/850
Processing text 30/850
Processing text 31/850
Processing text 32/850
Processing text 33/850
Processing text 34/850
Processing text 35/850
Processing text 3

In [36]:
import pandas as pd

json_file_path = 'enhanced_top_18_stocks.json'  # Replace with your JSON file path
data = pd.read_json(json_file_path, lines=True)

csv_file_path = 'enhanced_top_18_stocks.csv'  # Replace with your desired CSV file path
data.to_csv(csv_file_path, index=False)

