# Financial News Classifier Using Conv1D with a Trainable Embedding Layer

# Imports

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import TextVectorization
from newsapi import NewsApiClient
from sklearn.preprocessing import LabelEncoder

## Data Preprocessing

In [None]:
def load_financial_phrase_bank(data_dir, encoding='utf-8'):
    """Load every ``Sentences_*`` file found in *data_dir* into a DataFrame.

    Each non-blank line is expected to look like ``<sentence>@<label>``. The
    split happens on the last ``@`` so a sentence may itself contain ``@``.

    Args:
        data_dir: Directory that contains the ``Sentences_*`` files.
        encoding: Text encoding used to decode each file.

    Returns:
        A DataFrame with the string columns ``sentence`` and ``sentiment``.
        It is empty when no matching file could be decoded with *encoding*.

    Raises:
        OSError: If *data_dir* cannot be listed or a file cannot be opened.
    """
    sentences = []
    sentiments = []

    # Sorted so the row order (and any seeded split done on it) is reproducible;
    # os.listdir() returns entries in arbitrary order.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.startswith("Sentences_"):
            continue

        filepath = os.path.join(data_dir, filename)
        # Buffer the rows of this file separately: a decoding error can occur
        # after some lines were already read, and those partial rows must not
        # leak into the result (the caller treats a non-empty frame as success).
        file_sentences = []
        file_sentiments = []
        skipped_lines = 0
        try:
            with open(filepath, 'r', encoding=encoding) as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        continue
                    # Split the sentence and sentiment label on the last '@'
                    sentence, separator, sentiment = line.rpartition('@')
                    if not separator:
                        # No '@' at all: the line carries no label
                        skipped_lines += 1
                        continue
                    file_sentences.append(sentence.strip())
                    file_sentiments.append(sentiment.strip())
        except UnicodeDecodeError as e:
            print(f"Error decoding {filename} with encoding {encoding}: {e}")
            continue  # Skip files that cause decoding errors

        if skipped_lines:
            print(f"Skipped {skipped_lines} line(s) without a label in {filename}")
        sentences.extend(file_sentences)
        sentiments.extend(file_sentiments)

    # Create a DataFrame
    return pd.DataFrame({
        'sentence': sentences,
        'sentiment': sentiments,
    })

# Usage with different encodings
data_dir = "Your Download Directory/FinancialPhraseBank-v1.0"

# Candidate encodings, tried in order until one yields a non-empty DataFrame
encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']

for enc in encodings:
    print(f"Trying encoding: {enc}")
    df = load_financial_phrase_bank(data_dir, encoding=enc)
    if not df.empty:
        print(f"Successfully loaded data with encoding: {enc}")
        break  # Exit the loop once successful
else:
    # The loop ended without a break, i.e. every attempt returned an empty
    # DataFrame. Stop here instead of letting later cells fail on empty data.
    raise RuntimeError(
        f"Failed to load data from {data_dir!r} with tried encodings: {encodings}"
    )

In [None]:
# Mapping sentiment labels to numerical values
label_mapping = {"positive": 1, "neutral": 0, "negative": -1}
mapped_sentiment = df['sentiment'].map(label_mapping)

# Series.map() yields NaN for any value that is not a key of label_mapping
# (an unexpected label, or this cell being re-run on an already mapped column).
# Fail before overwriting the column so no NaN targets reach training.
unmapped_labels = df.loc[mapped_sentiment.isna(), 'sentiment'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected sentiment labels: {sorted(str(label) for label in unmapped_labels)}"
    )

df['sentiment'] = mapped_sentiment

# Verify the mapping
print(df.head())


## Train Test Split

In [None]:
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable
RANDOM_SEED = 42
split_options = {"test_size": 0.1, "random_state": RANDOM_SEED}
df_train, df_test = train_test_split(df, **split_options)

print(f"Training samples: {len(df_train)}, Testing samples: {len(df_test)}")

## Encoding

In [None]:
# Turn the sentiment labels into consecutive class ids (e.g., -1 -> 0, 0 -> 1, 1 -> 2).
# The encoder is fitted on the training labels only and then reused for the test labels.
label_encoder = LabelEncoder()
df_train['sentiment'] = label_encoder.fit_transform(df_train['sentiment'])
df_test['sentiment'] = label_encoder.transform(df_test['sentiment'])

# Text vectorization settings
max_features = 20000  # Maximum vocabulary size
sequence_length = 128  # Every sentence is padded/truncated to this many tokens

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training sentences only
train_sentences = df_train['sentence'].values
test_sentences = df_test['sentence'].values
vectorize_layer.adapt(train_sentences)

# Map sentences to integer token sequences
train_inputs = vectorize_layer(train_sentences)
test_inputs = vectorize_layer(test_sentences)

# Class ids as tensors, aligned row-by-row with the inputs above
train_labels = tf.convert_to_tensor(df_train['sentiment'].values)
test_labels = tf.convert_to_tensor(df_test['sentiment'].values)

In [None]:
# Create TensorFlow datasets from the vectorized inputs and labels
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_labels))

# Shuffle, batch, and prefetch the datasets for training
batch_size = 32

# A shuffle buffer at least as large as the dataset gives a uniform shuffle on
# every epoch; a smaller buffer only shuffles within a sliding window.
shuffle_buffer_size = max(1, int(train_labels.shape[0]))

train_dataset = (
    train_dataset
    .shuffle(buffer_size=shuffle_buffer_size)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
# The test set is deliberately not shuffled so predictions stay aligned with labels
test_dataset = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)


## Model Creation, Compilation and Training

In [None]:
from tensorflow.keras import layers, Model

# Define the model using a trainable embedding layer
@tf.keras.utils.register_keras_serializable(package="FinancialNews")
class SentimentClassifier(Model):
    """Conv1D text classifier on top of a trainable embedding layer.

    Pipeline: Embedding -> Conv1D(128 filters, kernel 5, ReLU) ->
    GlobalMaxPooling1D -> Dropout(0.5) -> Dense(n_classes, softmax).

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim``).
        embedding_dim: Size of each embedding vector (``output_dim``).
        n_classes: Number of units of the softmax output layer.
        **kwargs: Standard Keras model arguments (e.g. ``name``), forwarded to
            the base class. Needed so the model can be rebuilt from its config.
    """

    def __init__(self, vocab_size, embedding_dim, n_classes, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_classes = n_classes

        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.conv = layers.Conv1D(128, 5, activation='relu')
        self.global_pool = layers.GlobalMaxPooling1D()
        self.dropout = layers.Dropout(0.5)
        self.classifier = layers.Dense(n_classes, activation='softmax')

    def call(self, inputs, training=None):
        """Return class probabilities for a batch of integer token sequences.

        Args:
            inputs: Integer token ids fed to the embedding layer.
            training: Forwarded to the dropout layer so it is only active
                during training.
        """
        x = self.embedding(inputs)
        x = self.conv(x)
        x = self.global_pool(x)
        x = self.dropout(x, training=training)
        return self.classifier(x)

    def get_config(self):
        """Return the constructor arguments so a saved model can be re-created."""
        config = super().get_config()
        config.update({
            'vocab_size': self.vocab_size,
            'embedding_dim': self.embedding_dim,
            'n_classes': self.n_classes,
        })
        return config

# Model hyper-parameters derived from the fitted preprocessing objects
vocab_size = len(vectorize_layer.get_vocabulary())
embedding_dim = 128
n_classes = len(label_encoder.classes_)

# Build the classifier with a trainable embedding layer
classifier_model = SentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    n_classes=n_classes,
)

# Labels are integer class ids, hence the sparse categorical cross-entropy loss
classifier_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 20 epochs, reporting metrics on test_dataset after each epoch
history = classifier_model.fit(train_dataset, epochs=20, validation_data=test_dataset)


## Model Evaluation

In [None]:
# Compute the compiled loss and accuracy metric on the held-out test dataset
test_metrics = classifier_model.evaluate(test_dataset)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Predict labels for the test set (argmax over the per-class probabilities)
y_pred_probs = classifier_model.predict(test_dataset)
y_pred = np.argmax(y_pred_probs, axis=1)

# Get the true labels by iterating the same dataset; this relies on
# test_dataset yielding its batches in a stable order.
y_true = np.concatenate([y for _, y in test_dataset], axis=0)

# Encoded class ids are 0..n_classes-1, in the order of label_encoder.classes_.
# Passing them explicitly keeps the report aligned with target_names even when
# a class does not occur in y_true/y_pred.
class_ids = np.arange(len(label_encoder.classes_))

# Prefer the human-readable sentiment names when the encoder classes are
# exactly the values of label_mapping; otherwise fall back to their string form.
value_to_name = {value: name for name, value in label_mapping.items()}
if set(label_encoder.classes_) == set(value_to_name):
    target_names = [value_to_name[cls] for cls in label_encoder.classes_]
else:
    target_names = [str(cls) for cls in label_encoder.classes_]

# Print classification report
print(classification_report(y_true, y_pred, labels=class_ids, target_names=target_names))


## Model Saving

In [None]:
# Write the trained model to 'sentiment_classifier_model.keras' (a relative
# path, so it is resolved against the current working directory).
# NOTE(review): the model is a subclassed Keras model — confirm it can be
# reloaded with load_model() before relying on this file.
classifier_model.save('sentiment_classifier_model.keras')

## Use the Class Below to Fetch Real-Time News and Test the Model

In [None]:
class NewsFetcher:
    """Small wrapper around ``NewsApiClient`` that returns recent headlines."""

    def __init__(self, api_key):
        """Create the NewsAPI client.

        Args:
            api_key: NewsAPI key used to authenticate the requests.
        """
        # Use the key supplied by the caller; credentials must never be
        # embedded in the source.
        self.newsapi = NewsApiClient(api_key=api_key)

    def fetch_latest_news(self, query='stock market', page_size=5):
        """Return the titles of the most recently published English articles.

        Args:
            query: Search phrase passed to the API as ``q``.
            page_size: Maximum number of articles requested.

        Returns:
            A list of headline strings. Articles without a title are skipped,
            and the list is empty when the response has no articles.
        """
        response = self.newsapi.get_everything(q=query,
                                               language='en',
                                               sort_by='publishedAt',
                                               page_size=page_size)
        articles = response.get('articles') or []
        # Drop missing/empty titles so callers only receive usable text
        return [article['title'] for article in articles if article.get('title')]

In [None]:
# Fetch recent headlines for the query. "your_api_key" is a placeholder and
# has to be replaced with a real NewsAPI key.
news_fetcher = NewsFetcher(api_key="your_api_key")
headlines = news_fetcher.fetch_latest_news(query='stock market')

# Initialize the trader with the trained model and the fitted vectorizer.
# NOTE(review): SentimentAnalysisTrader is neither defined nor imported in the
# visible part of this notebook — confirm where it comes from before running.
trader = SentimentAnalysisTrader(model=classifier_model, vectorize_layer=vectorize_layer)

# Predict sentiment and decide trade action, one headline at a time
for headline in headlines:
    # A single-element list is passed in; the first element of the result is
    # used as the score for this headline.
    sentiment_score = trader.predict_sentiment([headline])
    action = trader.decide_trade_action(sentiment_score[0])
    print(f"Headline: {headline}\nSentiment: {sentiment_score[0]} -> Action: {action}\n")